Compare commits

..

1 Commits

Author SHA1 Message Date
Aaron Watry
84ae235450 Initial OpenCL implementation of the VP8 decoder.
Change-Id: I74c334af09f13473ce07bbac74b0f9ea57573347
Note: very slow, but functional.  Encoder is untested, but should still work.
2011-04-18 13:50:23 -04:00
249 changed files with 15934 additions and 11868 deletions

6
.gitignore vendored
View File

@@ -60,3 +60,9 @@
/vpx_config.h
/vpx_version.h
TAGS
vpxdec
vpxenc
.project
.cproject
*.csv
*.oclpj

View File

@@ -2,4 +2,3 @@ Adrian Grange <agrange@google.com>
Johann Koenig <johannkoenig@google.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Tom Finegan <tomfinegan@google.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com>

12
AUTHORS
View File

@@ -4,11 +4,8 @@
Aaron Watry <awatry@gmail.com>
Adrian Grange <agrange@google.com>
Alex Converse <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Fabio Pedretti <fabio.ped@libero.it>
Frank Galligan <fgalligan@google.com>
@@ -25,29 +22,20 @@ Jeff Muizelaar <jmuizelaar@mozilla.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
Joshua Bleecher Snyder <josh@treelinelabs.com>
Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
Martin Ettl <ettl.martin78@googlemail.com>
Michael Kohler <michaelkohler@live.com>
Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
Pascal Massimino <pascal.massimino@gmail.com>
Patrik Westin <patrik.westin@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Philip Jägenstedt <philipj@opera.com>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
Ralph Giles <giles@xiph.org>
Ronald S. Bultje <rbultje@google.com>
Scott LaVarnway <slavarnway@google.com>
Stefan Holmer <holmer@google.com>
Taekhyun Kim <takim@nvidia.com>
Tero Rintaluoma <teror@google.com>
Thijs Vermeir <thijsvermeir@gmail.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
Yaowu Xu <yaowu@google.com>

View File

@@ -1,85 +1,3 @@
2011-08-02 v0.9.7 "Cayuga"
Our third named release, focused on a faster, higher quality, encoder.
- Upgrading:
This release is backwards compatible with Aylesbury (v0.9.5) and
Bali (v0.9.6). Users of older releases should refer to the Upgrading
notes in this document for that release.
- Enhancements:
Stereo 3D format support for vpxenc
Runtime detection of available processor cores.
Allow specifying --end-usage by enum name
vpxdec: test for frame corruption
vpxenc: add quantizer histogram display
vpxenc: add rate histogram display
Set VPX_FRAME_IS_DROPPABLE
update configure for ios sdk 4.3
Avoid text relocations in ARM vp8 decoder
Generate a vpx.pc file for pkg-config.
New ways of passing encoded data between encoder and decoder.
- Speed:
This release includes across-the-board speed improvements to the
encoder. On x86, these measure at approximately 11.5% in Best mode,
21.5% in Good mode (speed 0), and 22.5% in Realtime mode (speed 6).
On ARM Cortex A9 with Neon extensions, real-time encoding of video
telephony content is 35% faster than Bali on single core and 48%
faster on multi-core. On the NVidia Tegra2 platform, real time
encoding is 40% faster than Bali.
Decoder speed was not a priority for this release, but improved
approximately 8.4% on x86.
Reduce motion vector search on alt-ref frame.
Encoder loopfilter running in its own thread
Reworked loopfilter to precalculate more parameters
SSE2/SSSE3 optimizations for build_predictors_mbuv{,_s}().
Make hor UV predict ~2x faster (73 vs 132 cycles) using SSSE3.
Removed redundant checks
Reduced structure sizes
utilize preload in ARMv6 MC/LPF/Copy routines
ARM optimized quantization, dfct, variance, subtract
Increase chrow row alignment to 16 bytes.
disable trellis optimization for first pass
Write SSSE3 sub-pixel filter function
Improve SSE2 half-pixel filter funtions
Add vp8_sub_pixel_variance16x8_ssse3 function
Reduce unnecessary distortion computation
Use diamond search to replace full search
Preload reference area in sub-pixel motion search (real-time mode)
- Quality:
This release focused primarily on one-pass use cases, including
video conferencing. Low latency data rate control was significantly
improved, improving streamability over bandwidth constrained links.
Added support for error concealment, allowing frames to maintain
visual quality in the presence of substantial packet loss.
Add rc_max_intra_bitrate_pct control
Limit size of initial keyframe in one-pass.
Improve framerate adaptation
Improved 1-pass CBR rate control
Improved KF insertion after fades to still.
Improved key frame detection.
Improved activity masking (lower PSNR impact for same SSIM boost)
Improved interaction between GF and ARFs
Adding error-concealment to the decoder.
Adding support for independent partitions
Adjusted rate-distortion constants
- Bug Fixes:
Removed firstpass motion map
Fix parallel make install
Fix multithreaded encoding for 1 MB wide frame
Fixed iwalsh_neon build problems with RVDS4.1
Fix semaphore emulation, spin-wait intrinsics on Windows
Fix build with xcode4 and simplify GLOBAL.
Mark ARM asm objects as allowing a non-executable stack.
Fix vpxenc encoding incorrect webm file header on big endian
2011-03-07 v0.9.6 "Bali"
Our second named release, focused on a faster, higher quality, encoder.

View File

@@ -98,11 +98,11 @@ install::
$(BUILD_PFX)%.c.d: %.c
$(if $(quiet),@echo " [DEP] $@")
$(qexec)mkdir -p $(dir $@)
$(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -M $< | $(fmt_deps) > $@
$(qexec)$(CC) $(CFLAGS) -M $< | $(fmt_deps) > $@
$(BUILD_PFX)%.c.o: %.c
$(if $(quiet),@echo " [CC] $@")
$(qexec)$(CC) $(INTERNAL_CFLAGS) $(CFLAGS) -c -o $@ $<
$(qexec)$(CC) $(CFLAGS) -c -o $@ $<
$(BUILD_PFX)%.asm.d: %.asm
$(if $(quiet),@echo " [DEP] $@")
@@ -124,12 +124,6 @@ $(BUILD_PFX)%.s.o: %.s
$(if $(quiet),@echo " [AS] $@")
$(qexec)$(AS) $(ASFLAGS) -o $@ $<
.PRECIOUS: %.c.S
%.c.S: CFLAGS += -DINLINE_ASM
$(BUILD_PFX)%.c.S: %.c
$(if $(quiet),@echo " [GEN] $@")
$(qexec)$(CC) -S $(CFLAGS) -o $@ $<
.PRECIOUS: %.asm.s
$(BUILD_PFX)%.asm.s: %.asm
$(if $(quiet),@echo " [ASM CONVERSION] $@")
@@ -194,7 +188,7 @@ define linker_template
$(1): $(filter-out -%,$(2))
$(1):
$(if $(quiet),@echo " [LD] $$@")
$(qexec)$$(LD) $$(strip $$(INTERNAL_LDFLAGS) $$(LDFLAGS) -o $$@ $(2) $(3) $$(extralibs))
$(qexec)$$(LD) $$(strip $$(LDFLAGS) -o $$@ $(2) $(3) $$(extralibs))
endef
# make-3.80 has a bug with expanding large input strings to the eval function,
# which was triggered in some cases by the following component of
@@ -336,7 +330,6 @@ ifneq ($(call enabled,DIST-SRCS),)
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_proj.sh
DIST-SRCS-$(CONFIG_MSVS) += build/make/gen_msvs_sln.sh
DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/yasm.rules
DIST-SRCS-$(CONFIG_MSVS) += build/x86-msvs/obj_int_extract.bat
DIST-SRCS-$(CONFIG_RVCT) += build/make/armlink_adapter.sh
# Include obj_int_extract if we use offsets from asm_*_offsets
DIST-SRCS-$(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64) += build/make/obj_int_extract.c

View File

@@ -21,14 +21,8 @@ print "@ This file was created from a .asm file\n";
print "@ using the ads2gas.pl script.\n";
print "\t.equ DO1STROUNDING, 0\n";
# Stack of procedure names.
@proc_stack = ();
while (<STDIN>)
{
# Load and store alignment
s/@/,:/g;
# Comment character
s/;/@/g;
@@ -85,10 +79,7 @@ while (<STDIN>)
s/CODE([0-9][0-9])/.code $1/;
# No AREA required
# But ALIGNs in AREA must be obeyed
s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
# If no ALIGN, strip the AREA and align to 4 bytes
s/^\s*AREA.*$/.text\n.p2align 2/;
s/^\s*AREA.*$/.text/;
# DCD to .word
# This one is for incoming symbols
@@ -123,8 +114,8 @@ while (<STDIN>)
# put the colon at the end of the line in the macro
s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
# ALIGN directive
s/ALIGN/.balign/g;
# Strip ALIGN
s/\sALIGN/@ ALIGN/g;
# Strip ARM
s/\sARM/@ ARM/g;
@@ -136,23 +127,9 @@ while (<STDIN>)
# Strip PRESERVE8
s/\sPRESERVE8/@ PRESERVE8/g;
# Use PROC and ENDP to give the symbols a .size directive.
# This makes them show up properly in debugging tools like gdb and valgrind.
if (/\bPROC\b/)
{
my $proc;
/^_([\.0-9A-Z_a-z]\w+)\b/;
$proc = $1;
push(@proc_stack, $proc) if ($proc);
s/\bPROC\b/@ $&/;
}
if (/\bENDP\b/)
{
my $proc;
s/\bENDP\b/@ $&/;
$proc = pop(@proc_stack);
$_ = "\t.size $proc, .-$proc".$_ if ($proc);
}
# Strip PROC and ENDPROC
s/\sPROC/@/g;
s/\sENDP/@/g;
# EQU directive
s/(.*)EQU(.*)/.equ $1, $2/;
@@ -171,6 +148,3 @@ while (<STDIN>)
next if /^\s*END\s*$/;
print;
}
# Mark that this object doesn't need an executable stack.
printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n");

View File

@@ -41,9 +41,6 @@ sub trim($)
while (<STDIN>)
{
# Load and store alignment
s/@/,:/g;
# Comment character
s/;/@/g;
@@ -100,10 +97,7 @@ while (<STDIN>)
s/CODE([0-9][0-9])/.code $1/;
# No AREA required
# But ALIGNs in AREA must be obeyed
s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
# If no ALIGN, strip the AREA and align to 4 bytes
s/^\s*AREA.*$/.text\n.p2align 2/;
s/^\s*AREA.*$/.text/;
# DCD to .word
# This one is for incoming symbols
@@ -143,8 +137,8 @@ while (<STDIN>)
# put the colon at the end of the line in the macro
s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
# ALIGN directive
s/ALIGN/.balign/g;
# Strip ALIGN
s/\sALIGN/@ ALIGN/g;
# Strip ARM
s/\sARM/@ ARM/g;

View File

@@ -412,14 +412,11 @@ EOF
write_common_target_config_h() {
cat > ${TMP_H} << EOF
/* This file automatically generated by configure. Do not edit! */
#ifndef VPX_CONFIG_H
#define VPX_CONFIG_H
#define RESTRICT ${RESTRICT}
EOF
print_config_h ARCH "${TMP_H}" ${ARCH_LIST}
print_config_h HAVE "${TMP_H}" ${HAVE_LIST}
print_config_h CONFIG "${TMP_H}" ${CONFIG_LIST}
echo "#endif /* VPX_CONFIG_H */" >> ${TMP_H}
mkdir -p `dirname "$1"`
cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1"
}
@@ -629,7 +626,7 @@ process_common_toolchain() {
case ${toolchain} in
sparc-solaris-*)
add_extralibs -lposix4
disable fast_unaligned
add_cflags "-DMUST_BE_ALIGNED"
;;
*-solaris-*)
add_extralibs -lposix4
@@ -642,8 +639,8 @@ process_common_toolchain() {
# on arm, isa versions are supersets
enabled armv7a && soft_enable armv7 ### DEBUG
enabled armv7 && soft_enable armv6
enabled armv7 || enabled armv6 && soft_enable armv5te
enabled armv7 || enabled armv6 && soft_enable fast_unaligned
enabled armv6 && soft_enable armv5te
enabled armv6 && soft_enable fast_unaligned
enabled iwmmxt2 && soft_enable iwmmxt
enabled iwmmxt && soft_enable armv5te
@@ -692,7 +689,7 @@ process_common_toolchain() {
if enabled armv7
then
check_add_cflags --cpu=Cortex-A8 --fpu=softvfp+vfpv3
check_add_asflags --cpu=Cortex-A8 --fpu=softvfp+vfpv3
check_add_asflags --cpu=Cortex-A8 --fpu=none
else
check_add_cflags --cpu=${tgt_isa##armv}
check_add_asflags --cpu=${tgt_isa##armv}
@@ -754,24 +751,41 @@ process_common_toolchain() {
linux*)
enable linux
if enabled rvct; then
# Check if we have CodeSourcery GCC in PATH. Needed for
# libraries
hash arm-none-linux-gnueabi-gcc 2>&- || \
die "Couldn't find CodeSourcery GCC from PATH"
# Compiling with RVCT requires an alternate libc (glibc) when
# targetting linux.
disabled builtin_libc \
|| die "Must supply --libc when targetting *-linux-rvct"
# Use armcc as a linker to enable translation of
# some gcc specific options such as -lm and -lpthread.
LD="armcc --translate_gcc"
# Set up compiler
add_cflags --library_interface=aeabi_glibc
add_cflags --no_hide_all
add_cflags --dwarf2
# create configuration file (uses path to CodeSourcery GCC)
armcc --arm_linux_configure --arm_linux_config_file=arm_linux.cfg
# Set up linker
add_ldflags --sysv --no_startup --no_ref_cpp_init
add_ldflags --entry=_start
add_ldflags --keep '"*(.init)"' --keep '"*(.fini)"'
add_ldflags --keep '"*(.init_array)"' --keep '"*(.fini_array)"'
add_ldflags --dynamiclinker=/lib/ld-linux.so.3
add_extralibs libc.so.6 -lc_nonshared crt1.o crti.o crtn.o
add_cflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
add_asflags --no_hide_all --apcs=/interwork
add_ldflags --arm_linux_paths --arm_linux_config_file=arm_linux.cfg
enabled pic && add_cflags --apcs=/fpic
enabled pic && add_asflags --apcs=/fpic
enabled shared && add_cflags --shared
# Add the paths for the alternate libc
for d in usr/include; do
try_dir="${alt_libc}/${d}"
[ -d "${try_dir}" ] && add_cflags -J"${try_dir}"
done
add_cflags -J"${RVCT31INC}"
for d in lib usr/lib; do
try_dir="${alt_libc}/${d}"
[ -d "${try_dir}" ] && add_ldflags -L"${try_dir}"
done
# glibc has some struct members named __align, which is a
# storage modifier in RVCT. If we need to use this modifier,
# we'll have to #undef it in our code. Note that this must
# happen AFTER all libc inclues.
add_cflags -D__align=x_align_x
fi
;;
@@ -939,23 +953,47 @@ process_common_toolchain() {
enabled gcov &&
check_add_cflags -fprofile-arcs -ftest-coverage &&
check_add_ldflags -fprofile-arcs -ftest-coverage
if enabled optimizations; then
if enabled rvct; then
enabled small && check_add_cflags -Ospace || check_add_cflags -Otime
else
enabled rvct && check_add_cflags -Otime
enabled small && check_add_cflags -O2 || check_add_cflags -O3
fi
if enabled opencl; then
disable multithread
echo " disabling multithread"
soft_enable opencl #Provide output to make user comfortable
enable runtime_cpu_detect
#Use dlopen() to load OpenCL when possible.
case ${toolchain} in
*darwin10*)
check_add_cflags -D__APPLE__
add_extralibs -framework OpenCL
;;
*-win32-gcc)
if check_header dlfcn.h; then
add_extralibs -ldl
enable dlopen
else
#This shouldn't be a hard-coded path in the long term
add_extralibs -L/cygdrive/c/Windows/System32 -lOpenCL
fi
;;
*)
if check_header dlfcn.h; then
add_extralibs -ldl
enable dlopen
else
add_extralibs -lOpenCL
fi
;;
esac
fi
# Position Independent Code (PIC) support, for building relocatable
# shared objects
enabled gcc && enabled pic && check_add_cflags -fPIC
# Work around longjmp interception on glibc >= 2.11, to improve binary
# compatibility. See http://code.google.com/p/webm/issues/detail?id=166
enabled linux && check_add_cflags -D_FORTIFY_SOURCE=0
# Check for strip utility variant
${STRIP} -V 2>/dev/null | grep GNU >/dev/null && enable gnu_strip
@@ -974,9 +1012,6 @@ EOF
esac
fi
# for sysconf(3) and friends.
check_header unistd.h
# glibc needs these
if enabled linux; then
add_cflags -D_LARGEFILE_SOURCE

View File

@@ -365,7 +365,7 @@ generate_vcproj() {
DebugInformationFormat="1" \
Detect64BitPortabilityProblems="true" \
$uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="true"
$uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="1"
;;
*)
tag Tool \
@@ -379,7 +379,7 @@ generate_vcproj() {
DebugInformationFormat="1" \
Detect64BitPortabilityProblems="true" \
$uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="true"
$uses_asm && tag Tool Name="YASM" IncludePaths="$incs" Debug="1"
;;
esac
;;
@@ -447,8 +447,6 @@ generate_vcproj() {
obj_int_extract)
tag Tool \
Name="VCCLCompilerTool" \
Optimization="2" \
FavorSizeorSpeed="1" \
AdditionalIncludeDirectories="$incs" \
PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE" \
RuntimeLibrary="$release_runtime" \
@@ -464,8 +462,6 @@ generate_vcproj() {
tag Tool \
Name="VCCLCompilerTool" \
Optimization="2" \
FavorSizeorSpeed="1" \
AdditionalIncludeDirectories="$incs" \
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
RuntimeLibrary="$release_runtime" \
@@ -480,8 +476,6 @@ generate_vcproj() {
tag Tool \
Name="VCCLCompilerTool" \
AdditionalIncludeDirectories="$incs" \
Optimization="2" \
FavorSizeorSpeed="1" \
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
RuntimeLibrary="$release_runtime" \
UsePrecompiledHeader="0" \

21
configure vendored
View File

@@ -31,17 +31,16 @@ Advanced options:
${toggle_md5} support for output of checksum data
${toggle_static_msvcrt} use static MSVCRT (VS builds only)
${toggle_vp8} VP8 codec support
${toggle_internal_stats} output of encoder internal stats for debug, if supported (encoders)
${toggle_psnr} output of PSNR data, if supported (encoders)
${toggle_mem_tracker} track memory usage
${toggle_postproc} postprocessing
${toggle_multithread} multithreaded encoding and decoding.
${toggle_spatial_resampling} spatial sampling (scaling) support
${toggle_realtime_only} enable this option while building for real-time encoding
${toggle_error_concealment} enable this option to get a decoder which is able to conceal losses
${toggle_runtime_cpu_detect} runtime cpu detection
${toggle_shared} shared library support
${toggle_static} static library support
${toggle_small} favor smaller size over speed
${toggle_opencl} support for OpenCL-assisted VP8 decoding (experimental)
${toggle_postproc_visualizer} macro block / block level visualizers
Codecs:
@@ -107,6 +106,7 @@ all_platforms="${all_platforms} x86-darwin8-gcc"
all_platforms="${all_platforms} x86-darwin8-icc"
all_platforms="${all_platforms} x86-darwin9-gcc"
all_platforms="${all_platforms} x86-darwin9-icc"
all_platforms="${all_platforms} x86-darwin10-gcc"
all_platforms="${all_platforms} x86-linux-gcc"
all_platforms="${all_platforms} x86-linux-icc"
all_platforms="${all_platforms} x86-solaris-gcc"
@@ -154,7 +154,6 @@ enabled doxygen && php -v >/dev/null 2>&1 && enable install_docs
enable install_bins
enable install_libs
enable static
enable optimizations
enable fast_unaligned #allow unaligned accesses, if supported by hw
enable md5
@@ -214,7 +213,7 @@ HAVE_LIST="
alt_tree_layout
pthread_h
sys_mman_h
unistd_h
dlopen
"
CONFIG_LIST="
external_build
@@ -244,7 +243,7 @@ CONFIG_LIST="
runtime_cpu_detect
postproc
multithread
internal_stats
psnr
${CODECS}
${CODEC_FAMILIES}
encoders
@@ -252,10 +251,9 @@ CONFIG_LIST="
static_msvcrt
spatial_resampling
realtime_only
error_concealment
shared
static
small
opencl
postproc_visualizer
os_support
"
@@ -287,17 +285,16 @@ CMDLINE_SELECT="
dc_recon
postproc
multithread
internal_stats
psnr
${CODECS}
${CODEC_FAMILIES}
static_msvcrt
mem_tracker
spatial_resampling
realtime_only
error_concealment
shared
static
small
opencl
postproc_visualizer
"
@@ -564,4 +561,6 @@ process "$@"
cat <<EOF > ${BUILD_PFX}vpx_config.c
static const char* const cfg = "$CONFIGURE_ARGS";
const char *vpx_codec_build_config(void) {return cfg;}
static const char* const libdir = "$libdir";
const char *vpx_codec_lib_dir(void) {return libdir;}
EOF

View File

@@ -77,11 +77,6 @@ GEN_EXAMPLES-$(CONFIG_ENCODERS) += decode_with_drops.c
endif
decode_with_drops.GUID = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
decode_with_drops.DESCRIPTION = Drops frames while decoding
ifeq ($(CONFIG_DECODERS),yes)
GEN_EXAMPLES-$(CONFIG_ERROR_CONCEALMENT) += decode_with_partial_drops.c
endif
decode_with_partial_drops.GUID = 61C2D026-5754-46AC-916F-1343ECC5537E
decode_with_partial_drops.DESCRIPTION = Drops parts of frames while decoding
GEN_EXAMPLES-$(CONFIG_ENCODERS) += error_resilient.c
error_resilient.GUID = DF5837B9-4145-4F92-A031-44E4F832E00C
error_resilient.DESCRIPTION = Error Resiliency Feature
@@ -127,8 +122,8 @@ else
LIB_PATH := $(call enabled,LIB_PATH)
INC_PATH := $(call enabled,INC_PATH)
endif
INTERNAL_CFLAGS = $(addprefix -I,$(INC_PATH))
INTERNAL_LDFLAGS += $(addprefix -L,$(LIB_PATH))
CFLAGS += $(addprefix -I,$(INC_PATH))
LDFLAGS += $(addprefix -L,$(LIB_PATH))
# Expand list of selected examples to build (as specified above)
@@ -167,10 +162,8 @@ BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(ALL_EXAMPLES:.c=))
# Instantiate linker template for all examples.
CODEC_LIB=$(if $(CONFIG_DEBUG_LIBS),vpx_g,vpx)
CODEC_LIB_SUF=$(if $(CONFIG_SHARED),.so,.a)
$(foreach bin,$(BINS-yes),\
$(if $(BUILD_OBJS),$(eval $(bin):\
$(LIB_PATH)/lib$(CODEC_LIB)$(CODEC_LIB_SUF)))\
$(if $(BUILD_OBJS),$(eval $(bin): $(LIB_PATH)/lib$(CODEC_LIB).a))\
$(if $(BUILD_OBJS),$(eval $(call linker_template,$(bin),\
$(call objs,$($(notdir $(bin)).SRCS)) \
-l$(CODEC_LIB) $(addprefix -l,$(CODEC_EXTRA_LIBS))\
@@ -221,8 +214,7 @@ $(1): $($(1:.vcproj=).SRCS)
--ver=$$(CONFIG_VS_VERSION)\
--proj-guid=$$($$(@:.vcproj=).GUID)\
$$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \
--out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \
$$(INTERNAL_LDFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) -lwinmm $$^
--out=$$@ $$(CFLAGS) $$(LDFLAGS) -l$$(CODEC_LIB) -lwinmm $$^
endef
PROJECTS-$(CONFIG_MSVS) += $(ALL_EXAMPLES:.c=.vcproj)
INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\

View File

@@ -1,238 +0,0 @@
@TEMPLATE decoder_tmpl.c
Decode With Partial Drops Example
=========================
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
This is an example utility which drops a series of frames (or parts of frames),
as specified on the command line. This is useful for observing the error
recovery features of the codec.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTRODUCTION
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_INCLUDES
#include <time.h>
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_INCLUDES
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ HELPERS
struct parsed_header
{
char key_frame;
int version;
char show_frame;
int first_part_size;
};
int next_packet(struct parsed_header* hdr, int pos, int length, int mtu)
{
int size = 0;
int remaining = length - pos;
/* Uncompressed part is 3 bytes for P frames and 10 bytes for I frames */
int uncomp_part_size = (hdr->key_frame ? 10 : 3);
/* number of bytes yet to send from header and the first partition */
int remainFirst = uncomp_part_size + hdr->first_part_size - pos;
if (remainFirst > 0)
{
if (remainFirst <= mtu)
{
size = remainFirst;
}
else
{
size = mtu;
}
return size;
}
/* second partition; just slot it up according to MTU */
if (remaining <= mtu)
{
size = remaining;
return size;
}
return mtu;
}
void throw_packets(unsigned char* frame, int* size, int loss_rate,
int* thrown, int* kept)
{
unsigned char loss_frame[256*1024];
int pkg_size = 1;
int pos = 0;
int loss_pos = 0;
struct parsed_header hdr;
unsigned int tmp;
int mtu = 1500;
if (*size < 3)
{
return;
}
putc('|', stdout);
/* parse uncompressed 3 bytes */
tmp = (frame[2] << 16) | (frame[1] << 8) | frame[0];
hdr.key_frame = !(tmp & 0x1); /* inverse logic */
hdr.version = (tmp >> 1) & 0x7;
hdr.show_frame = (tmp >> 4) & 0x1;
hdr.first_part_size = (tmp >> 5) & 0x7FFFF;
/* don't drop key frames */
if (hdr.key_frame)
{
int i;
*kept = *size/mtu + ((*size % mtu > 0) ? 1 : 0); /* approximate */
for (i=0; i < *kept; i++)
putc('.', stdout);
return;
}
while ((pkg_size = next_packet(&hdr, pos, *size, mtu)) > 0)
{
int loss_event = ((rand() + 1.0)/(RAND_MAX + 1.0) < loss_rate/100.0);
if (*thrown == 0 && !loss_event)
{
memcpy(loss_frame + loss_pos, frame + pos, pkg_size);
loss_pos += pkg_size;
(*kept)++;
putc('.', stdout);
}
else
{
(*thrown)++;
putc('X', stdout);
}
pos += pkg_size;
}
memcpy(frame, loss_frame, loss_pos);
memset(frame + loss_pos, 0, *size - loss_pos);
*size = loss_pos;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ HELPERS
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INIT
/* Initialize codec */
flags = VPX_CODEC_USE_ERROR_CONCEALMENT;
res = vpx_codec_dec_init(&codec, interface, &dec_cfg, flags);
if(res)
die_codec(&codec, "Failed to initialize decoder");
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ DEC_INIT
Usage
-----
This example adds a single argument to the `simple_decoder` example,
which specifies the range or pattern of frames to drop. The parameter is
parsed as follows:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ USAGE
if(argc < 4 || argc > 6)
die("Usage: %s <infile> <outfile> [-t <num threads>] <N-M|N/M|L,S>\n",
argv[0]);
{
char *nptr;
int arg_num = 3;
if (argc == 6 && strncmp(argv[arg_num++], "-t", 2) == 0)
dec_cfg.threads = strtol(argv[arg_num++], NULL, 0);
n = strtol(argv[arg_num], &nptr, 0);
mode = (*nptr == '\0' || *nptr == ',') ? 2 : (*nptr == '-') ? 1 : 0;
m = strtol(nptr+1, NULL, 0);
if((!n && !m) || (*nptr != '-' && *nptr != '/' &&
*nptr != '\0' && *nptr != ','))
die("Couldn't parse pattern %s\n", argv[3]);
}
seed = (m > 0) ? m : (unsigned int)time(NULL);
srand(seed);thrown_frame = 0;
printf("Seed: %u\n", seed);
printf("Threads: %d\n", dec_cfg.threads);
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ USAGE
Dropping A Range Of Frames
--------------------------
To drop a range of frames, specify the starting frame and the ending
frame to drop, separated by a dash. The following command will drop
frames 5 through 10 (base 1).
$ ./decode_with_partial_drops in.ivf out.i420 5-10
Dropping A Pattern Of Frames
----------------------------
To drop a pattern of frames, specify the number of frames to drop and
the number of frames after which to repeat the pattern, separated by
a forward-slash. The following command will drop 3 of 7 frames.
Specifically, it will decode 4 frames, then drop 3 frames, and then
repeat.
$ ./decode_with_partial_drops in.ivf out.i420 3/7
Dropping Random Parts Of Frames
-------------------------------
A third argument tuple is available to split the frame into 1500 bytes pieces
and randomly drop pieces rather than frames. The frame will be split at
partition boundaries where possible. The following example will seed the RNG
with the seed 123 and drop approximately 5% of the pieces. Pieces which
are depending on an already dropped piece will also be dropped.
$ ./decode_with_partial_drops in.ivf out.i420 5,123
Extra Variables
---------------
This example maintains the pattern passed on the command line in the
`n`, `m`, and `is_range` variables:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_VARS
int n, m, mode;
unsigned int seed;
int thrown=0, kept=0;
int thrown_frame=0, kept_frame=0;
vpx_codec_dec_cfg_t dec_cfg = {0};
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ EXTRA_VARS
Making The Drop Decision
------------------------
The example decides whether to drop the frame based on the current
frame number, immediately before decoding the frame.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE
/* Decide whether to throw parts of the frame or the whole frame
depending on the drop mode */
thrown_frame = 0;
kept_frame = 0;
switch (mode)
{
case 0:
if (m - (frame_cnt-1)%m <= n)
{
frame_sz = 0;
}
break;
case 1:
if (frame_cnt >= n && frame_cnt <= m)
{
frame_sz = 0;
}
break;
case 2:
throw_packets(frame, &frame_sz, n, &thrown_frame, &kept_frame);
break;
default: break;
}
if (mode < 2)
{
if (frame_sz == 0)
{
putc('X', stdout);
thrown_frame++;
}
else
{
putc('.', stdout);
kept_frame++;
}
}
thrown += thrown_frame;
kept += kept_frame;
fflush(stdout);
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ PRE_DECODE

View File

@@ -42,8 +42,6 @@ static void die(const char *fmt, ...) {
@DIE_CODEC
@HELPERS
int main(int argc, char **argv) {
FILE *infile, *outfile;
vpx_codec_ctx_t codec;

View File

@@ -111,6 +111,8 @@ int main(int argc, char **argv) {
vpx_codec_ctx_t codec;
vpx_codec_enc_cfg_t cfg;
int frame_cnt = 0;
unsigned char file_hdr[IVF_FILE_HDR_SZ];
unsigned char frame_hdr[IVF_FRAME_HDR_SZ];
vpx_image_t raw;
vpx_codec_err_t res;
long width;

View File

@@ -21,7 +21,7 @@ res = vpx_codec_dec_init(&codec, interface, NULL,
if(res == VPX_CODEC_INCAPABLE) {
printf("NOTICE: Postproc not supported by %s\n",
vpx_codec_iface_name(interface));
res = vpx_codec_dec_init(&codec, interface, NULL, flags);
res = vpx_codec_dec_init(&codec, interface, NULL, 0);
}
if(res)
die_codec(&codec, "Failed to initialize decoder");

View File

@@ -120,7 +120,7 @@ enum mkv
//video
Video = 0xE0,
FlagInterlaced = 0x9A,
StereoMode = 0x53B8,
// StereoMode = 0x53B8,
PixelWidth = 0xB0,
PixelHeight = 0xBA,
PixelCropBottom = 0x54AA,

View File

@@ -11,7 +11,6 @@
#include <stdlib.h>
#include <wchar.h>
#include <string.h>
#include <limits.h>
#if defined(_MSC_VER)
#define LITERALU64(n) n
#else
@@ -34,7 +33,7 @@ void Ebml_WriteLen(EbmlGlobal *glob, long long val)
val |= (LITERALU64(0x000000000000080) << ((size - 1) * 7));
Ebml_Serialize(glob, (void *) &val, sizeof(val), size);
Ebml_Serialize(glob, (void *) &val, size);
}
void Ebml_WriteString(EbmlGlobal *glob, const char *str)
@@ -61,26 +60,21 @@ void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr)
void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id)
{
int len;
if (class_id >= 0x01000000)
len = 4;
Ebml_Serialize(glob, (void *)&class_id, 4);
else if (class_id >= 0x00010000)
len = 3;
Ebml_Serialize(glob, (void *)&class_id, 3);
else if (class_id >= 0x00000100)
len = 2;
Ebml_Serialize(glob, (void *)&class_id, 2);
else
len = 1;
Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len);
Ebml_Serialize(glob, (void *)&class_id, 1);
}
void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui)
{
unsigned char sizeSerialized = 8 | 0x80;
Ebml_WriteID(glob, class_id);
Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
Ebml_Serialize(glob, &ui, sizeof(ui), 8);
Ebml_Serialize(glob, &sizeSerialized, 1);
Ebml_Serialize(glob, &ui, 8);
}
void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui)
@@ -103,8 +97,8 @@ void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned l
}
sizeSerialized = 0x80 | size;
Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
Ebml_Serialize(glob, &ui, sizeof(ui), size);
Ebml_Serialize(glob, &sizeSerialized, 1);
Ebml_Serialize(glob, &ui, size);
}
//TODO: perhaps this is a poor name for this id serializer helper function
void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin)
@@ -125,14 +119,14 @@ void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d)
unsigned char len = 0x88;
Ebml_WriteID(glob, class_id);
Ebml_Serialize(glob, &len, sizeof(len), 1);
Ebml_Serialize(glob, &d, sizeof(d), 8);
Ebml_Serialize(glob, &len, 1);
Ebml_Serialize(glob, &d, 8);
}
void Ebml_WriteSigned16(EbmlGlobal *glob, short val)
{
signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8;
Ebml_Serialize(glob, &out, sizeof(out), 3);
Ebml_Serialize(glob, &out, 3);
}
void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s)
@@ -149,6 +143,7 @@ void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s)
void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length)
{
unsigned char size = 4;
Ebml_WriteID(glob, class_id);
Ebml_WriteLen(glob, data_length);
Ebml_Write(glob, data, data_length);

View File

@@ -15,7 +15,7 @@
#include "vpx/vpx_integer.h"
typedef struct EbmlGlobal EbmlGlobal;
void Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long);
void Ebml_Serialize(EbmlGlobal *glob, const void *, unsigned long);
void Ebml_Write(EbmlGlobal *glob, const void *, unsigned long);
/////

View File

@@ -35,11 +35,11 @@ void writeSimpleBlock(EbmlGlobal *glob, unsigned char trackNumber, short timeCod
Ebml_WriteID(glob, SimpleBlock);
unsigned long blockLength = 4 + dataLength;
blockLength |= 0x10000000; //TODO check length < 0x0FFFFFFFF
Ebml_Serialize(glob, &blockLength, sizeof(blockLength), 4);
Ebml_Serialize(glob, &blockLength, 4);
trackNumber |= 0x80; //TODO check track nubmer < 128
Ebml_Write(glob, &trackNumber, 1);
//Ebml_WriteSigned16(glob, timeCode,2); //this is 3 bytes
Ebml_Serialize(glob, &timeCode, sizeof(timeCode), 2);
Ebml_Serialize(glob, &timeCode, 2);
unsigned char flags = 0x00 | (isKeyframe ? 0x80 : 0x00) | (lacingFlag << 1) | discardable;
Ebml_Write(glob, &flags, 1);
Ebml_Write(glob, data, dataLength);

69
libs.mk
View File

@@ -35,7 +35,6 @@ ifeq ($(CONFIG_VP8_ENCODER),yes)
CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_CX_EXPORTS))
CODEC_SRCS-yes += $(VP8_PREFIX)vp8cx.mk vpx/vp8.h vpx/vp8cx.h vpx/vp8e.h
CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8cx_arm.mk
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8e.h include/vpx/vp8cx.h
INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
@@ -48,7 +47,6 @@ ifeq ($(CONFIG_VP8_DECODER),yes)
CODEC_SRCS-yes += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_DX_SRCS))
CODEC_EXPORTS-yes += $(addprefix $(VP8_PREFIX),$(VP8_DX_EXPORTS))
CODEC_SRCS-yes += $(VP8_PREFIX)vp8dx.mk vpx/vp8.h vpx/vp8dx.h
CODEC_SRCS-$(ARCH_ARM) += $(VP8_PREFIX)vp8dx_arm.mk
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8dx.h
INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP8_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8dx.h
@@ -91,7 +89,6 @@ $(eval $(if $(filter universal%,$(TOOLCHAIN)),LIPO_LIBVPX,BUILD_LIBVPX):=yes)
CODEC_SRCS-$(BUILD_LIBVPX) += build/make/version.sh
CODEC_SRCS-$(BUILD_LIBVPX) += vpx/vpx_integer.h
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/asm_offsets.h
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/vpx_timer.h
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/mem.h
CODEC_SRCS-$(BUILD_LIBVPX) += $(BUILD_PFX)vpx_config.c
@@ -103,7 +100,7 @@ CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_abi_support.asm
CODEC_SRCS-$(BUILD_LIBVPX) += vpx_ports/x86_cpuid.c
endif
CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm_cpudetect.c
CODEC_SRCS-$(ARCH_ARM) += vpx_ports/arm.h
CODEC_SRCS-$(ARCH_ARM) += $(BUILD_PFX)vpx_config.asm
CODEC_EXPORTS-$(BUILD_LIBVPX) += vpx/exports_com
CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc
CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
@@ -124,8 +121,20 @@ INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/v
INSTALL-LIBS-$(CONFIG_SHARED) += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/vpx.exp)
endif
else
INSTALL-LIBS-$(CONFIG_STATIC) += $(LIBSUBDIR)/libvpx.a
INSTALL-LIBS-yes += $(LIBSUBDIR)/libvpx.a
INSTALL-LIBS-$(CONFIG_DEBUG_LIBS) += $(LIBSUBDIR)/libvpx_g.a
#Install the OpenCL kernels if CL enabled.
ifeq ($(CONFIG_OPENCL),yes)
INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/filter_cl.cl
INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/idctllm_cl.cl
INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/common/opencl/loopfilter.cl
#only install decoder CL files if VP8 decoder enabled
ifeq ($(CONFIG_VP8_DECODER),yes)
INSTALL-LIBS-yes += $(LIBSUBDIR)/vp8/decoder/opencl/dequantize_cl.cl
endif
endif #CONFIG_OPENCL=yes
endif
CODEC_SRCS=$(call enabled,CODEC_SRCS)
@@ -180,15 +189,14 @@ endif
else
LIBVPX_OBJS=$(call objs,$(CODEC_SRCS))
OBJS-$(BUILD_LIBVPX) += $(LIBVPX_OBJS)
LIBS-$(if $(BUILD_LIBVPX),$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
LIBS-$(BUILD_LIBVPX) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a
$(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
BUILD_LIBVPX_SO := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED))
LIBVPX_SO := libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)\
$(notdir $(LIBVPX_SO_SYMLINKS))
LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)
$(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) libvpx.ver
$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm
$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm -pthread
$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(VERSION_MAJOR)
$(BUILD_PFX)$(LIBVPX_SO): SO_VERSION_SCRIPT = libvpx.ver
LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \
@@ -202,18 +210,9 @@ libvpx.ver: $(call enabled,CODEC_EXPORTS)
$(qexec)echo "local: *; };" >> $@
CLEAN-OBJS += libvpx.ver
define libvpx_symlink_template
$(1): $(2)
@echo " [LN] $$@"
$(qexec)ln -sf $(LIBVPX_SO) $$@
endef
$(eval $(call libvpx_symlink_template,\
$(addprefix $(BUILD_PFX),$(notdir $(LIBVPX_SO_SYMLINKS))),\
$(BUILD_PFX)$(LIBVPX_SO)))
$(eval $(call libvpx_symlink_template,\
$(addprefix $(DIST_DIR)/,$(LIBVPX_SO_SYMLINKS)),\
$(DIST_DIR)/$(LIBSUBDIR)/$(LIBVPX_SO)))
$(addprefix $(DIST_DIR)/,$(LIBVPX_SO_SYMLINKS)):
@echo " [LN] $@"
$(qexec)ln -sf $(LIBVPX_SO) $@
INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBVPX_SO_SYMLINKS)
INSTALL-LIBS-$(CONFIG_SHARED) += $(LIBSUBDIR)/$(LIBVPX_SO)
@@ -270,38 +269,29 @@ $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
#
# Calculate platform- and compiler-specific offsets for hand coded assembly
#
ifeq ($(filter icc gcc,$(TGT_CC)), $(TGT_CC))
$(BUILD_PFX)asm_com_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S: $(VP8_PREFIX)common/asm_com_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_com_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)common/asm_com_offsets.c.S
$(BUILD_PFX)asm_enc_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S: $(VP8_PREFIX)encoder/asm_enc_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_enc_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)encoder/asm_enc_offsets.c.S
$(BUILD_PFX)asm_dec_offsets.asm: $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
grep EQU $< | tr -d '$$\#' $(ADS2GAS) > $@
$(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S: $(VP8_PREFIX)decoder/asm_dec_offsets.c
CLEAN-OBJS += $(BUILD_PFX)asm_dec_offsets.asm $(BUILD_PFX)$(VP8_PREFIX)decoder/asm_dec_offsets.c.S
else
ifeq ($(filter rvct,$(TGT_CC)), $(TGT_CC))
ifeq ($(CONFIG_EXTERNAL_BUILD),) # Visual Studio uses obj_int_extract.bat
ifeq ($(ARCH_ARM), yes)
asm_com_offsets.asm: obj_int_extract
asm_com_offsets.asm: $(VP8_PREFIX)common/asm_com_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
OBJS-yes += $(VP8_PREFIX)common/asm_com_offsets.c.o
CLEAN-OBJS += asm_com_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_com_offsets.asm
endif
ifeq ($(ARCH_ARM)$(ARCH_X86)$(ARCH_X86_64), yes)
ifeq ($(CONFIG_VP8_ENCODER), yes)
asm_enc_offsets.asm: obj_int_extract
asm_enc_offsets.asm: $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
OBJS-yes += $(VP8_PREFIX)encoder/asm_enc_offsets.c.o
CLEAN-OBJS += asm_enc_offsets.asm
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_enc_offsets.asm
endif
endif
ifeq ($(ARCH_ARM), yes)
ifeq ($(CONFIG_VP8_DECODER), yes)
asm_dec_offsets.asm: obj_int_extract
asm_dec_offsets.asm: $(VP8_PREFIX)decoder/asm_dec_offsets.c.o
./obj_int_extract rvds $< $(ADS2GAS) > $@
@@ -310,6 +300,7 @@ else
$(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)asm_dec_offsets.asm
endif
endif
endif
$(shell $(SRC_PATH_BARE)/build/make/version.sh "$(SRC_PATH_BARE)" $(BUILD_PFX)vpx_version.h)
CLEAN-OBJS += $(BUILD_PFX)vpx_version.h

View File

@@ -27,9 +27,6 @@ static void update_mode_info_border(MODE_INFO *mi, int rows, int cols)
for (i = 0; i < rows; i++)
{
/* TODO(holmer): Bug? This updates the last element of each row
* rather than the border element!
*/
vpx_memset(&mi[i*cols-1], 0, sizeof(MODE_INFO));
}
}
@@ -46,11 +43,9 @@ void vp8_de_alloc_frame_buffers(VP8_COMMON *oci)
vpx_free(oci->above_context);
vpx_free(oci->mip);
vpx_free(oci->prev_mip);
oci->above_context = 0;
oci->mip = 0;
oci->prev_mip = 0;
}
@@ -70,8 +65,8 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
for (i = 0; i < NUM_YV12_BUFFERS; i++)
{
oci->fb_idx_ref_cnt[i] = 0;
oci->yv12_fb[i].flags = 0;
oci->fb_idx_ref_cnt[0] = 0;
if (vp8_yv12_alloc_frame_buffer(&oci->yv12_fb[i], width, height, VP8BORDERINPIXELS) < 0)
{
vp8_de_alloc_frame_buffers(oci);
@@ -115,21 +110,6 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
oci->mi = oci->mip + oci->mode_info_stride + 1;
/* allocate memory for last frame MODE_INFO array */
#if CONFIG_ERROR_CONCEALMENT
oci->prev_mip = vpx_calloc((oci->mb_cols + 1) * (oci->mb_rows + 1), sizeof(MODE_INFO));
if (!oci->prev_mip)
{
vp8_de_alloc_frame_buffers(oci);
return 1;
}
oci->prev_mi = oci->prev_mip + oci->mode_info_stride + 1;
#else
oci->prev_mip = NULL;
oci->prev_mi = NULL;
#endif
oci->above_context = vpx_calloc(sizeof(ENTROPY_CONTEXT_PLANES) * oci->mb_cols, 1);
@@ -140,9 +120,6 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height)
}
update_mode_info_border(oci->mi, oci->mb_rows, oci->mb_cols);
#if CONFIG_ERROR_CONCEALMENT
update_mode_info_border(oci->prev_mi, oci->mb_rows, oci->mb_cols);
#endif
return 0;
}
@@ -152,33 +129,33 @@ void vp8_setup_version(VP8_COMMON *cm)
{
case 0:
cm->no_lpf = 0;
cm->filter_type = NORMAL_LOOPFILTER;
cm->use_bilinear_mc_filter = 0;
cm->simpler_lpf = 0;
cm->mcomp_filter_type = SIXTAP;
cm->full_pixel = 0;
break;
case 1:
cm->no_lpf = 0;
cm->filter_type = SIMPLE_LOOPFILTER;
cm->use_bilinear_mc_filter = 1;
cm->simpler_lpf = 1;
cm->mcomp_filter_type = BILINEAR;
cm->full_pixel = 0;
break;
case 2:
cm->no_lpf = 1;
cm->filter_type = NORMAL_LOOPFILTER;
cm->use_bilinear_mc_filter = 1;
cm->simpler_lpf = 0;
cm->mcomp_filter_type = BILINEAR;
cm->full_pixel = 0;
break;
case 3:
cm->no_lpf = 1;
cm->filter_type = SIMPLE_LOOPFILTER;
cm->use_bilinear_mc_filter = 1;
cm->simpler_lpf = 1;
cm->mcomp_filter_type = BILINEAR;
cm->full_pixel = 1;
break;
default:
/*4,5,6,7 are reserved for future use*/
cm->no_lpf = 0;
cm->filter_type = NORMAL_LOOPFILTER;
cm->use_bilinear_mc_filter = 0;
cm->simpler_lpf = 0;
cm->mcomp_filter_type = SIXTAP;
cm->full_pixel = 0;
break;
}
@@ -192,8 +169,8 @@ void vp8_create_common(VP8_COMMON *oci)
oci->mb_no_coeff_skip = 1;
oci->no_lpf = 0;
oci->filter_type = NORMAL_LOOPFILTER;
oci->use_bilinear_mc_filter = 0;
oci->simpler_lpf = 0;
oci->mcomp_filter_type = SIXTAP;
oci->full_pixel = 0;
oci->multi_token_partition = ONE_PARTITION;
oci->clr_type = REG_YUV;

View File

@@ -24,17 +24,14 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
#if CONFIG_RUNTIME_CPU_DETECT
VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
int flags = arm_cpu_caps();
int has_edsp = flags & HAS_EDSP;
int has_media = flags & HAS_MEDIA;
int has_neon = flags & HAS_NEON;
rtcd->flags = flags;
/* Override default functions with fastest ones for this CPU. */
#if HAVE_ARMV5TE
if (flags & HAS_EDSP)
{
}
#endif
#if HAVE_ARMV6
if (flags & HAS_MEDIA)
if (has_media)
{
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_armv6;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_armv6;
@@ -54,11 +51,9 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6;
rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6;
rtcd->loopfilter.simple_mb_v =
vp8_loop_filter_simple_vertical_edge_armv6;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6;
rtcd->loopfilter.simple_mb_h =
vp8_loop_filter_simple_horizontal_edge_armv6;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
@@ -71,7 +66,7 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
#endif
#if HAVE_ARMV7
if (flags & HAS_NEON)
if (has_neon)
{
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_neon;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_neon;

View File

@@ -30,12 +30,12 @@
ldr r4, [sp, #36] ; width
mov r12, r3 ; outer-loop counter
add r7, r2, r4 ; preload next row
pld [r0, r7]
sub r2, r2, r4 ; src increment for height loop
;;IF ARCHITECTURE=6
pld [r0]
;;ENDIF
ldr r5, [r11] ; load up filter coefficients
mov r3, r3, lsl #1 ; height*2
@@ -96,8 +96,9 @@
add r0, r0, r2 ; move to next input row
subs r12, r12, #1
add r9, r2, r4, lsl #1 ; adding back block width
pld [r0, r9] ; preload next row
;;IF ARCHITECTURE=6
pld [r0]
;;ENDIF
add r11, r11, #2 ; move over to next column
mov r1, r11

View File

@@ -22,7 +22,9 @@
;push {r4-r7}
;preload
pld [r0, #31] ; preload for next 16x16 block
pld [r0]
pld [r0, r1]
pld [r0, r1, lsl #1]
ands r4, r0, #15
beq copy_mem16x16_fast
@@ -88,8 +90,6 @@ copy_mem16x16_1_loop
ldrneb r6, [r0, #2]
ldrneb r7, [r0, #3]
pld [r0, #31] ; preload for next 16x16 block
bne copy_mem16x16_1_loop
ldmia sp!, {r4 - r7}
@@ -121,8 +121,6 @@ copy_mem16x16_4_loop
ldrne r6, [r0, #8]
ldrne r7, [r0, #12]
pld [r0, #31] ; preload for next 16x16 block
bne copy_mem16x16_4_loop
ldmia sp!, {r4 - r7}
@@ -150,7 +148,6 @@ copy_mem16x16_8_loop
add r2, r2, r3
pld [r0, #31] ; preload for next 16x16 block
bne copy_mem16x16_8_loop
ldmia sp!, {r4 - r7}
@@ -174,7 +171,6 @@ copy_mem16x16_fast_loop
;stm r2, {r4-r7}
add r2, r2, r3
pld [r0, #31] ; preload for next 16x16 block
bne copy_mem16x16_fast_loop
ldmia sp!, {r4 - r7}

View File

@@ -10,8 +10,6 @@
EXPORT |vp8_filter_block2d_first_pass_armv6|
EXPORT |vp8_filter_block2d_first_pass_16x16_armv6|
EXPORT |vp8_filter_block2d_first_pass_8x8_armv6|
EXPORT |vp8_filter_block2d_second_pass_armv6|
EXPORT |vp8_filter4_block2d_second_pass_armv6|
EXPORT |vp8_filter_block2d_first_pass_only_armv6|
@@ -42,6 +40,11 @@
add r12, r3, #16 ; square off the output
sub sp, sp, #4
;;IF ARCHITECTURE=6
;pld [r0, #-2]
;;pld [r0, #30]
;;ENDIF
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
@@ -98,10 +101,15 @@
bne width_loop_1st_6
;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
;;IF ARCHITECTURE=6
;pld [r0, r2]
;;pld [r0, r9]
;;ENDIF
ldr r1, [sp] ; load and update dst address
subs r7, r7, #0x10000
add r0, r0, r2 ; move to next input line
add r1, r1, #2 ; move over to next column
str r1, [sp]
@@ -112,192 +120,6 @@
ENDP
; --------------------------
; 16x16 version
; -----------------------------
|vp8_filter_block2d_first_pass_16x16_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp8_filter address
ldr r7, [sp, #36] ; output height
add r4, r2, #18 ; preload next low
pld [r0, r4]
sub r2, r2, r3 ; inside loop increments input array,
; so the height loop only needs to add
; r2 - width to the input pointer
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
add r12, r3, #16 ; square off the output
sub sp, sp, #4
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
str r1, [sp] ; push destination to stack
mov r7, r7, lsl #16 ; height is top part of counter
; six tap filter
|height_loop_1st_16_6|
ldrb r8, [r0, #-2] ; load source data
ldrb r9, [r0, #-1]
ldrb r10, [r0], #2
orr r7, r7, r3, lsr #2 ; construct loop counter
|width_loop_1st_16_6|
ldrb r11, [r0, #-1]
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
ldrb r9, [r0]
smuad lr, lr, r4 ; apply the filter
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smuad r8, r8, r4
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
smlad lr, r10, r5, lr
ldrb r10, [r0, #1]
smlad r8, r11, r5, r8
ldrb r11, [r0, #2]
sub r7, r7, #1
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smlad lr, r9, r6, lr
smlad r11, r10, r6, r8
ands r10, r7, #0xff ; test loop counter
add lr, lr, #0x40 ; round_shift_and_clamp
ldrneb r8, [r0, #-2] ; load data for next loop
usat lr, #8, lr, asr #7
add r11, r11, #0x40
ldrneb r9, [r0, #-1]
usat r11, #8, r11, asr #7
strh lr, [r1], r12 ; result is transposed and stored, which
; will make second pass filtering easier.
ldrneb r10, [r0], #2
strh r11, [r1], r12
bne width_loop_1st_16_6
ldr r1, [sp] ; load and update dst address
subs r7, r7, #0x10000
add r0, r0, r2 ; move to next input line
add r11, r2, #34 ; adding back block width(=16)
pld [r0, r11] ; preload next low
add r1, r1, #2 ; move over to next column
str r1, [sp]
bne height_loop_1st_16_6
add sp, sp, #4
ldmia sp!, {r4 - r11, pc}
ENDP
; --------------------------
; 8x8 version
; -----------------------------
|vp8_filter_block2d_first_pass_8x8_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #40] ; vp8_filter address
ldr r7, [sp, #36] ; output height
add r4, r2, #10 ; preload next low
pld [r0, r4]
sub r2, r2, r3 ; inside loop increments input array,
; so the height loop only needs to add
; r2 - width to the input pointer
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
add r12, r3, #16 ; square off the output
sub sp, sp, #4
ldr r4, [r11] ; load up packed filter coefficients
ldr r5, [r11, #4]
ldr r6, [r11, #8]
str r1, [sp] ; push destination to stack
mov r7, r7, lsl #16 ; height is top part of counter
; six tap filter
|height_loop_1st_8_6|
ldrb r8, [r0, #-2] ; load source data
ldrb r9, [r0, #-1]
ldrb r10, [r0], #2
orr r7, r7, r3, lsr #2 ; construct loop counter
|width_loop_1st_8_6|
ldrb r11, [r0, #-1]
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
ldrb r9, [r0]
smuad lr, lr, r4 ; apply the filter
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smuad r8, r8, r4
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
smlad lr, r10, r5, lr
ldrb r10, [r0, #1]
smlad r8, r11, r5, r8
ldrb r11, [r0, #2]
sub r7, r7, #1
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
smlad lr, r9, r6, lr
smlad r11, r10, r6, r8
ands r10, r7, #0xff ; test loop counter
add lr, lr, #0x40 ; round_shift_and_clamp
ldrneb r8, [r0, #-2] ; load data for next loop
usat lr, #8, lr, asr #7
add r11, r11, #0x40
ldrneb r9, [r0, #-1]
usat r11, #8, r11, asr #7
strh lr, [r1], r12 ; result is transposed and stored, which
; will make second pass filtering easier.
ldrneb r10, [r0], #2
strh r11, [r1], r12
bne width_loop_1st_8_6
ldr r1, [sp] ; load and update dst address
subs r7, r7, #0x10000
add r0, r0, r2 ; move to next input line
add r11, r2, #18 ; adding back block width(=8)
pld [r0, r11] ; preload next low
add r1, r1, #2 ; move over to next column
str r1, [sp]
bne height_loop_1st_8_6
add sp, sp, #4
ldmia sp!, {r4 - r11, pc}
ENDP
;---------------------------------
; r0 short *src_ptr,
; r1 unsigned char *output_ptr,
@@ -440,10 +262,6 @@
|vp8_filter_block2d_first_pass_only_armv6| PROC
stmdb sp!, {r4 - r11, lr}
add r7, r2, r3 ; preload next low
add r7, r7, #2
pld [r0, r7]
ldr r4, [sp, #36] ; output pitch
ldr r11, [sp, #40] ; HFilter address
sub sp, sp, #8
@@ -512,15 +330,16 @@
bne width_loop_1st_only_6
;;add r9, r2, #30 ; attempt to load 2 adjacent cache lines
;;IF ARCHITECTURE=6
;pld [r0, r2]
;;pld [r0, r9]
;;ENDIF
ldr lr, [sp] ; load back output pitch
ldr r12, [sp, #4] ; load back output pitch
subs r7, r7, #1
add r0, r0, r12 ; updata src for next loop
add r11, r12, r3 ; preload next low
add r11, r11, #2
pld [r0, r11]
add r1, r1, lr ; update dst for next loop
bne height_loop_1st_only_6

View File

@@ -53,11 +53,14 @@ count RN r5
;r0 unsigned char *src_ptr,
;r1 int src_pixel_step,
;r2 const char *blimit,
;r2 const char *flimit,
;r3 const char *limit,
;stack const char *thresh,
;stack int count
;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
@@ -69,18 +72,14 @@ count RN r5
sub sp, sp, #16 ; create temp buffer
ldr r9, [src], pstep ; p3
ldrb r4, [r2] ; blimit
ldr r4, [r2], #4 ; flimit
ldr r10, [src], pstep ; p2
ldrb r2, [r3] ; limit
ldr r2, [r3], #4 ; limit
ldr r11, [src], pstep ; p1
orr r4, r4, r4, lsl #8
ldrb r3, [r6] ; thresh
orr r2, r2, r2, lsl #8
uadd8 r4, r4, r4 ; flimit * 2
ldr r3, [r6], #4 ; thresh
mov count, count, lsl #1 ; 4-in-parallel
orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
uadd8 r4, r4, r2 ; flimit * 2 + limit
|Hnext8|
; vp8_filter_mask() function
@@ -254,6 +253,12 @@ count RN r5
subs count, count, #1
;pld [src]
;pld [src, pstep]
;pld [src, pstep, lsl #1]
;pld [src, pstep, lsl #2]
;pld [src, pstep, lsl #3]
ldrne r9, [src], pstep ; p3
ldrne r10, [src], pstep ; p2
ldrne r11, [src], pstep ; p1
@@ -276,18 +281,14 @@ count RN r5
sub sp, sp, #16 ; create temp buffer
ldr r9, [src], pstep ; p3
ldrb r4, [r2] ; blimit
ldr r4, [r2], #4 ; flimit
ldr r10, [src], pstep ; p2
ldrb r2, [r3] ; limit
ldr r2, [r3], #4 ; limit
ldr r11, [src], pstep ; p1
orr r4, r4, r4, lsl #8
ldrb r3, [r6] ; thresh
orr r2, r2, r2, lsl #8
uadd8 r4, r4, r4 ; flimit * 2
ldr r3, [r6], #4 ; thresh
mov count, count, lsl #1 ; 4-in-parallel
orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
uadd8 r4, r4, r2 ; flimit * 2 + limit
|MBHnext8|
@@ -589,19 +590,15 @@ count RN r5
sub sp, sp, #16 ; create temp buffer
ldr r6, [src], pstep ; load source data
ldrb r4, [r2] ; blimit
ldr r4, [r2], #4 ; flimit
ldr r7, [src], pstep
ldrb r2, [r3] ; limit
ldr r2, [r3], #4 ; limit
ldr r8, [src], pstep
orr r4, r4, r4, lsl #8
ldrb r3, [r12] ; thresh
orr r2, r2, r2, lsl #8
uadd8 r4, r4, r4 ; flimit * 2
ldr r3, [r12], #4 ; thresh
ldr lr, [src], pstep
mov count, count, lsl #1 ; 4-in-parallel
orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
uadd8 r4, r4, r2 ; flimit * 2 + limit
|Vnext8|
@@ -860,26 +857,18 @@ count RN r5
sub src, src, #4 ; move src pointer down by 4
ldr count, [sp, #40] ; count for 8-in-parallel
ldr r12, [sp, #36] ; load thresh address
pld [src, #23] ; preload for next block
sub sp, sp, #16 ; create temp buffer
ldr r6, [src], pstep ; load source data
ldrb r4, [r2] ; blimit
pld [src, #23]
ldr r4, [r2], #4 ; flimit
ldr r7, [src], pstep
ldrb r2, [r3] ; limit
pld [src, #23]
ldr r2, [r3], #4 ; limit
ldr r8, [src], pstep
orr r4, r4, r4, lsl #8
ldrb r3, [r12] ; thresh
orr r2, r2, r2, lsl #8
pld [src, #23]
uadd8 r4, r4, r4 ; flimit * 2
ldr r3, [r12], #4 ; thresh
ldr lr, [src], pstep
mov count, count, lsl #1 ; 4-in-parallel
orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
uadd8 r4, r4, r2 ; flimit * 2 + limit
|MBVnext8|
; vp8_filter_mask() function
@@ -919,7 +908,6 @@ count RN r5
str lr, [sp, #8]
ldr lr, [src], pstep
TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
ldr lr, [sp, #8] ; load back (f)limit accumulator
@@ -968,7 +956,6 @@ count RN r5
beq mbvskip_filter ; skip filtering
;vp8_hevmask() function
;calculate high edge variance
@@ -1136,7 +1123,6 @@ count RN r5
smlabb r8, r6, lr, r7
smlatb r6, r6, lr, r7
smlabb r9, r10, lr, r7
smlatb r10, r10, lr, r7
ssat r8, #8, r8, asr #7
ssat r6, #8, r6, asr #7
@@ -1256,13 +1242,9 @@ count RN r5
sub src, src, #4
subs count, count, #1
pld [src, #23] ; preload for next block
ldrne r6, [src], pstep ; load source data
pld [src, #23]
ldrne r7, [src], pstep
pld [src, #23]
ldrne r8, [src], pstep
pld [src, #23]
ldrne lr, [src], pstep
bne MBVnext8

View File

@@ -45,28 +45,35 @@
MEND
src RN r0
pstep RN r1
;r0 unsigned char *src_ptr,
;r1 int src_pixel_step,
;r2 const char *blimit
;r2 const char *flimit,
;r3 const char *limit,
;stack const char *thresh,
;stack int count
; All 16 elements in flimit are equal. So, in the code, only one load is needed
; for flimit. Same applies to limit. thresh is not used in simple looopfilter
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
ldrb r12, [r2] ; blimit
ldr r12, [r3] ; limit
ldr r3, [src, -pstep, lsl #1] ; p1
ldr r4, [src, -pstep] ; p0
ldr r5, [src] ; q0
ldr r6, [src, pstep] ; q1
orr r12, r12, r12, lsl #8 ; blimit
ldr r7, [r2] ; flimit
ldr r2, c0x80808080
orr r12, r12, r12, lsl #16 ; blimit
mov r9, #4 ; double the count. we're doing 4 at a time
ldr r9, [sp, #40] ; count for 8-in-parallel
uadd8 r7, r7, r7 ; flimit * 2
mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time
uadd8 r12, r7, r12 ; flimit * 2 + limit
mov lr, #0 ; need 0 in a couple places
|simple_hnext8|
@@ -141,32 +148,30 @@ pstep RN r1
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
ldrb r12, [r2] ; r12: blimit
ldr r12, [r2] ; r12: flimit
ldr r2, c0x80808080
orr r12, r12, r12, lsl #8
ldr r7, [r3] ; limit
; load soure data to r7, r8, r9, r10
ldrh r3, [src, #-2]
pld [src, #23] ; preload for next block
ldrh r4, [src], pstep
orr r12, r12, r12, lsl #16
uadd8 r12, r12, r12 ; flimit * 2
ldrh r5, [src, #-2]
pld [src, #23]
ldrh r6, [src], pstep
uadd8 r12, r12, r7 ; flimit * 2 + limit
pkhbt r7, r3, r4, lsl #16
ldrh r3, [src, #-2]
pld [src, #23]
ldrh r4, [src], pstep
ldr r11, [sp, #40] ; count (r11) for 8-in-parallel
pkhbt r8, r5, r6, lsl #16
ldrh r5, [src, #-2]
pld [src, #23]
ldrh r6, [src], pstep
mov r11, #4 ; double the count. we're doing 4 at a time
mov r11, r11, lsl #1 ; 4-in-parallel
|simple_vnext8|
; vp8_simple_filter_mask() function
@@ -254,23 +259,19 @@ pstep RN r1
; load soure data to r7, r8, r9, r10
ldrneh r3, [src, #-2]
pld [src, #23] ; preload for next block
ldrneh r4, [src], pstep
ldrneh r5, [src, #-2]
pld [src, #23]
ldrneh r6, [src], pstep
pkhbt r7, r3, r4, lsl #16
ldrneh r3, [src, #-2]
pld [src, #23]
ldrneh r4, [src], pstep
pkhbt r8, r5, r6, lsl #16
ldrneh r5, [src, #-2]
pld [src, #23]
ldrneh r6, [src], pstep
bne simple_vnext8

View File

@@ -32,12 +32,9 @@
beq skip_firstpass_filter
;first-pass filter
adr r12, filter8_coeff
ldr r12, _filter8_coeff_
sub r0, r0, r1, lsl #1
add r3, r1, #10 ; preload next low
pld [r0, r3]
add r2, r12, r2, lsl #4 ;calculate filter location
add r0, r0, #3 ;adjust src only for loading convinience
@@ -113,9 +110,6 @@
add r0, r0, r1 ; move to next input line
add r11, r1, #18 ; preload next low. adding back block width(=8), which is subtracted earlier
pld [r0, r11]
bne first_pass_hloop_v6
;second pass filter
@@ -127,7 +121,7 @@ secondpass_filter
cmp r3, #0
beq skip_secondpass_filter
adr r12, filter8_coeff
ldr r12, _filter8_coeff_
add lr, r12, r3, lsl #4 ;calculate filter location
mov r2, #0x00080000
@@ -251,6 +245,8 @@ skip_secondpass_hloop
;-----------------
;One word each is reserved. Label filter_coeff can be used to access the data.
;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ...
_filter8_coeff_
DCD filter8_coeff
filter8_coeff
DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000

View File

@@ -25,28 +25,6 @@ extern void vp8_filter_block2d_first_pass_armv6
const short *vp8_filter
);
// 8x8
extern void vp8_filter_block2d_first_pass_8x8_armv6
(
unsigned char *src_ptr,
short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_width,
unsigned int output_height,
const short *vp8_filter
);
// 16x16
extern void vp8_filter_block2d_first_pass_16x16_armv6
(
unsigned char *src_ptr,
short *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_width,
unsigned int output_height,
const short *vp8_filter
);
extern void vp8_filter_block2d_second_pass_armv6
(
short *src_ptr,
@@ -165,12 +143,12 @@ void vp8_sixtap_predict8x8_armv6
{
if (yoffset & 0x1)
{
vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
}
else
{
vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
}
}
@@ -207,12 +185,12 @@ void vp8_sixtap_predict16x16_armv6
{
if (yoffset & 0x1)
{
vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
}
else
{
vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
}
}

View File

@@ -9,107 +9,135 @@
*/
#include "vpx_config.h"
#include "vpx_ports/config.h"
#include <math.h>
#include "vp8/common/loopfilter.h"
#include "vp8/common/onyxc_int.h"
#if HAVE_ARMV6
extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
#endif
extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
#if HAVE_ARMV7
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh);
typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh,
unsigned char *v);
extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon);
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon);
extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon);
extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon);
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon;
extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon;
extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
#endif
#if HAVE_ARMV6
/*ARMV6 loopfilter functions*/
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
(void) simpler_lpf;
vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
if (v_ptr)
vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
(void) simpler_lpf;
vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
if (v_ptr)
vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
(void) simpler_lpf;
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
if (v_ptr)
vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
(void) simpler_lpf;
vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
if (v_ptr)
vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
#endif
@@ -117,60 +145,93 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
/* NEON loopfilter functions */
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
unsigned char mblim = *lfi->mblim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
(void) simpler_lpf;
vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
}
void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
unsigned char mblim = *lfi->mblim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
(void) simpler_lpf;
vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
}
void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
unsigned char blim = *lfi->blim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
(void) simpler_lpf;
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
}
void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
unsigned char blim = *lfi->blim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
(void) simpler_lpf;
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
}
void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
#endif

View File

@@ -12,17 +12,15 @@
#ifndef LOOPFILTER_ARM_H
#define LOOPFILTER_ARM_H
#include "vpx_config.h"
#if HAVE_ARMV6
extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6);
extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6);
extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v
@@ -38,29 +36,28 @@ extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
#define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6
#undef vp8_lf_simple_mb_v
#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6
#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6
#undef vp8_lf_simple_b_v
#define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6
#undef vp8_lf_simple_mb_h
#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6
#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
#endif /* !CONFIG_RUNTIME_CPU_DETECT */
#endif /* HAVE_ARMV6 */
#endif
#endif
#if HAVE_ARMV7
extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bv_neon);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bh_neon);
extern prototype_simple_loopfilter(vp8_loop_filter_mbvs_neon);
extern prototype_simple_loopfilter(vp8_loop_filter_bvs_neon);
extern prototype_simple_loopfilter(vp8_loop_filter_mbhs_neon);
extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v
@@ -86,8 +83,7 @@ extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon);
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon
#endif /* !CONFIG_RUNTIME_CPU_DETECT */
#endif
#endif
#endif /* HAVE_ARMV7 */
#endif /* LOOPFILTER_ARM_H */
#endif

View File

@@ -25,7 +25,7 @@
|vp8_bilinear_predict16x16_neon| PROC
push {r4-r5, lr}
adr r12, bifilter16_coeff
ldr r12, _bifilter16_coeff_
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
@@ -351,6 +351,8 @@ filt_blk2d_spo16x16_loop_neon
;-----------------
_bifilter16_coeff_
DCD bifilter16_coeff
bifilter16_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

View File

@@ -25,7 +25,7 @@
|vp8_bilinear_predict4x4_neon| PROC
push {r4, lr}
adr r12, bifilter4_coeff
ldr r12, _bifilter4_coeff_
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
@@ -124,6 +124,8 @@ skip_secondpass_filter
;-----------------
_bifilter4_coeff_
DCD bifilter4_coeff
bifilter4_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

View File

@@ -25,7 +25,7 @@
|vp8_bilinear_predict8x4_neon| PROC
push {r4, lr}
adr r12, bifilter8x4_coeff
ldr r12, _bifilter8x4_coeff_
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
@@ -129,6 +129,8 @@ skip_secondpass_filter
;-----------------
_bifilter8x4_coeff_
DCD bifilter8x4_coeff
bifilter8x4_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

View File

@@ -25,7 +25,7 @@
|vp8_bilinear_predict8x8_neon| PROC
push {r4, lr}
adr r12, bifilter8_coeff
ldr r12, _bifilter8_coeff_
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
@@ -177,6 +177,8 @@ skip_secondpass_filter
;-----------------
_bifilter8_coeff_
DCD bifilter8_coeff
bifilter8_coeff
DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

View File

@@ -20,16 +20,19 @@
|vp8_short_inv_walsh4x4_neon| PROC
; read in all four lines of values: d0->d3
vld1.i16 {q0-q1}, [r0@128]
vldm.64 r0, {q0, q1}
; first for loop
vadd.s16 d4, d0, d3 ;a = [0] + [12]
vadd.s16 d6, d1, d2 ;b = [4] + [8]
vsub.s16 d5, d0, d3 ;d = [0] - [12]
vsub.s16 d7, d1, d2 ;c = [4] - [8]
vadd.s16 q0, q2, q3 ; a+b d+c
vsub.s16 q1, q2, q3 ; a-b d-c
vadd.s16 d4, d0, d3 ;a = [0] + [12]
vadd.s16 d5, d1, d2 ;b = [4] + [8]
vsub.s16 d6, d1, d2 ;c = [4] - [8]
vsub.s16 d7, d0, d3 ;d = [0] - [12]
vadd.s16 d0, d4, d5 ;a + b
vadd.s16 d1, d6, d7 ;c + d
vsub.s16 d2, d4, d5 ;a - b
vsub.s16 d3, d7, d6 ;d - c
vtrn.32 d0, d2 ;d0: 0 1 8 9
;d2: 2 3 10 11
@@ -44,22 +47,29 @@
; second for loop
vadd.s16 d4, d0, d3 ;a = [0] + [3]
vadd.s16 d6, d1, d2 ;b = [1] + [2]
vsub.s16 d5, d0, d3 ;d = [0] - [3]
vsub.s16 d7, d1, d2 ;c = [1] - [2]
vadd.s16 d5, d1, d2 ;b = [1] + [2]
vsub.s16 d6, d1, d2 ;c = [1] - [2]
vsub.s16 d7, d0, d3 ;d = [0] - [3]
vmov.i16 q8, #3
vadd.s16 d0, d4, d5 ;e = a + b
vadd.s16 d1, d6, d7 ;f = c + d
vsub.s16 d2, d4, d5 ;g = a - b
vsub.s16 d3, d7, d6 ;h = d - c
vadd.s16 q0, q2, q3 ; a+b d+c
vsub.s16 q1, q2, q3 ; a-b d-c
vadd.i16 q0, q0, q8 ;e/f += 3
vadd.i16 q1, q1, q8 ;g/h += 3
vmov.i16 q2, #3
vadd.i16 q0, q0, q2 ;e/f += 3
vadd.i16 q1, q1, q2 ;g/h += 3
vshr.s16 q0, q0, #3 ;e/f >> 3
vshr.s16 q1, q1, #3 ;g/h >> 3
vst4.i16 {d0,d1,d2,d3}, [r1@128]
vtrn.32 d0, d2
vtrn.32 d1, d3
vtrn.16 d0, d1
vtrn.16 d2, d3
vstmia.16 r1!, {q0}
vstmia.16 r1!, {q1}
bx lr
ENDP ; |vp8_short_inv_walsh4x4_neon|
@@ -67,13 +77,19 @@
;short vp8_short_inv_walsh4x4_1_neon(short *input, short *output)
|vp8_short_inv_walsh4x4_1_neon| PROC
ldrsh r2, [r0] ; load input[0]
add r3, r2, #3 ; add 3
add r2, r1, #16 ; base for last 8 output
asr r0, r3, #3 ; right shift 3
vdup.16 q0, r0 ; load and duplicate
vst1.16 {q0}, [r1@128] ; write back 8
vst1.16 {q0}, [r2@128] ; write back last 8
; load a full line into a neon register
vld1.16 {q0}, [r0]
; extract first element and replicate
vdup.16 q1, d0[0]
; add 3 to all values
vmov.i16 q2, #3
vadd.i16 q3, q1, q2
; right shift
vshr.s16 q3, q3, #3
; write it back
vstmia.16 r1!, {q3}
vstmia.16 r1!, {q3}
bx lr
ENDP ; |vp8_short_inv_walsh4x4_1_neon|

View File

@@ -14,97 +14,109 @@
EXPORT |vp8_loop_filter_vertical_edge_y_neon|
EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; flimit, limit, and thresh should be positive numbers.
; All 16 elements in these variables are equal.
; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; int count)
; r0 unsigned char *src
; r1 int pitch
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; r2 const signed char *flimit
; r3 const signed char *limit
; sp const signed char *thresh,
; sp+4 int count (unused)
|vp8_loop_filter_horizontal_edge_y_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
stmdb sp!, {lr}
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
ldr r3, [sp, #4] ; load thresh
add r12, r2, r1
add r1, r1, r1
ldr r12, [sp, #4] ; load thresh pointer
vdup.u8 q2, r3 ; duplicate thresh
vld1.u8 {q3}, [r2@128], r1 ; p3
vld1.u8 {q4}, [r12@128], r1 ; p2
vld1.u8 {q5}, [r2@128], r1 ; p1
vld1.u8 {q6}, [r12@128], r1 ; p0
vld1.u8 {q7}, [r2@128], r1 ; q0
vld1.u8 {q8}, [r12@128], r1 ; q1
vld1.u8 {q9}, [r2@128] ; q2
vld1.u8 {q10}, [r12@128] ; q3
sub r2, r2, r1, lsl #1
sub r12, r12, r1, lsl #1
vld1.u8 {q3}, [r2], r1 ; p3
vld1.u8 {q4}, [r2], r1 ; p2
vld1.u8 {q5}, [r2], r1 ; p1
vld1.u8 {q6}, [r2], r1 ; p0
vld1.u8 {q7}, [r2], r1 ; q0
vld1.u8 {q8}, [r2], r1 ; q1
vld1.u8 {q9}, [r2], r1 ; q2
vld1.u8 {q10}, [r2] ; q3
vld1.s8 {d4[], d5[]}, [r12] ; thresh
sub r0, r0, r1, lsl #1
bl vp8_loop_filter_neon
vst1.u8 {q5}, [r2@128], r1 ; store op1
vst1.u8 {q6}, [r12@128], r1 ; store op0
vst1.u8 {q7}, [r2@128], r1 ; store oq0
vst1.u8 {q8}, [r12@128], r1 ; store oq1
vst1.u8 {q5}, [r0], r1 ; store op1
vst1.u8 {q6}, [r0], r1 ; store op0
vst1.u8 {q7}, [r0], r1 ; store oq0
vst1.u8 {q8}, [r0], r1 ; store oq1
pop {pc}
ldmia sp!, {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_horizontal_edge_uv_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
ldr r12, [sp, #4] ; load thresh
stmdb sp!, {lr}
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
ldr r2, [sp, #8] ; load v ptr
vdup.u8 q2, r12 ; duplicate thresh
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r3], r1 ; p3
vld1.u8 {d8}, [r3], r1 ; p2
vld1.u8 {d10}, [r3], r1 ; p1
vld1.u8 {d12}, [r3], r1 ; p0
vld1.u8 {d14}, [r3], r1 ; q0
vld1.u8 {d16}, [r3], r1 ; q1
vld1.u8 {d18}, [r3], r1 ; q2
vld1.u8 {d20}, [r3] ; q3
vld1.u8 {d6}, [r3@64], r1 ; p3
vld1.u8 {d7}, [r12@64], r1 ; p3
vld1.u8 {d8}, [r3@64], r1 ; p2
vld1.u8 {d9}, [r12@64], r1 ; p2
vld1.u8 {d10}, [r3@64], r1 ; p1
vld1.u8 {d11}, [r12@64], r1 ; p1
vld1.u8 {d12}, [r3@64], r1 ; p0
vld1.u8 {d13}, [r12@64], r1 ; p0
vld1.u8 {d14}, [r3@64], r1 ; q0
vld1.u8 {d15}, [r12@64], r1 ; q0
vld1.u8 {d16}, [r3@64], r1 ; q1
vld1.u8 {d17}, [r12@64], r1 ; q1
vld1.u8 {d18}, [r3@64], r1 ; q2
vld1.u8 {d19}, [r12@64], r1 ; q2
vld1.u8 {d20}, [r3@64] ; q3
vld1.u8 {d21}, [r12@64] ; q3
ldr r3, [sp, #4] ; load thresh pointer
sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d7}, [r12], r1 ; p3
vld1.u8 {d9}, [r12], r1 ; p2
vld1.u8 {d11}, [r12], r1 ; p1
vld1.u8 {d13}, [r12], r1 ; p0
vld1.u8 {d15}, [r12], r1 ; q0
vld1.u8 {d17}, [r12], r1 ; q1
vld1.u8 {d19}, [r12], r1 ; q2
vld1.u8 {d21}, [r12] ; q3
vld1.s8 {d4[], d5[]}, [r3] ; thresh
bl vp8_loop_filter_neon
sub r0, r0, r1, lsl #1
sub r2, r2, r1, lsl #1
vst1.u8 {d10}, [r0@64], r1 ; store u op1
vst1.u8 {d11}, [r2@64], r1 ; store v op1
vst1.u8 {d12}, [r0@64], r1 ; store u op0
vst1.u8 {d13}, [r2@64], r1 ; store v op0
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
vst1.u8 {d15}, [r2@64], r1 ; store v oq0
vst1.u8 {d16}, [r0@64] ; store u oq1
vst1.u8 {d17}, [r2@64] ; store v oq1
vst1.u8 {d10}, [r0], r1 ; store u op1
vst1.u8 {d11}, [r2], r1 ; store v op1
vst1.u8 {d12}, [r0], r1 ; store u op0
vst1.u8 {d13}, [r2], r1 ; store v op0
vst1.u8 {d14}, [r0], r1 ; store u oq0
vst1.u8 {d15}, [r2], r1 ; store v oq0
vst1.u8 {d16}, [r0] ; store u oq1
vst1.u8 {d17}, [r2] ; store v oq1
pop {pc}
ldmia sp!, {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
@@ -112,38 +124,39 @@
; const signed char *limit,
; const signed char *thresh,
; int count)
; r0 unsigned char *src
; r1 int pitch
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; r0 unsigned char *src,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 int count (unused)
|vp8_loop_filter_vertical_edge_y_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
stmdb sp!, {lr}
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
sub r2, r0, #4 ; src ptr down by 4 columns
add r1, r1, r1
ldr r3, [sp, #4] ; load thresh
add r12, r2, r1, asr #1
sub r0, r0, #2 ; dst ptr
ldr r12, [sp, #4] ; load thresh pointer
vld1.u8 {d6}, [r2], r1
vld1.u8 {d8}, [r12], r1
vld1.u8 {d6}, [r2], r1 ; load first 8-line src data
vld1.u8 {d8}, [r2], r1
vld1.u8 {d10}, [r2], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d12}, [r2], r1
vld1.u8 {d14}, [r2], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d16}, [r2], r1
vld1.u8 {d18}, [r2], r1
vld1.u8 {d20}, [r12], r1
vld1.u8 {d20}, [r2], r1
vld1.s8 {d4[], d5[]}, [r12] ; thresh
vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
vld1.u8 {d9}, [r12], r1
vld1.u8 {d9}, [r2], r1
vld1.u8 {d11}, [r2], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d13}, [r2], r1
vld1.u8 {d15}, [r2], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d19}, [r2]
vld1.u8 {d21}, [r12]
vld1.u8 {d17}, [r2], r1
vld1.u8 {d19}, [r2], r1
vld1.u8 {d21}, [r2]
;transpose to 8x16 matrix
vtrn.32 q3, q7
@@ -151,8 +164,6 @@
vtrn.32 q5, q9
vtrn.32 q6, q10
vdup.u8 q2, r3 ; duplicate thresh
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
@@ -167,34 +178,28 @@
vswp d12, d11
vswp d16, d13
sub r0, r0, #2 ; dst ptr
vswp d14, d12
vswp d16, d15
add r12, r0, r1, asr #1
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r0], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r0], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0], r1
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r0]
pop {pc}
ldmia sp!, {pc}
ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
@@ -204,36 +209,38 @@
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_vertical_edge_uv_neon| PROC
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
stmdb sp!, {lr}
sub r12, r0, #4 ; move u pointer down by 4 columns
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
ldr r2, [sp, #8] ; load v ptr
vdup.u8 q1, r3 ; duplicate limit
sub r3, r2, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r12], r1 ;load u data
vld1.u8 {d7}, [r3], r1 ;load v data
vld1.u8 {d8}, [r12], r1
vld1.u8 {d9}, [r3], r1
vld1.u8 {d10}, [r12], r1
vld1.u8 {d11}, [r3], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d13}, [r3], r1
vld1.u8 {d14}, [r12], r1
vld1.u8 {d15}, [r3], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d17}, [r3], r1
vld1.u8 {d18}, [r12], r1
vld1.u8 {d19}, [r3], r1
vld1.u8 {d20}, [r12]
sub r3, r2, #4 ; move v pointer down by 4 columns
vld1.u8 {d7}, [r3], r1 ;load v data
vld1.u8 {d9}, [r3], r1
vld1.u8 {d11}, [r3], r1
vld1.u8 {d13}, [r3], r1
vld1.u8 {d15}, [r3], r1
vld1.u8 {d17}, [r3], r1
vld1.u8 {d19}, [r3], r1
vld1.u8 {d21}, [r3]
ldr r12, [sp, #4] ; load thresh
ldr r12, [sp, #4] ; load thresh pointer
;transpose to 8x16 matrix
vtrn.32 q3, q7
@@ -241,8 +248,6 @@
vtrn.32 q5, q9
vtrn.32 q6, q10
vdup.u8 q2, r12 ; duplicate thresh
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
@@ -253,16 +258,18 @@
vtrn.8 q7, q8
vtrn.8 q9, q10
vld1.s8 {d4[], d5[]}, [r12] ; thresh
bl vp8_loop_filter_neon
sub r0, r0, #2
sub r2, r2, #2
vswp d12, d11
vswp d16, d13
vswp d14, d12
vswp d16, d15
sub r0, r0, #2
sub r2, r2, #2
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
@@ -281,7 +288,7 @@
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
pop {pc}
ldmia sp!, {pc}
ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
; void vp8_loop_filter_neon();
@@ -301,6 +308,7 @@
; q9 q2
; q10 q3
|vp8_loop_filter_neon| PROC
ldr r12, _lf_coeff_
; vp8_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
@@ -309,44 +317,42 @@
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
vabd.u8 q4, q10, q9 ; abs(q3 - q2)
vabd.u8 q9, q6, q7 ; abs(p0 - q0)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q3, q3, q4
vmax.u8 q15, q11, q12
vabd.u8 q9, q6, q7 ; abs(p0 - q0)
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
vmax.u8 q15, q15, q3
vmov.u8 q10, #0x80 ; 0x80
vadd.u8 q0, q0, q0 ; flimit * 2
vadd.u8 q0, q0, q1 ; flimit * 2 + limit
vcge.u8 q15, q1, q15
vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
vshr.u8 q2, q2, #1 ; a = a / 2
vqadd.u8 q9, q9, q2 ; a = b + a
vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
vcge.u8 q15, q1, q15
vld1.u8 {q0}, [r12]!
; vp8_filter() function
; convert to signed
veor q7, q7, q10 ; qs0
vshr.u8 q2, q2, #1 ; a = a / 2
veor q6, q6, q10 ; ps0
veor q7, q7, q0 ; qs0
veor q6, q6, q0 ; ps0
veor q5, q5, q0 ; ps1
veor q8, q8, q0 ; qs1
veor q5, q5, q10 ; ps1
vqadd.u8 q9, q9, q2 ; a = b + a
veor q8, q8, q10 ; qs1
vmov.u8 q10, #3 ; #3
vld1.u8 {q10}, [r12]!
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q11, d15, d13
vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
vmovl.u8 q4, d20
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
@@ -361,7 +367,7 @@
vaddw.s8 q2, q2, d2
vaddw.s8 q11, q11, d3
vmov.u8 q9, #4 ; #4
vld1.u8 {q9}, [r12]!
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
@@ -373,20 +379,19 @@
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
; outer tap adjustments: ++vp8_filter >> 1
vrshr.s8 q1, q1, #1
vbic q1, q1, q14 ; vp8_filter &= ~hev
vmov.u8 q0, #0x80 ; 0x80
vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
veor q5, q13, q0 ; *op1 = u^0x80
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
veor q5, q13, q0 ; *op1 = u^0x80
veor q8, q12, q0 ; *oq1 = u^0x80
bx lr
@@ -394,4 +399,12 @@
;-----------------
_lf_coeff_
DCD lf_coeff
lf_coeff
DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101
END

View File

@@ -9,109 +9,107 @@
;
;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
EXPORT |vp8_loop_filter_bhs_neon|
EXPORT |vp8_loop_filter_mbhs_neon|
EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *s, PRESERVE
; r1 int p, PRESERVE
; q1 limit, PRESERVE
;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
;are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.
; r0 unsigned char *s,
; r1 int p, //pitch
; r2 const signed char *flimit,
; r3 const signed char *limit,
; stack(r4) const signed char *thresh,
; //stack(r5) int count --unused
|vp8_loop_filter_simple_horizontal_edge_neon| PROC
sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines
sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
vld1.u8 {q7}, [r0@128], r1 ; q0
vld1.u8 {q5}, [r3@128], r1 ; p0
vld1.u8 {q8}, [r0@128] ; q1
vld1.u8 {q6}, [r3@128] ; p1
ldr r12, _lfhy_coeff_
vld1.u8 {q5}, [r0], r1 ; p1
vld1.s8 {d2[], d3[]}, [r2] ; flimit
vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13
vld1.u8 {q6}, [r0], r1 ; p0
vld1.u8 {q0}, [r12]! ; 0x80
vld1.u8 {q7}, [r0], r1 ; q0
vld1.u8 {q10}, [r12]! ; 0x03
vld1.u8 {q8}, [r0] ; q1
;vp8_filter_mask() function
vabd.u8 q15, q6, q7 ; abs(p0 - q0)
vabd.u8 q14, q5, q8 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vmov.u8 q0, #0x80 ; 0x80
vmov.s16 q13, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
;vp8_filter() function
veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
vadd.u8 q1, q1, q1 ; flimit * 2
vadd.u8 q1, q1, q13 ; flimit * 2 + limit
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
;;;;;;;;;;
;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q3, d15, d13
vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
vmul.s16 q3, q3, q13
;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0)
vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0)
vadd.s16 q12, q3, q3
vmov.u8 q10, #0x03 ; 0x03
vmov.u8 q9, #0x04 ; 0x04
vld1.u8 {q9}, [r12]! ; 0x04
vadd.s16 q2, q2, q11
vadd.s16 q3, q3, q12
vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q3, q3, d9
;vqadd.s8 q4, q4, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d9, q3
;;;;;;;;;;;;;
vand q14, q4, q15 ; vp8_filter &= mask
vand q4, q4, q15 ; vp8_filter &= mask
vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vqadd.s8 q2, q4, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q4, q4, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q4, q3, #3 ; Filter1 >>= 3
vshr.s8 q4, q4, #3 ; Filter1 >>= 3
sub r0, r0, r1
sub r0, r0, r1, lsl #1
;calculate output
vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1)
add r3, r0, r1
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
vst1.u8 {q6}, [r3@128] ; store op0
vst1.u8 {q7}, [r0@128] ; store oq0
vst1.u8 {q6}, [r0] ; store op0
vst1.u8 {q7}, [r3] ; store oq0
bx lr
ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
;-----------------
|vp8_loop_filter_bhs_neon| PROC
push {r4, lr}
ldrb r3, [r2] ; load blim from mem
vdup.s8 q1, r3 ; duplicate blim
add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
bl vp8_loop_filter_simple_horizontal_edge_neon
; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
bl vp8_loop_filter_simple_horizontal_edge_neon
add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
pop {r4, lr}
b vp8_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp8_loop_filter_bhs_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_mbhs_neon| PROC
ldrb r3, [r2] ; load blim from mem
vdup.s8 q1, r3 ; duplicate mblim
b vp8_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp8_loop_filter_bhs_neon|
_lfhy_coeff_
DCD lfhy_coeff
lfhy_coeff
DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
END

View File

@@ -9,54 +9,60 @@
;
;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
EXPORT |vp8_loop_filter_bvs_neon|
EXPORT |vp8_loop_filter_mbvs_neon|
EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; r0 unsigned char *s, PRESERVE
; r1 int p, PRESERVE
; q1 limit, PRESERVE
;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
;are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.
; r0 unsigned char *s,
; r1 int p, //pitch
; r2 const signed char *flimit,
; r3 const signed char *limit,
; stack(r4) const signed char *thresh,
; //stack(r5) int count --unused
|vp8_loop_filter_simple_vertical_edge_neon| PROC
sub r0, r0, #2 ; move src pointer down by 2 columns
add r12, r1, r1
add r3, r0, r1
vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r1
vld1.s8 {d2[], d3[]}, [r2] ; flimit
vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13
vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1
ldr r12, _vlfy_coeff_
vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1
vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1
vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r0], r1
vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r1
vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vld1.u8 {q0}, [r12]! ; 0x80
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
vld1.u8 {q11}, [r12]! ; 0x03
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vld1.u8 {q12}, [r12]! ; 0x04
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
vswp d7, d10
vswp d12, d9
;vswp q4, q5 ; p1:q3, p0:q5, q0:q4, q1:q6
;vp8_filter_mask() function
;vp8_hevmask() function
sub r0, r0, r1, lsl #4
vabd.u8 q15, q5, q4 ; abs(p0 - q0)
vabd.u8 q14, q3, q6 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vmov.u8 q0, #0x80 ; 0x80
vmov.s16 q11, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
@@ -64,91 +70,87 @@
veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
vadd.u8 q1, q1, q1 ; flimit * 2
vadd.u8 q1, q1, q13 ; flimit * 2 + limit
vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
;vp8_filter() function
;;;;;;;;;;
;vqsub.s8 q2, q5, q4 ; ( qs0 - ps0)
vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
vsubl.s8 q13, d9, d11
vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
vqsub.s8 q1, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
vmul.s16 q13, q13, q11
;vmul.i8 q2, q2, q11 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
vadd.s16 q14, q13, q13
vadd.s16 q2, q2, q10
vadd.s16 q13, q13, q14
vmov.u8 q11, #0x03 ; 0x03
vmov.u8 q12, #0x04 ; 0x04
;vqadd.s8 q1, q1, q2
vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d3
vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d29
vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d29, q13
vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d3, q13
add r0, r0, #1
add r3, r0, r1
add r2, r0, r1
;;;;;;;;;;;
vand q14, q14, q15 ; vp8_filter &= mask
vand q1, q1, q15 ; vp8_filter &= mask
vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vqadd.s8 q2, q1, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q1, q1, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q14, q3, #3 ; Filter1 >>= 3
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
;calculate output
vqsub.s8 q10, q4, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1)
vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1)
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
add r12, r1, r1
veor q6, q11, q0 ; *op0 = u^0x80
add r3, r2, r1
vswp d13, d14
add r12, r3, r1
;store op1, op0, oq0, oq1
vst2.8 {d12[0], d13[0]}, [r0], r12
vst2.8 {d12[1], d13[1]}, [r3], r12
vst2.8 {d12[2], d13[2]}, [r0], r12
vst2.8 {d12[3], d13[3]}, [r3], r12
vst2.8 {d12[4], d13[4]}, [r0], r12
vst2.8 {d12[5], d13[5]}, [r3], r12
vst2.8 {d12[6], d13[6]}, [r0], r12
vst2.8 {d12[7], d13[7]}, [r3], r12
vst2.8 {d14[0], d15[0]}, [r0], r12
vst2.8 {d14[1], d15[1]}, [r3], r12
vst2.8 {d14[2], d15[2]}, [r0], r12
vst2.8 {d14[3], d15[3]}, [r3], r12
vst2.8 {d14[4], d15[4]}, [r0], r12
vst2.8 {d14[5], d15[5]}, [r3], r12
vst2.8 {d14[6], d15[6]}, [r0], r12
vst2.8 {d14[7], d15[7]}, [r3]
vst2.8 {d12[0], d13[0]}, [r0]
vst2.8 {d12[1], d13[1]}, [r2]
vst2.8 {d12[2], d13[2]}, [r3]
vst2.8 {d12[3], d13[3]}, [r12], r1
add r0, r12, r1
vst2.8 {d12[4], d13[4]}, [r12]
vst2.8 {d12[5], d13[5]}, [r0], r1
add r2, r0, r1
vst2.8 {d12[6], d13[6]}, [r0]
vst2.8 {d12[7], d13[7]}, [r2], r1
add r3, r2, r1
vst2.8 {d14[0], d15[0]}, [r2]
vst2.8 {d14[1], d15[1]}, [r3], r1
add r12, r3, r1
vst2.8 {d14[2], d15[2]}, [r3]
vst2.8 {d14[3], d15[3]}, [r12], r1
add r0, r12, r1
vst2.8 {d14[4], d15[4]}, [r12]
vst2.8 {d14[5], d15[5]}, [r0], r1
add r2, r0, r1
vst2.8 {d14[6], d15[6]}, [r0]
vst2.8 {d14[7], d15[7]}, [r2]
bx lr
ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
;-----------------
|vp8_loop_filter_bvs_neon| PROC
push {r4, lr}
ldrb r3, [r2] ; load blim from mem
mov r4, r0
add r0, r0, #4
vdup.s8 q1, r3 ; duplicate blim
bl vp8_loop_filter_simple_vertical_edge_neon
; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
add r0, r4, #8
bl vp8_loop_filter_simple_vertical_edge_neon
add r0, r4, #12
pop {r4, lr}
b vp8_loop_filter_simple_vertical_edge_neon
ENDP ;|vp8_loop_filter_bvs_neon|
_vlfy_coeff_
DCD vlfy_coeff
vlfy_coeff
DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_mbvs_neon| PROC
ldrb r3, [r2] ; load mblim from mem
vdup.s8 q1, r3 ; duplicate mblim
b vp8_loop_filter_simple_vertical_edge_neon
ENDP ;|vp8_loop_filter_bvs_neon|
END

View File

@@ -14,143 +14,155 @@
EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; flimit, limit, and thresh should be positive numbers.
; All 16 elements in these variables are equal.
; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh)
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; int count)
; r0 unsigned char *src,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 int count (unused)
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
push {lr}
add r1, r1, r1 ; double stride
ldr r12, [sp, #4] ; load thresh
sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
stmdb sp!, {lr}
sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines
ldr r12, [sp, #4] ; load thresh pointer
vld1.u8 {q3}, [r0@128], r1 ; p3
vld1.u8 {q4}, [r12@128], r1 ; p2
vld1.u8 {q5}, [r0@128], r1 ; p1
vld1.u8 {q6}, [r12@128], r1 ; p0
vld1.u8 {q7}, [r0@128], r1 ; q0
vld1.u8 {q8}, [r12@128], r1 ; q1
vld1.u8 {q9}, [r0@128], r1 ; q2
vld1.u8 {q10}, [r12@128], r1 ; q3
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #2
add r0, r12, r1, lsr #1
vst1.u8 {q4}, [r12@128],r1 ; store op2
vst1.u8 {q5}, [r0@128],r1 ; store op1
vst1.u8 {q6}, [r12@128], r1 ; store op0
vst1.u8 {q7}, [r0@128],r1 ; store oq0
vst1.u8 {q8}, [r12@128] ; store oq1
vst1.u8 {q9}, [r0@128] ; store oq2
pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r0@64], r1 ; p3
vld1.u8 {d7}, [r12@64], r1 ; p3
vld1.u8 {d8}, [r0@64], r1 ; p2
vld1.u8 {d9}, [r12@64], r1 ; p2
vld1.u8 {d10}, [r0@64], r1 ; p1
vld1.u8 {d11}, [r12@64], r1 ; p1
vld1.u8 {d12}, [r0@64], r1 ; p0
vld1.u8 {d13}, [r12@64], r1 ; p0
vld1.u8 {d14}, [r0@64], r1 ; q0
vld1.u8 {d15}, [r12@64], r1 ; q0
vld1.u8 {d16}, [r0@64], r1 ; q1
vld1.u8 {d17}, [r12@64], r1 ; q1
vld1.u8 {d18}, [r0@64], r1 ; q2
vld1.u8 {d19}, [r12@64], r1 ; q2
vld1.u8 {d20}, [r0@64], r1 ; q3
vld1.u8 {d21}, [r12@64], r1 ; q3
vld1.u8 {q3}, [r0], r1 ; p3
vld1.s8 {d2[], d3[]}, [r3] ; limit
vld1.u8 {q4}, [r0], r1 ; p2
vld1.s8 {d4[], d5[]}, [r12] ; thresh
vld1.u8 {q5}, [r0], r1 ; p1
vld1.u8 {q6}, [r0], r1 ; p0
vld1.u8 {q7}, [r0], r1 ; q0
vld1.u8 {q8}, [r0], r1 ; q1
vld1.u8 {q9}, [r0], r1 ; q2
vld1.u8 {q10}, [r0], r1 ; q3
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
sub r12, r12, r1, lsl #3
add r0, r0, r1
add r2, r0, r1
add r3, r2, r1
vst1.u8 {q4}, [r0] ; store op2
vst1.u8 {q5}, [r2] ; store op1
vst1.u8 {q6}, [r3], r1 ; store op0
add r12, r3, r1
vst1.u8 {q7}, [r3] ; store oq0
vst1.u8 {q8}, [r12], r1 ; store oq1
vst1.u8 {q9}, [r12] ; store oq2
ldmia sp!, {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
stmdb sp!, {lr}
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
vld1.s8 {d2[], d3[]}, [r3] ; limit
ldr r3, [sp, #8] ; load v ptr
ldr r12, [sp, #4] ; load thresh pointer
sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r0], r1 ; p3
vld1.u8 {d7}, [r3], r1 ; p3
vld1.u8 {d8}, [r0], r1 ; p2
vld1.u8 {d9}, [r3], r1 ; p2
vld1.u8 {d10}, [r0], r1 ; p1
vld1.u8 {d11}, [r3], r1 ; p1
vld1.u8 {d12}, [r0], r1 ; p0
vld1.u8 {d13}, [r3], r1 ; p0
vld1.u8 {d14}, [r0], r1 ; q0
vld1.u8 {d15}, [r3], r1 ; q0
vld1.u8 {d16}, [r0], r1 ; q1
vld1.u8 {d17}, [r3], r1 ; q1
vld1.u8 {d18}, [r0], r1 ; q2
vld1.u8 {d19}, [r3], r1 ; q2
vld1.u8 {d20}, [r0], r1 ; q3
vld1.u8 {d21}, [r3], r1 ; q3
vld1.s8 {d4[], d5[]}, [r12] ; thresh
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
sub r3, r3, r1, lsl #3
add r0, r0, r1
add r12, r12, r1
add r3, r3, r1
vst1.u8 {d8}, [r0@64], r1 ; store u op2
vst1.u8 {d9}, [r12@64], r1 ; store v op2
vst1.u8 {d10}, [r0@64], r1 ; store u op1
vst1.u8 {d11}, [r12@64], r1 ; store v op1
vst1.u8 {d12}, [r0@64], r1 ; store u op0
vst1.u8 {d13}, [r12@64], r1 ; store v op0
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
vst1.u8 {d15}, [r12@64], r1 ; store v oq0
vst1.u8 {d16}, [r0@64], r1 ; store u oq1
vst1.u8 {d17}, [r12@64], r1 ; store v oq1
vst1.u8 {d18}, [r0@64], r1 ; store u oq2
vst1.u8 {d19}, [r12@64], r1 ; store v oq2
vst1.u8 {d8}, [r0], r1 ; store u op2
vst1.u8 {d9}, [r3], r1 ; store v op2
vst1.u8 {d10}, [r0], r1 ; store u op1
vst1.u8 {d11}, [r3], r1 ; store v op1
vst1.u8 {d12}, [r0], r1 ; store u op0
vst1.u8 {d13}, [r3], r1 ; store v op0
vst1.u8 {d14}, [r0], r1 ; store u oq0
vst1.u8 {d15}, [r3], r1 ; store v oq0
vst1.u8 {d16}, [r0], r1 ; store u oq1
vst1.u8 {d17}, [r3], r1 ; store v oq1
vst1.u8 {d18}, [r0], r1 ; store u oq2
vst1.u8 {d19}, [r3], r1 ; store v oq2
pop {pc}
ldmia sp!, {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh)
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; int count)
; r0 unsigned char *src,
; r1 int pitch,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 int count (unused)
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
stmdb sp!, {lr}
sub r0, r0, #4 ; move src pointer down by 4 columns
vdup.s8 q2, r12 ; thresh
add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
ldr r12, [sp, #4] ; load thresh pointer
vld1.u8 {d8}, [r0], r1
vld1.u8 {d9}, [r12], r1
sub sp, sp, #32
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r12], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r12], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r12], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r12], r1
vld1.u8 {d7}, [r0], r1 ; load second 8-line src data
vld1.u8 {d9}, [r0], r1
vld1.u8 {d11}, [r0], r1
vld1.u8 {d13}, [r0], r1
vld1.u8 {d15}, [r0], r1
vld1.u8 {d17}, [r0], r1
vld1.u8 {d19}, [r0], r1
vld1.u8 {d21}, [r0], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
@@ -168,17 +180,29 @@
vtrn.8 q7, q8
vtrn.8 q9, q10
sub r0, r0, r1, lsl #3
vld1.s8 {d4[], d5[]}, [r12] ; thresh
vld1.s8 {d2[], d3[]}, [r3] ; limit
mov r12, sp
vst1.u8 {q3}, [r12]!
vst1.u8 {q10}, [r12]!
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #3
sub r0, r0, r1, lsl #4
add r2, r0, r1
add r3, r2, r1
vld1.u8 {q3}, [sp]!
vld1.u8 {q10}, [sp]!
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
add r12, r3, r1
vtrn.16 q3, q5
vtrn.16 q4, q6
@@ -191,30 +215,36 @@
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r12], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d9}, [r12], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d11}, [r12], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r12], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d15}, [r12], r1
vst1.8 {d6}, [r0]
vst1.8 {d8}, [r2]
vst1.8 {d10}, [r3]
vst1.8 {d12}, [r12], r1
add r0, r12, r1
vst1.8 {d14}, [r12]
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r12], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d19}, [r12], r1
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
add r2, r0, r1
vst1.8 {d18}, [r0]
vst1.8 {d20}, [r2], r1
add r3, r2, r1
vst1.8 {d7}, [r2]
vst1.8 {d9}, [r3], r1
add r12, r3, r1
vst1.8 {d11}, [r3]
vst1.8 {d13}, [r12], r1
add r0, r12, r1
vst1.8 {d15}, [r12]
vst1.8 {d17}, [r0], r1
add r2, r0, r1
vst1.8 {d19}, [r0]
vst1.8 {d21}, [r2]
pop {pc}
ldmia sp!, {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
@@ -223,29 +253,30 @@
; sp const signed char *thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, #4 ; move u pointer down by 4 columns
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
sub r12, r12, #4 ; move v pointer down by 4 columns
stmdb sp!, {lr}
sub r0, r0, #4 ; move src pointer down by 4 columns
vld1.s8 {d2[], d3[]}, [r3] ; limit
ldr r3, [sp, #8] ; load v ptr
ldr r12, [sp, #4] ; load thresh pointer
sub r3, r3, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r0], r1 ;load u data
vld1.u8 {d7}, [r12], r1 ;load v data
vld1.u8 {d7}, [r3], r1 ;load v data
vld1.u8 {d8}, [r0], r1
vld1.u8 {d9}, [r12], r1
vld1.u8 {d9}, [r3], r1
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r12], r1
vld1.u8 {d11}, [r3], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d13}, [r3], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r12], r1
vld1.u8 {d15}, [r3], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d17}, [r3], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r12], r1
vld1.u8 {d19}, [r3], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r12], r1
vld1.u8 {d21}, [r3], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
@@ -263,11 +294,19 @@
vtrn.8 q7, q8
vtrn.8 q9, q10
sub r0, r0, r1, lsl #3
sub sp, sp, #32
vld1.s8 {d4[], d5[]}, [r12] ; thresh
mov r12, sp
vst1.u8 {q3}, [r12]!
vst1.u8 {q10}, [r12]!
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #3
sub r0, r0, r1, lsl #3
sub r3, r3, r1, lsl #3
vld1.u8 {q3}, [sp]!
vld1.u8 {q10}, [sp]!
;transpose to 16x8 matrix
vtrn.32 q3, q7
@@ -287,23 +326,23 @@
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r12], r1
vst1.8 {d7}, [r3], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d9}, [r12], r1
vst1.8 {d9}, [r3], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d11}, [r12], r1
vst1.8 {d11}, [r3], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r12], r1
vst1.8 {d13}, [r3], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d15}, [r12], r1
vst1.8 {d15}, [r3], r1
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r12], r1
vst1.8 {d17}, [r3], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d19}, [r12], r1
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
vst1.8 {d19}, [r3], r1
vst1.8 {d20}, [r0], r1
vst1.8 {d21}, [r3], r1
pop {pc}
ldmia sp!, {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
; void vp8_mbloop_filter_neon()
@@ -311,33 +350,41 @@
; functions do the necessary load, transpose (if necessary), preserve (if
; necessary) and store.
; r0,r1 PRESERVE
; r2 mblimit
; r3 limit
; TODO:
; The vertical filter writes p3/q3 back out because two 4 element writes are
; much simpler than ordering and writing two 3 element sets (or three 2 elements
; sets, or whichever other combinations are possible).
; If we can preserve q3 and q10, the vertical filter will be able to avoid
; storing those values on the stack and reading them back after the filter.
; r0,r1 PRESERVE
; r2 flimit
; r3 PRESERVE
; q1 limit
; q2 thresh
; q3 p3 PRESERVE
; q3 p3
; q4 p2
; q5 p1
; q6 p0
; q7 q0
; q8 q1
; q9 q2
; q10 q3 PRESERVE
; q10 q3
|vp8_mbloop_filter_neon| PROC
ldr r12, _mblf_coeff_
; vp8_filter_mask
vabd.u8 q11, q3, q4 ; abs(p3 - p2)
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q1, q9, q8 ; abs(q2 - q1)
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
vabd.u8 q0, q10, q9 ; abs(q3 - q2)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q1, q1, q0
vmax.u8 q3, q3, q0
vmax.u8 q15, q11, q12
vabd.u8 q12, q6, q7 ; abs(p0 - q0)
@@ -345,53 +392,51 @@
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
vmax.u8 q15, q15, q1
vmax.u8 q15, q15, q3
vdup.u8 q1, r3 ; limit
vdup.u8 q2, r2 ; mblimit
vld1.s8 {d4[], d5[]}, [r2] ; flimit
vmov.u8 q0, #0x80 ; 0x80
vld1.u8 {q0}, [r12]!
vadd.u8 q2, q2, q2 ; flimit * 2
vadd.u8 q2, q2, q1 ; flimit * 2 + limit
vcge.u8 q15, q1, q15
vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
vmov.u16 q11, #3 ; #3
vshr.u8 q1, q1, #1 ; a = a / 2
vqadd.u8 q12, q12, q1 ; a = b + a
vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
; vp8_filter
; convert to signed
veor q7, q7, q0 ; qs0
vshr.u8 q1, q1, #1 ; a = a / 2
veor q6, q6, q0 ; ps0
veor q5, q5, q0 ; ps1
vqadd.u8 q12, q12, q1 ; a = b + a
veor q8, q8, q0 ; qs1
veor q4, q4, q0 ; ps2
veor q9, q9, q0 ; qs2
vorr q14, q13, q14 ; vp8_hevmask
vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
vsubl.s8 q2, d14, d12 ; qs0 - ps0
vsubl.s8 q13, d15, d13
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
vadd.s16 q10, q2, q2 ; 3 * (qs0 - ps0)
vadd.s16 q11, q13, q13
vand q15, q15, q12 ; vp8_filter_mask
vmul.i16 q13, q13, q11
vadd.s16 q2, q2, q10
vadd.s16 q13, q13, q11
vmov.u8 q12, #3 ; #3
vld1.u8 {q12}, [r12]! ; #3
vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d3
vmov.u8 q11, #4 ; #4
vld1.u8 {q11}, [r12]! ; #4
; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d2, q2
@@ -399,23 +444,27 @@
vand q1, q1, q15 ; vp8_filter &= mask
vmov.u16 q15, #63 ; #63
vld1.u8 {q15}, [r12]! ; #63
;
vand q13, q1, q14 ; Filter2 &= hev
vld1.u8 {d7}, [r12]! ; #9
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
vmov q0, q15
vld1.u8 {d6}, [r12]! ; #18
vshr.s8 q2, q2, #3 ; Filter1 >>= 3
vshr.s8 q13, q13, #3 ; Filter2 >>= 3
vmov q11, q15
vmov q10, q15
vmov q12, q15
vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
vld1.u8 {d5}, [r12]! ; #27
vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
vbic q1, q1, q14 ; vp8_filter &= ~hev
@@ -423,47 +472,49 @@
; roughly 1/7th difference across boundary
; roughly 2/7th difference across boundary
; roughly 3/7th difference across boundary
vmov.u8 d5, #9 ; #9
vmov.u8 d4, #18 ; #18
vmov q11, q15
vmov q13, q15
vmov q14, q15
vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
vmlal.s8 q11, d3, d5
vmov.u8 d5, #27 ; #27
vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
vmlal.s8 q13, d3, d4
vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
vmlal.s8 q10, d2, d7 ; Filter2 * 9
vmlal.s8 q11, d3, d7
vmlal.s8 q12, d2, d6 ; Filter2 * 18
vmlal.s8 q13, d3, d6
vmlal.s8 q14, d2, d5 ; Filter2 * 27
vmlal.s8 q15, d3, d5
vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
vqshrn.s16 d1, q11, #7
vqshrn.s16 d20, q10, #7 ; u = clamp((63 + Filter2 * 9)>>7)
vqshrn.s16 d21, q11, #7
vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
vqshrn.s16 d25, q13, #7
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
vqshrn.s16 d29, q15, #7
vmov.u8 q1, #0x80 ; 0x80
vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
vqsub.s8 q11, q9, q10 ; s = clamp(qs2 - u)
vqadd.s8 q10, q4, q10 ; s = clamp(ps2 + u)
vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
veor q9, q11, q1 ; *oq2 = s^0x80
veor q4, q0, q1 ; *op2 = s^0x80
veor q8, q13, q1 ; *oq1 = s^0x80
veor q5, q12, q1 ; *op2 = s^0x80
veor q7, q15, q1 ; *oq0 = s^0x80
veor q6, q14, q1 ; *op0 = s^0x80
veor q9, q11, q0 ; *oq2 = s^0x80
veor q4, q10, q0 ; *op2 = s^0x80
veor q8, q13, q0 ; *oq1 = s^0x80
veor q5, q12, q0 ; *op2 = s^0x80
veor q7, q15, q0 ; *oq0 = s^0x80
veor q6, q14, q0 ; *op0 = s^0x80
bx lr
ENDP ; |vp8_mbloop_filter_neon|
;-----------------
_mblf_coeff_
DCD mblf_coeff
mblf_coeff
DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080
DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303
DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404
DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f
DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212
DCD 0x1b1b1b1b, 0x1b1b1b1b
END

View File

@@ -31,7 +31,7 @@
;result of the multiplication that is needed in IDCT.
|vp8_short_idct4x4llm_neon| PROC
adr r12, idct_coeff
ldr r12, _idct_coeff_
vld1.16 {q1, q2}, [r0]
vld1.16 {d0}, [r12]
@@ -114,6 +114,8 @@
;-----------------
_idct_coeff_
DCD idct_coeff
idct_coeff
DCD 0x4e7b4e7b, 0x8a8c8a8c

View File

@@ -15,17 +15,6 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter16_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
@@ -44,7 +33,7 @@ filter16_coeff
|vp8_sixtap_predict16x16_neon| PROC
push {r4-r5, lr}
adr r12, filter16_coeff
ldr r12, _filter16_coeff_
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
@@ -487,4 +476,17 @@ secondpass_only_inner_loop_neon
ENDP
;-----------------
_filter16_coeff_
DCD filter16_coeff
filter16_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
END

View File

@@ -15,17 +15,6 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter4_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
@@ -36,7 +25,7 @@ filter4_coeff
|vp8_sixtap_predict_neon| PROC
push {r4, lr}
adr r12, filter4_coeff
ldr r12, _filter4_coeff_
ldr r4, [sp, #8] ;load parameters from stack
ldr lr, [sp, #12] ;load parameters from stack
@@ -419,4 +408,16 @@ secondpass_filter4x4_only
;-----------------
_filter4_coeff_
DCD filter4_coeff
filter4_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
END

View File

@@ -15,17 +15,6 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
@@ -36,7 +25,7 @@ filter8_coeff
|vp8_sixtap_predict8x4_neon| PROC
push {r4-r5, lr}
adr r12, filter8_coeff
ldr r12, _filter8_coeff_
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
@@ -470,4 +459,16 @@ secondpass_filter8x4_only
;-----------------
_filter8_coeff_
DCD filter8_coeff
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
END

View File

@@ -15,17 +15,6 @@
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
; r0 unsigned char *src_ptr,
; r1 int src_pixels_per_line,
; r2 int xoffset,
@@ -36,7 +25,7 @@ filter8_coeff
|vp8_sixtap_predict8x8_neon| PROC
push {r4-r5, lr}
adr r12, filter8_coeff
ldr r12, _filter8_coeff_
ldr r4, [sp, #12] ;load parameters from stack
ldr r5, [sp, #16] ;load parameters from stack
@@ -521,4 +510,16 @@ filt_blk2d_spo8x8_loop_neon
;-----------------
_filter8_coeff_
DCD filter8_coeff
filter8_coeff
DCD 0, 0, 128, 0, 0, 0, 0, 0
DCD 0, -6, 123, 12, -1, 0, 0, 0
DCD 2, -11, 108, 36, -8, 1, 0, 0
DCD 0, -9, 93, 50, -6, 0, 0, 0
DCD 3, -16, 77, 77, -16, 3, 0, 0
DCD 0, -6, 50, 93, -9, 0, 0, 0
DCD 1, -8, 36, 108, -11, 2, 0, 0
DCD 0, -1, 12, 123, -6, 0, 0, 0
END

View File

@@ -9,14 +9,27 @@
*/
#include "vpx_config.h"
#include "vpx/vpx_codec.h"
#include "vpx_ports/asm_offsets.h"
#include "vpx_ports/config.h"
#include <stddef.h>
#include "vpx_scale/yv12config.h"
BEGIN
#define ct_assert(name,cond) \
static void assert_##name(void) UNUSED;\
static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
/* vpx_scale */
#define DEFINE(sym, val) int sym = val;
/*
#define BLANK() asm volatile("\n->" : : )
*/
/*
* int main(void)
* {
*/
//vpx_scale
DEFINE(yv12_buffer_config_y_width, offsetof(YV12_BUFFER_CONFIG, y_width));
DEFINE(yv12_buffer_config_y_height, offsetof(YV12_BUFFER_CONFIG, y_height));
DEFINE(yv12_buffer_config_y_stride, offsetof(YV12_BUFFER_CONFIG, y_stride));
@@ -27,14 +40,10 @@ DEFINE(yv12_buffer_config_y_buffer, offsetof(YV12_BUFFER_CONFIG, y_b
DEFINE(yv12_buffer_config_u_buffer, offsetof(YV12_BUFFER_CONFIG, u_buffer));
DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_buffer));
DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
DEFINE(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS);
END
/* add asserts for any offset that is not supported by assembly code */
/* add asserts for any size that is not supported by assembly code */
#if HAVE_ARMV7
/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
#endif
//add asserts for any offset that is not supported by assembly code
//add asserts for any size that is not supported by assembly code
/*
* return 0;
* }
*/

View File

@@ -14,12 +14,17 @@
void vpx_log(const char *format, ...);
#include "vpx_ports/config.h"
#include "vpx_scale/yv12config.h"
#include "../../vpx_ports/config.h"
#include "../../vpx_scale/yv12config.h"
#include "mv.h"
#include "treecoder.h"
#include "subpixel.h"
#include "vpx_ports/mem.h"
#include "../../vpx_ports/mem.h"
#include "../../vpx_config.h"
#if CONFIG_OPENCL
#include "opencl/vp8_opencl.h"
#endif
#define TRUE 1
#define FALSE 0
@@ -73,19 +78,19 @@ typedef enum
typedef enum
{
DC_PRED, /* average of above and left pixels */
V_PRED, /* vertical prediction */
H_PRED, /* horizontal prediction */
TM_PRED, /* Truemotion prediction */
B_PRED, /* block based prediction, each block has its own prediction mode */
DC_PRED = 0, /* average of above and left pixels */
V_PRED = 1, /* vertical prediction */
H_PRED = 2, /* horizontal prediction */
TM_PRED = 3, /* Truemotion prediction */
B_PRED = 4, /* block based prediction, each block has its own prediction mode */
NEARESTMV,
NEARMV,
ZEROMV,
NEWMV,
SPLITMV,
NEARESTMV = 5,
NEARMV = 6,
ZEROMV = 7,
NEWMV = 8,
SPLITMV = 9,
MB_MODE_COUNT
MB_MODE_COUNT = 10
} MB_PREDICTION_MODE;
/* Macroblock level features */
@@ -137,11 +142,16 @@ typedef enum
modes for the Y blocks to the left and above us; for interframes, there
is a single probability table. */
union b_mode_info
typedef struct
{
B_PREDICTION_MODE as_mode;
int_mv mv;
};
B_PREDICTION_MODE mode;
union
{
int as_int;
MV as_mv;
} mv;
} B_MODE_INFO;
typedef enum
{
@@ -156,43 +166,79 @@ typedef struct
{
MB_PREDICTION_MODE mode, uv_mode;
MV_REFERENCE_FRAME ref_frame;
int_mv mv;
union
{
int as_int;
MV as_mv;
} mv;
unsigned char partitioning;
unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
unsigned char dc_diff;
unsigned char need_to_clamp_mvs;
unsigned char segment_id; /* Which set of segmentation parameters should be used for this MB */
unsigned char force_no_skip; /* encoder only */
} MB_MODE_INFO;
typedef struct
{
MB_MODE_INFO mbmi;
union b_mode_info bmi[16];
B_MODE_INFO bmi[16];
} MODE_INFO;
typedef struct
{
short *qcoeff;
short *dqcoeff;
unsigned char *predictor;
short *diff;
short *qcoeff_base;
int qcoeff_offset;
short *dqcoeff_base;
int dqcoeff_offset;
unsigned char *predictor_base;
int predictor_offset;
short *diff_base;
int diff_offset;
short *dequant;
#if CONFIG_OPENCL
cl_command_queue cl_commands; //pointer to macroblock CL command queue
cl_mem cl_diff_mem;
cl_mem cl_predictor_mem;
cl_mem cl_qcoeff_mem;
cl_mem cl_dqcoeff_mem;
cl_mem cl_eobs_mem;
cl_mem cl_dequant_mem; //Block-specific, not shared
cl_bool sixtap_filter; //Subpixel Prediction type (true=sixtap, false=bilinear)
#endif
/* 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries */
unsigned char **base_pre;
unsigned char **base_pre; //previous frame, same Macroblock, base pointer
int pre;
int pre_stride;
unsigned char **base_dst;
unsigned char **base_dst; //destination base pointer
int dst;
int dst_stride;
int eob;
int eob; //only used in encoder? Decoder uses MBD.eobs
char *eobs_base; //beginning of MB.eobs
B_MODE_INFO bmi;
union b_mode_info bmi;
} BLOCKD;
typedef struct MacroBlockD
typedef struct
{
DECLARE_ALIGNED(16, short, diff[400]); /* from idct diff */
DECLARE_ALIGNED(16, unsigned char, predictor[384]);
@@ -200,11 +246,22 @@ typedef struct MacroBlockD
DECLARE_ALIGNED(16, short, dqcoeff[400]);
DECLARE_ALIGNED(16, char, eobs[25]);
#if CONFIG_OPENCL
cl_command_queue cl_commands; //Each macroblock gets its own command queue.
cl_mem cl_diff_mem;
cl_mem cl_predictor_mem;
cl_mem cl_qcoeff_mem;
cl_mem cl_dqcoeff_mem;
cl_mem cl_eobs_mem;
cl_bool sixtap_filter;
#endif
/* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
BLOCKD block[25];
YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
YV12_BUFFER_CONFIG dst;
YV12_BUFFER_CONFIG dst; /* Destination buffer for current frame */
MODE_INFO *mode_info_context;
int mode_info_stride;
@@ -252,11 +309,9 @@ typedef struct MacroBlockD
int mb_to_top_edge;
int mb_to_bottom_edge;
int ref_frame_cost[MAX_REF_FRAMES];
unsigned int frames_since_golden;
unsigned int frames_till_alt_ref_frame;
vp8_subpix_fn_t subpixel_predict;
vp8_subpix_fn_t subpixel_predict8x4;
vp8_subpix_fn_t subpixel_predict8x8;
@@ -266,14 +321,6 @@ typedef struct MacroBlockD
int corrupted;
#if ARCH_X86 || ARCH_X86_64
/* This is an intermediate buffer currently used in sub-pixel motion search
* to keep a copy of the reference area. This buffer can be used for other
* purpose.
*/
DECLARE_ALIGNED(32, unsigned char, y_buf[22*32]);
#endif
#if CONFIG_RUNTIME_CPU_DETECT
struct VP8_COMMON_RTCD *rtcd;
#endif
@@ -283,20 +330,4 @@ typedef struct MacroBlockD
extern void vp8_build_block_doffsets(MACROBLOCKD *x);
extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
static void update_blockd_bmi(MACROBLOCKD *xd)
{
int i;
int is_4x4;
is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
(xd->mode_info_context->mbmi.mode == B_PRED);
if (is_4x4)
{
for (i = 0; i < 16; i++)
{
xd->block[i].bmi = xd->mode_info_context->bmi[i];
}
}
}
#endif /* __INC_BLOCKD_H */

View File

@@ -12,7 +12,7 @@
/* Update probabilities for the nodes in the token entropy tree.
Generated file included by entropy.c */
const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES] =
const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] =
{
{
{

View File

@@ -97,7 +97,7 @@ void vp8_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, int f
bindex = (b_row & 3) * 4 + (b_col & 3);
if (mi[mb_index].mbmi.mode == B_PRED)
fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].as_mode);
fprintf(mvs, "%2d ", mi[mb_index].bmi[bindex].mode);
else
fprintf(mvs, "xx ");

View File

@@ -1,225 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "defaultcoefcounts.h"
/* Generated file, included by entropy.c */
const unsigned int vp8_default_coef_counts[BLOCK_TYPES]
[COEF_BANDS]
[PREV_COEF_CONTEXTS]
[MAX_ENTROPY_TOKENS] =
{
{
/* Block Type ( 0 ) */
{
/* Coeff Band ( 0 ) */
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
},
{
/* Coeff Band ( 1 ) */
{30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
{26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
{10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
},
{
/* Coeff Band ( 2 ) */
{25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
{9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
{1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
},
{
/* Coeff Band ( 3 ) */
{26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
{8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
{ 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
},
{
/* Coeff Band ( 4 ) */
{10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
{2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
{ 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
},
{
/* Coeff Band ( 5 ) */
{10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
{2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
{ 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
},
{
/* Coeff Band ( 6 ) */
{40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
{6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
{ 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
},
{
/* Coeff Band ( 7 ) */
{ 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
},
},
{
/* Block Type ( 1 ) */
{
/* Coeff Band ( 0 ) */
{3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
{8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
{9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
},
{
/* Coeff Band ( 1 ) */
{12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
{11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
{7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
},
{
/* Coeff Band ( 2 ) */
{15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,},
{7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,},
{1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,},
},
{
/* Coeff Band ( 3 ) */
{19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,},
{9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,},
{1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,},
},
{
/* Coeff Band ( 4 ) */
{12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,},
{4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,},
{ 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,},
},
{
/* Coeff Band ( 5 ) */
{12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,},
{4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,},
{ 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,},
},
{
/* Coeff Band ( 6 ) */
{61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,},
{15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,},
{ 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,},
},
{
/* Coeff Band ( 7 ) */
{ 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,},
{ 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,},
},
},
{
/* Block Type ( 2 ) */
{
/* Coeff Band ( 0 ) */
{ 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,},
{1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,},
{1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,},
},
{
/* Coeff Band ( 1 ) */
{1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,},
{1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,},
{1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,},
},
{
/* Coeff Band ( 2 ) */
{ 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,},
{ 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,},
{ 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,},
},
{
/* Coeff Band ( 3 ) */
{ 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,},
{ 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,},
},
{
/* Coeff Band ( 4 ) */
{ 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,},
{ 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,},
},
{
/* Coeff Band ( 5 ) */
{ 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,},
{ 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,},
},
{
/* Coeff Band ( 6 ) */
{ 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,},
{ 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,},
},
{
/* Coeff Band ( 7 ) */
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
},
},
{
/* Block Type ( 3 ) */
{
/* Coeff Band ( 0 ) */
{2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,},
{8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,},
{11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,},
},
{
/* Coeff Band ( 1 ) */
{9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,},
{12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,},
{10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,},
},
{
/* Coeff Band ( 2 ) */
{6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,},
{6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,},
{3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,},
},
{
/* Coeff Band ( 3 ) */
{11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,},
{9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,},
{4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,},
},
{
/* Coeff Band ( 4 ) */
{4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,},
{3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,},
{1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,},
},
{
/* Coeff Band ( 5 ) */
{8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,},
{3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,},
{1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,},
},
{
/* Coeff Band ( 6 ) */
{27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,},
{5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,},
{1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,},
},
{
/* Coeff Band ( 7 ) */
{ 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,},
{ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,},
},
},
};

View File

@@ -8,14 +8,214 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef __DEFAULTCOEFCOUNTS_H
#define __DEFAULTCOEFCOUNTS_H
#include "entropy.h"
/* Generated file, included by entropy.c */
extern const unsigned int vp8_default_coef_counts[BLOCK_TYPES]
[COEF_BANDS]
[PREV_COEF_CONTEXTS]
[MAX_ENTROPY_TOKENS];
static const unsigned int default_coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens] =
{
#endif //__DEFAULTCOEFCOUNTS_H
{
/* Block Type ( 0 ) */
{
/* Coeff Band ( 0 ) */
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
},
{
/* Coeff Band ( 1 ) */
{30190, 26544, 225, 24, 4, 0, 0, 0, 0, 0, 0, 4171593,},
{26846, 25157, 1241, 130, 26, 6, 1, 0, 0, 0, 0, 149987,},
{10484, 9538, 1006, 160, 36, 18, 0, 0, 0, 0, 0, 15104,},
},
{
/* Coeff Band ( 2 ) */
{25842, 40456, 1126, 83, 11, 2, 0, 0, 0, 0, 0, 0,},
{9338, 8010, 512, 73, 7, 3, 2, 0, 0, 0, 0, 43294,},
{1047, 751, 149, 31, 13, 6, 1, 0, 0, 0, 0, 879,},
},
{
/* Coeff Band ( 3 ) */
{26136, 9826, 252, 13, 0, 0, 0, 0, 0, 0, 0, 0,},
{8134, 5574, 191, 14, 2, 0, 0, 0, 0, 0, 0, 35302,},
{ 605, 677, 116, 9, 1, 0, 0, 0, 0, 0, 0, 611,},
},
{
/* Coeff Band ( 4 ) */
{10263, 15463, 283, 17, 0, 0, 0, 0, 0, 0, 0, 0,},
{2773, 2191, 128, 9, 2, 2, 0, 0, 0, 0, 0, 10073,},
{ 134, 125, 32, 4, 0, 2, 0, 0, 0, 0, 0, 50,},
},
{
/* Coeff Band ( 5 ) */
{10483, 2663, 23, 1, 0, 0, 0, 0, 0, 0, 0, 0,},
{2137, 1251, 27, 1, 1, 0, 0, 0, 0, 0, 0, 14362,},
{ 116, 156, 14, 2, 1, 0, 0, 0, 0, 0, 0, 190,},
},
{
/* Coeff Band ( 6 ) */
{40977, 27614, 412, 28, 0, 0, 0, 0, 0, 0, 0, 0,},
{6113, 5213, 261, 22, 3, 0, 0, 0, 0, 0, 0, 26164,},
{ 382, 312, 50, 14, 2, 0, 0, 0, 0, 0, 0, 345,},
},
{
/* Coeff Band ( 7 ) */
{ 0, 26, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 319,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8,},
},
},
{
/* Block Type ( 1 ) */
{
/* Coeff Band ( 0 ) */
{3268, 19382, 1043, 250, 93, 82, 49, 26, 17, 8, 25, 82289,},
{8758, 32110, 5436, 1832, 827, 668, 420, 153, 24, 0, 3, 52914,},
{9337, 23725, 8487, 3954, 2107, 1836, 1069, 399, 59, 0, 0, 18620,},
},
{
/* Coeff Band ( 1 ) */
{12419, 8420, 452, 62, 9, 1, 0, 0, 0, 0, 0, 0,},
{11715, 8705, 693, 92, 15, 7, 2, 0, 0, 0, 0, 53988,},
{7603, 8585, 2306, 778, 270, 145, 39, 5, 0, 0, 0, 9136,},
},
{
/* Coeff Band ( 2 ) */
{15938, 14335, 1207, 184, 55, 13, 4, 1, 0, 0, 0, 0,},
{7415, 6829, 1138, 244, 71, 26, 7, 0, 0, 0, 0, 9980,},
{1580, 1824, 655, 241, 89, 46, 10, 2, 0, 0, 0, 429,},
},
{
/* Coeff Band ( 3 ) */
{19453, 5260, 201, 19, 0, 0, 0, 0, 0, 0, 0, 0,},
{9173, 3758, 213, 22, 1, 1, 0, 0, 0, 0, 0, 9820,},
{1689, 1277, 276, 51, 17, 4, 0, 0, 0, 0, 0, 679,},
},
{
/* Coeff Band ( 4 ) */
{12076, 10667, 620, 85, 19, 9, 5, 0, 0, 0, 0, 0,},
{4665, 3625, 423, 55, 19, 9, 0, 0, 0, 0, 0, 5127,},
{ 415, 440, 143, 34, 20, 7, 2, 0, 0, 0, 0, 101,},
},
{
/* Coeff Band ( 5 ) */
{12183, 4846, 115, 11, 1, 0, 0, 0, 0, 0, 0, 0,},
{4226, 3149, 177, 21, 2, 0, 0, 0, 0, 0, 0, 7157,},
{ 375, 621, 189, 51, 11, 4, 1, 0, 0, 0, 0, 198,},
},
{
/* Coeff Band ( 6 ) */
{61658, 37743, 1203, 94, 10, 3, 0, 0, 0, 0, 0, 0,},
{15514, 11563, 903, 111, 14, 5, 0, 0, 0, 0, 0, 25195,},
{ 929, 1077, 291, 78, 14, 7, 1, 0, 0, 0, 0, 507,},
},
{
/* Coeff Band ( 7 ) */
{ 0, 990, 15, 3, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 412, 13, 0, 0, 0, 0, 0, 0, 0, 0, 1641,},
{ 0, 18, 7, 1, 0, 0, 0, 0, 0, 0, 0, 30,},
},
},
{
/* Block Type ( 2 ) */
{
/* Coeff Band ( 0 ) */
{ 953, 24519, 628, 120, 28, 12, 4, 0, 0, 0, 0, 2248798,},
{1525, 25654, 2647, 617, 239, 143, 42, 5, 0, 0, 0, 66837,},
{1180, 11011, 3001, 1237, 532, 448, 239, 54, 5, 0, 0, 7122,},
},
{
/* Coeff Band ( 1 ) */
{1356, 2220, 67, 10, 4, 1, 0, 0, 0, 0, 0, 0,},
{1450, 2544, 102, 18, 4, 3, 0, 0, 0, 0, 0, 57063,},
{1182, 2110, 470, 130, 41, 21, 0, 0, 0, 0, 0, 6047,},
},
{
/* Coeff Band ( 2 ) */
{ 370, 3378, 200, 30, 5, 4, 1, 0, 0, 0, 0, 0,},
{ 293, 1006, 131, 29, 11, 0, 0, 0, 0, 0, 0, 5404,},
{ 114, 387, 98, 23, 4, 8, 1, 0, 0, 0, 0, 236,},
},
{
/* Coeff Band ( 3 ) */
{ 579, 194, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 395, 213, 5, 1, 0, 0, 0, 0, 0, 0, 0, 4157,},
{ 119, 122, 4, 0, 0, 0, 0, 0, 0, 0, 0, 300,},
},
{
/* Coeff Band ( 4 ) */
{ 38, 557, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 21, 114, 12, 1, 0, 0, 0, 0, 0, 0, 0, 427,},
{ 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7,},
},
{
/* Coeff Band ( 5 ) */
{ 52, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 18, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 652,},
{ 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30,},
},
{
/* Coeff Band ( 6 ) */
{ 640, 569, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 25, 77, 2, 0, 0, 0, 0, 0, 0, 0, 0, 517,},
{ 4, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,},
},
{
/* Coeff Band ( 7 ) */
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
},
},
{
/* Block Type ( 3 ) */
{
/* Coeff Band ( 0 ) */
{2506, 20161, 2707, 767, 261, 178, 107, 30, 14, 3, 0, 100694,},
{8806, 36478, 8817, 3268, 1280, 850, 401, 114, 42, 0, 0, 58572,},
{11003, 27214, 11798, 5716, 2482, 2072, 1048, 175, 32, 0, 0, 19284,},
},
{
/* Coeff Band ( 1 ) */
{9738, 11313, 959, 205, 70, 18, 11, 1, 0, 0, 0, 0,},
{12628, 15085, 1507, 273, 52, 19, 9, 0, 0, 0, 0, 54280,},
{10701, 15846, 5561, 1926, 813, 570, 249, 36, 0, 0, 0, 6460,},
},
{
/* Coeff Band ( 2 ) */
{6781, 22539, 2784, 634, 182, 123, 20, 4, 0, 0, 0, 0,},
{6263, 11544, 2649, 790, 259, 168, 27, 5, 0, 0, 0, 20539,},
{3109, 4075, 2031, 896, 457, 386, 158, 29, 0, 0, 0, 1138,},
},
{
/* Coeff Band ( 3 ) */
{11515, 4079, 465, 73, 5, 14, 2, 0, 0, 0, 0, 0,},
{9361, 5834, 650, 96, 24, 8, 4, 0, 0, 0, 0, 22181,},
{4343, 3974, 1360, 415, 132, 96, 14, 1, 0, 0, 0, 1267,},
},
{
/* Coeff Band ( 4 ) */
{4787, 9297, 823, 168, 44, 12, 4, 0, 0, 0, 0, 0,},
{3619, 4472, 719, 198, 60, 31, 3, 0, 0, 0, 0, 8401,},
{1157, 1175, 483, 182, 88, 31, 8, 0, 0, 0, 0, 268,},
},
{
/* Coeff Band ( 5 ) */
{8299, 1226, 32, 5, 1, 0, 0, 0, 0, 0, 0, 0,},
{3502, 1568, 57, 4, 1, 1, 0, 0, 0, 0, 0, 9811,},
{1055, 1070, 166, 29, 6, 1, 0, 0, 0, 0, 0, 527,},
},
{
/* Coeff Band ( 6 ) */
{27414, 27927, 1989, 347, 69, 26, 0, 0, 0, 0, 0, 0,},
{5876, 10074, 1574, 341, 91, 24, 4, 0, 0, 0, 0, 21954,},
{1571, 2171, 778, 324, 124, 65, 16, 0, 0, 0, 0, 979,},
},
{
/* Coeff Band ( 7 ) */
{ 0, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,},
{ 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 459,},
{ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13,},
},
},
};

View File

@@ -26,32 +26,8 @@ typedef vp8_prob Prob;
#include "coefupdateprobs.h"
DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) =
{
0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
DECLARE_ALIGNED(16, cuchar, vp8_coef_bands[16]) =
{ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
DECLARE_ALIGNED(16, cuchar, vp8_prev_token_class[MAX_ENTROPY_TOKENS]) =
{ 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
DECLARE_ALIGNED(16, cuchar, vp8_coef_bands[16]) = { 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7};
DECLARE_ALIGNED(16, cuchar, vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0};
DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) =
{
0, 1, 4, 8,
@@ -89,7 +65,7 @@ const vp8_tree_index vp8_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */
-DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
};
struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
struct vp8_token_struct vp8_coef_encodings[vp8_coef_tokens];
/* Trees for extra bits. Probabilities are constant and
do not depend on previously encoded bits */
@@ -169,12 +145,10 @@ void vp8_default_coef_probs(VP8_COMMON *pc)
do
{
unsigned int branch_ct [ENTROPY_NODES] [2];
unsigned int branch_ct [vp8_coef_tokens-1] [2];
vp8_tree_probs_from_distribution(
MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
pc->fc.coef_probs[h][i][k],
branch_ct,
vp8_default_coef_counts[h][i][k],
vp8_coef_tokens, vp8_coef_encodings, vp8_coef_tree,
pc->fc.coef_probs [h][i][k], branch_ct, default_coef_counts [h][i][k],
256, 1);
}

View File

@@ -30,12 +30,13 @@
#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */
#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
#define MAX_ENTROPY_TOKENS 12
#define vp8_coef_tokens 12
#define MAX_ENTROPY_TOKENS vp8_coef_tokens
#define ENTROPY_NODES 11
extern const vp8_tree_index vp8_coef_tree[];
extern struct vp8_token_struct vp8_coef_encodings[MAX_ENTROPY_TOKENS];
extern struct vp8_token_struct vp8_coef_encodings[vp8_coef_tokens];
typedef struct
{
@@ -84,9 +85,9 @@ extern DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]);
/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
# define PREV_COEF_CONTEXTS 3
extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]);
extern DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[vp8_coef_tokens]);
extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
extern const vp8_prob vp8_coef_update_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1];
struct VP8Common;

View File

@@ -33,11 +33,11 @@ typedef enum
SUBMVREF_LEFT_ABOVE_ZED
} sumvfref_t;
int vp8_mv_cont(const int_mv *l, const int_mv *a)
int vp8_mv_cont(const MV *l, const MV *a)
{
int lez = (l->as_int == 0);
int aez = (a->as_int == 0);
int lea = (l->as_int == a->as_int);
int lez = (l->row == 0 && l->col == 0);
int aez = (a->row == 0 && a->col == 0);
int lea = (l->row == a->row && l->col == a->col);
if (lea && lez)
return SUBMVREF_LEFT_ABOVE_ZED;

View File

@@ -25,7 +25,7 @@ extern const int vp8_mbsplit_count [VP8_NUMMBSPLITS]; /* # of subsets */
extern const vp8_prob vp8_mbsplit_probs [VP8_NUMMBSPLITS-1];
extern int vp8_mv_cont(const int_mv *l, const int_mv *a);
extern int vp8_mv_cont(const MV *l, const MV *a);
#define SUBMVREF_COUNT 5
extern const vp8_prob vp8_sub_mv_ref_prob2 [SUBMVREF_COUNT][VP8_SUBMVREFS-1];

View File

@@ -85,10 +85,10 @@ void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src,
src->y_height, src->y_width,
et, el, eb, er);
et = dst->border >> 1;
el = dst->border >> 1;
eb = (dst->border >> 1) + dst->uv_height - src->uv_height;
er = (dst->border >> 1) + dst->uv_width - src->uv_width;
et = (et + 1) >> 1;
el = (el + 1) >> 1;
eb = (eb + 1) >> 1;
er = (er + 1) >> 1;
copy_and_extend_plane(src->u_buffer, src->uv_stride,
dst->u_buffer, dst->uv_stride,

View File

@@ -10,6 +10,29 @@
#include <stdlib.h>
#include <stdio.h>
#define REGISTER_FILTER 1
#define CLAMP(x,min,max) if (x < min) x = min; else if ( x > max ) x = max;
#if REGISTER_FILTER
#define FILTER0 filter0
#define FILTER1 filter1
#define FILTER2 filter2
#define FILTER3 filter3
#define FILTER4 filter4
#define FILTER5 filter5
#else
#define FILTER0 vp8_filter[0]
#define FILTER1 vp8_filter[1]
#define FILTER2 vp8_filter[2]
#define FILTER3 vp8_filter[3]
#define FILTER4 vp8_filter[4]
#define FILTER5 vp8_filter[5]
#endif
#define SRC_INCREMENT src_increment
#include "filter.h"
#include "vpx_ports/mem.h"
@@ -27,7 +50,6 @@ DECLARE_ALIGNED(16, const short, vp8_bilinear_filters[8][2]) =
DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]) =
{
{ 0, 0, 128, 0, 0, 0 }, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
{ 0, -6, 123, 12, -1, 0 },
{ 2, -11, 108, 36, -8, 1 }, /* New 1/4 pel 6 tap filter */
@@ -49,35 +71,45 @@ static void filter_block2d_first_pass
const short *vp8_filter
)
{
unsigned int i, j;
int Temp;
#if REGISTER_FILTER
short filter0 = vp8_filter[0];
short filter1 = vp8_filter[1];
short filter2 = vp8_filter[2];
short filter3 = vp8_filter[3];
short filter4 = vp8_filter[4];
short filter5 = vp8_filter[5];
#endif
int ps2 = 2*(int)pixel_step;
int ps3 = 3*(int)pixel_step;
unsigned int src_increment = src_pixels_per_line - output_width;
for (i = 0; i < output_height; i++)
{
for (j = 0; j < output_width; j++)
{
Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
((int)src_ptr[0] * vp8_filter[2]) +
((int)src_ptr[pixel_step] * vp8_filter[3]) +
((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
Temp = ((int)src_ptr[-1*ps2] * FILTER0);
Temp += ((int)src_ptr[-1*(int)pixel_step] * FILTER1) +
((int)src_ptr[0] * FILTER2) +
((int)src_ptr[pixel_step] * FILTER3) +
((int)src_ptr[ps2] * FILTER4) +
((int)src_ptr[ps3] * FILTER5) +
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
/* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
if (Temp < 0)
Temp = 0;
else if (Temp > 255)
Temp = 255;
CLAMP(Temp, 0, 255);
output_ptr[j] = Temp;
src_ptr++;
}
/* Next row... */
src_ptr += src_pixels_per_line - output_width;
src_ptr += SRC_INCREMENT;
output_ptr += output_width;
}
}
@@ -97,33 +129,42 @@ static void filter_block2d_second_pass
unsigned int i, j;
int Temp;
#if REGISTER_FILTER
short filter0 = vp8_filter[0];
short filter1 = vp8_filter[1];
short filter2 = vp8_filter[2];
short filter3 = vp8_filter[3];
short filter4 = vp8_filter[4];
short filter5 = vp8_filter[5];
#endif
int ps2 = ((int)pixel_step) << 1;
int ps3 = ps2 + (int)pixel_step;
unsigned int src_increment = src_pixels_per_line - output_width;
for (i = 0; i < output_height; i++)
{
for (j = 0; j < output_width; j++)
{
/* Apply filter */
Temp = ((int)src_ptr[-2 * (int)pixel_step] * vp8_filter[0]) +
((int)src_ptr[-1 * (int)pixel_step] * vp8_filter[1]) +
((int)src_ptr[0] * vp8_filter[2]) +
((int)src_ptr[pixel_step] * vp8_filter[3]) +
((int)src_ptr[2*pixel_step] * vp8_filter[4]) +
((int)src_ptr[3*pixel_step] * vp8_filter[5]) +
Temp = ((int)src_ptr[-1*ps2] * FILTER0) +
((int)src_ptr[-1*(int)pixel_step] * FILTER1) +
((int)src_ptr[0] * FILTER2) +
((int)src_ptr[pixel_step] * FILTER3) +
((int)src_ptr[ps2] * FILTER4) +
((int)src_ptr[ps3] * FILTER5) +
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
/* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
if (Temp < 0)
Temp = 0;
else if (Temp > 255)
Temp = 255;
CLAMP(Temp, 0, 255);
output_ptr[j] = (unsigned char)Temp;
src_ptr++;
}
/* Start next row */
src_ptr += src_pixels_per_line - output_width;
src_ptr += src_increment;
output_ptr += output_pitch;
}
}
@@ -167,6 +208,7 @@ void vp8_sixtap_predict_c
filter_block2d(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter);
}
void vp8_sixtap_predict8x8_c
(
unsigned char *src_ptr,

View File

@@ -25,9 +25,9 @@ void vp8_find_near_mvs
(
MACROBLOCKD *xd,
const MODE_INFO *here,
int_mv *nearest,
int_mv *nearby,
int_mv *best_mv,
MV *nearest,
MV *nearby,
MV *best_mv,
int cnt[4],
int refframe,
int *ref_frame_sign_bias
@@ -131,14 +131,13 @@ void vp8_find_near_mvs
near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST];
/* Set up return values */
best_mv->as_int = near_mvs[0].as_int;
nearest->as_int = near_mvs[CNT_NEAREST].as_int;
nearby->as_int = near_mvs[CNT_NEAR].as_int;
*best_mv = near_mvs[0].as_mv;
*nearest = near_mvs[CNT_NEAREST].as_mv;
*nearby = near_mvs[CNT_NEAR].as_mv;
//TODO: move clamp outside findnearmv
vp8_clamp_mv2(nearest, xd);
vp8_clamp_mv2(nearby, xd);
vp8_clamp_mv2(best_mv, xd);
vp8_clamp_mv(nearest, xd);
vp8_clamp_mv(nearby, xd);
vp8_clamp_mv(best_mv, xd); /*TODO: move this up before the copy*/
}
vp8_prob *vp8_mv_ref_probs(
@@ -153,3 +152,26 @@ vp8_prob *vp8_mv_ref_probs(
return p;
}
const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b)
{
if (!(b & 3))
{
/* On L edge, get from MB to left of us */
--cur_mb;
b += 4;
}
return cur_mb->bmi + b - 1;
}
const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride)
{
if (!(b >> 2))
{
/* On top edge, get from MB above us */
cur_mb -= mi_stride;
b += 16;
}
return cur_mb->bmi + b - 4;
}

View File

@@ -17,6 +17,11 @@
#include "modecont.h"
#include "treecoder.h"
typedef union
{
unsigned int as_int;
MV as_mv;
} int_mv; /* facilitates rapid equality tests */
static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, const int *ref_frame_sign_bias)
{
@@ -34,48 +39,24 @@ static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, int_mv *mvp, co
#define LEFT_TOP_MARGIN (16 << 3)
#define RIGHT_BOTTOM_MARGIN (16 << 3)
static void vp8_clamp_mv2(int_mv *mv, const MACROBLOCKD *xd)
static void vp8_clamp_mv(MV *mv, const MACROBLOCKD *xd)
{
if (mv->as_mv.col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
mv->as_mv.col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
else if (mv->as_mv.col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
mv->as_mv.col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
if (mv->col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
mv->col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
else if (mv->col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
mv->col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
if (mv->as_mv.row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
mv->as_mv.row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
else if (mv->as_mv.row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
mv->as_mv.row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
}
static void vp8_clamp_mv(int_mv *mv, int mb_to_left_edge, int mb_to_right_edge,
int mb_to_top_edge, int mb_to_bottom_edge)
{
mv->as_mv.col = (mv->as_mv.col < mb_to_left_edge) ?
mb_to_left_edge : mv->as_mv.col;
mv->as_mv.col = (mv->as_mv.col > mb_to_right_edge) ?
mb_to_right_edge : mv->as_mv.col;
mv->as_mv.row = (mv->as_mv.row < mb_to_top_edge) ?
mb_to_top_edge : mv->as_mv.row;
mv->as_mv.row = (mv->as_mv.row > mb_to_bottom_edge) ?
mb_to_bottom_edge : mv->as_mv.row;
}
static unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge,
int mb_to_right_edge, int mb_to_top_edge,
int mb_to_bottom_edge)
{
unsigned int need_to_clamp;
need_to_clamp = (mv->as_mv.col < mb_to_left_edge) ? 1 : 0;
need_to_clamp |= (mv->as_mv.col > mb_to_right_edge) ? 1 : 0;
need_to_clamp |= (mv->as_mv.row < mb_to_top_edge) ? 1 : 0;
need_to_clamp |= (mv->as_mv.row > mb_to_bottom_edge) ? 1 : 0;
return need_to_clamp;
if (mv->row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
mv->row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
else if (mv->row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
mv->row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
}
void vp8_find_near_mvs
(
MACROBLOCKD *xd,
const MODE_INFO *here,
int_mv *nearest, int_mv *nearby, int_mv *best,
MV *nearest, MV *nearby, MV *best,
int near_mv_ref_cts[4],
int refframe,
int *ref_frame_sign_bias
@@ -85,89 +66,10 @@ vp8_prob *vp8_mv_ref_probs(
vp8_prob p[VP8_MVREFS-1], const int near_mv_ref_ct[4]
);
const B_MODE_INFO *vp8_left_bmi(const MODE_INFO *cur_mb, int b);
const B_MODE_INFO *vp8_above_bmi(const MODE_INFO *cur_mb, int b, int mi_stride);
extern const unsigned char vp8_mbsplit_offset[4][16];
static int left_block_mv(const MODE_INFO *cur_mb, int b)
{
if (!(b & 3))
{
/* On L edge, get from MB to left of us */
--cur_mb;
if(cur_mb->mbmi.mode != SPLITMV)
return cur_mb->mbmi.mv.as_int;
b += 4;
}
return (cur_mb->bmi + b - 1)->mv.as_int;
}
static int above_block_mv(const MODE_INFO *cur_mb, int b, int mi_stride)
{
if (!(b >> 2))
{
/* On top edge, get from MB above us */
cur_mb -= mi_stride;
if(cur_mb->mbmi.mode != SPLITMV)
return cur_mb->mbmi.mv.as_int;
b += 16;
}
return (cur_mb->bmi + b - 4)->mv.as_int;
}
static B_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b)
{
if (!(b & 3))
{
/* On L edge, get from MB to left of us */
--cur_mb;
switch (cur_mb->mbmi.mode)
{
case B_PRED:
return (cur_mb->bmi + b + 3)->as_mode;
case DC_PRED:
return B_DC_PRED;
case V_PRED:
return B_VE_PRED;
case H_PRED:
return B_HE_PRED;
case TM_PRED:
return B_TM_PRED;
default:
return B_DC_PRED;
}
}
return (cur_mb->bmi + b - 1)->as_mode;
}
static B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, int mi_stride)
{
if (!(b >> 2))
{
/* On top edge, get from MB above us */
cur_mb -= mi_stride;
switch (cur_mb->mbmi.mode)
{
case B_PRED:
return (cur_mb->bmi + b + 12)->as_mode;
case DC_PRED:
return B_DC_PRED;
case V_PRED:
return B_VE_PRED;
case H_PRED:
return B_HE_PRED;
case TM_PRED:
return B_TM_PRED;
default:
return B_DC_PRED;
}
}
return (cur_mb->bmi + b - 4)->as_mode;
}
#endif

View File

@@ -17,53 +17,9 @@
#include "vp8/common/idct.h"
#include "vp8/common/onyxc_int.h"
#if CONFIG_MULTITHREAD
#if HAVE_UNISTD_H
#include <unistd.h>
#elif defined(_WIN32)
#include <windows.h>
typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO);
#endif
#endif
extern void vp8_arch_x86_common_init(VP8_COMMON *ctx);
extern void vp8_arch_arm_common_init(VP8_COMMON *ctx);
#if CONFIG_MULTITHREAD
static int get_cpu_count()
{
int core_count = 16;
#if HAVE_UNISTD_H
#if defined(_SC_NPROCESSORS_ONLN)
core_count = sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(_SC_NPROC_ONLN)
core_count = sysconf(_SC_NPROC_ONLN);
#endif
#elif defined(_WIN32)
{
PGNSI pGNSI;
SYSTEM_INFO sysinfo;
/* Call GetNativeSystemInfo if supported or
* GetSystemInfo otherwise. */
pGNSI = (PGNSI) GetProcAddress(
GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo");
if (pGNSI != NULL)
pGNSI(&sysinfo);
else
GetSystemInfo(&sysinfo);
core_count = sysinfo.dwNumberOfProcessors;
}
#else
/* other platforms */
#endif
return core_count > 0 ? core_count : 1;
}
#endif
extern void vp8_arch_opencl_common_init(VP8_COMMON *ctx);
void vp8_machine_specific_config(VP8_COMMON *ctx)
{
@@ -88,12 +44,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
vp8_build_intra_predictors_mby;
rtcd->recon.build_intra_predictors_mby_s =
vp8_build_intra_predictors_mby_s;
rtcd->recon.build_intra_predictors_mbuv =
vp8_build_intra_predictors_mbuv;
rtcd->recon.build_intra_predictors_mbuv_s =
vp8_build_intra_predictors_mbuv_s;
rtcd->recon.intra4x4_predict =
vp8_intra4x4_predict;
rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c;
rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c;
@@ -108,12 +58,12 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c;
rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c;
rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_c;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_c;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c;
rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_c;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_c;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c;
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c;
#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS)
#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_PSNR)
rtcd->postproc.down = vp8_mbpost_proc_down_c;
rtcd->postproc.across = vp8_mbpost_proc_across_ip_c;
rtcd->postproc.downacross = vp8_post_proc_down_and_across_c;
@@ -133,7 +83,8 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
vp8_arch_arm_common_init(ctx);
#endif
#if CONFIG_MULTITHREAD
ctx->processor_core_count = get_cpu_count();
#endif /* CONFIG_MULTITHREAD */
#if CONFIG_OPENCL && (ENABLE_CL_IDCT_DEQUANT || ENABLE_CL_SUBPIXEL || ENABLE_CL_LOOPFILTER)
vp8_arch_opencl_common_init(ctx);
#endif
}

View File

@@ -31,6 +31,10 @@
#include "arm/idct_arm.h"
#endif
#if CONFIG_OPENCL
#include "opencl/idct_cl.h"
#endif
#ifndef vp8_idct_idct1
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_c
#endif

View File

@@ -9,149 +9,164 @@
*/
#include "vpx_config.h"
#include "vpx_ports/config.h"
#include "loopfilter.h"
#include "onyxc_int.h"
#include "vpx_mem/vpx_mem.h"
#if CONFIG_OPENCL
#include "opencl/loopfilter_cl.h"
#endif
typedef unsigned char uc;
prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
(void) simpler_lpf;
vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
if (v_ptr)
vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
(void) simpler_lpf;
vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
if (v_ptr)
vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
(void) simpler_lpf;
vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
if (v_ptr)
vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
(void) simpler_lpf;
vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
if (v_ptr)
vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
static void lf_init_lut(loop_filter_info_n *lfi)
void vp8_init_loop_filter(VP8_COMMON *cm)
{
int filt_lvl;
loop_filter_info *lfi = cm->lf_info;
LOOPFILTERTYPE lft = cm->filter_type;
int sharpness_lvl = cm->sharpness_level;
int frame_type = cm->frame_type;
int i, j;
for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++)
{
if (filt_lvl >= 40)
{
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
}
else if (filt_lvl >= 20)
{
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
}
else if (filt_lvl >= 15)
{
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
}
else
{
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
}
}
int block_inside_limit = 0;
int HEVThresh;
lfi->mode_lf_lut[DC_PRED] = 1;
lfi->mode_lf_lut[V_PRED] = 1;
lfi->mode_lf_lut[H_PRED] = 1;
lfi->mode_lf_lut[TM_PRED] = 1;
lfi->mode_lf_lut[B_PRED] = 0;
lfi->mode_lf_lut[ZEROMV] = 1;
lfi->mode_lf_lut[NEARESTMV] = 2;
lfi->mode_lf_lut[NEARMV] = 2;
lfi->mode_lf_lut[NEWMV] = 2;
lfi->mode_lf_lut[SPLITMV] = 3;
}
void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
int sharpness_lvl)
{
int i;
/* For each possible value for the loop filter fill out limits */
/* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
{
int filt_lvl = i;
int block_inside_limit = 0;
if (frame_type == KEY_FRAME)
{
if (filt_lvl >= 40)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
}
else
{
if (filt_lvl >= 40)
HEVThresh = 3;
else if (filt_lvl >= 20)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
}
/* Set loop filter paramaeters that control sharpness. */
block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
@@ -166,143 +181,177 @@ void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
if (block_inside_limit < 1)
block_inside_limit = 1;
vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
SIMD_WIDTH);
vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
SIMD_WIDTH);
for (j = 0; j < 16; j++)
{
lfi[i].lim[j] = block_inside_limit;
lfi[i].mbflim[j] = filt_lvl + 2;
lfi[i].flim[j] = filt_lvl;
lfi[i].thr[j] = HEVThresh;
}
}
/* Set up the function pointers depending on the type of loop filtering selected */
if (lft == NORMAL_LOOPFILTER)
{
cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v);
cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v);
cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h);
cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h);
}
else
{
cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v);
cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v);
cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h);
cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h);
}
}
void vp8_loop_filter_init(VP8_COMMON *cm)
{
loop_filter_info_n *lfi = &cm->lf_info;
int i;
/* init limits for given sharpness*/
vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
cm->last_sharpness_level = cm->sharpness_level;
/* init LUT for lvl and hev thr picking */
lf_init_lut(lfi);
/* init hev threshold const vectors */
for(i = 0; i < 4 ; i++)
{
vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
}
}
void vp8_loop_filter_frame_init(VP8_COMMON *cm,
MACROBLOCKD *mbd,
int default_filt_lvl)
{
int seg, /* segment number */
ref, /* index in ref_lf_deltas */
mode; /* index in mode_lf_deltas */
loop_filter_info_n *lfi = &cm->lf_info;
/* update limits if sharpness has changed */
if(cm->last_sharpness_level != cm->sharpness_level)
{
vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
cm->last_sharpness_level = cm->sharpness_level;
}
for(seg = 0; seg < MAX_MB_SEGMENTS; seg++)
{
int lvl_seg = default_filt_lvl;
int lvl_ref, lvl_mode;
/* Note the baseline filter values for each segment */
if (mbd->segmentation_enabled)
{
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
{
lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
}
else /* Delta Value */
{
lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
}
}
if (!mbd->mode_ref_lf_delta_enabled)
{
/* we could get rid of this if we assume that deltas are set to
* zero when not in use; encoder always uses deltas
/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding
* each frame. Check last_frame_type to skip the function most of times.
*/
vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
continue;
void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
{
int HEVThresh;
int i, j;
/* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
{
int filt_lvl = i;
if (frame_type == KEY_FRAME)
{
if (filt_lvl >= 40)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
}
else
{
if (filt_lvl >= 40)
HEVThresh = 3;
else if (filt_lvl >= 20)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
}
lvl_ref = lvl_seg;
for (j = 0; j < 16; j++)
{
/*lfi[i].lim[j] = block_inside_limit;
lfi[i].mbflim[j] = filt_lvl+2;*/
/*lfi[i].flim[j] = filt_lvl;*/
lfi[i].thr[j] = HEVThresh;
}
}
}
/* INTRA_FRAME */
ref = INTRA_FRAME;
int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level)
{
MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi;
if (mbd->mode_ref_lf_delta_enabled)
{
/* Apply delta for reference frame */
lvl_ref += mbd->ref_lf_deltas[ref];
filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
/* Apply delta for Intra modes */
mode = 0; /* B_PRED */
/* Apply delta for mode */
if (mbmi->ref_frame == INTRA_FRAME)
{
/* Only the split mode BPRED has a further special case */
lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
lfi->lvl[seg][ref][mode] = lvl_mode;
mode = 1; /* all the rest of Intra modes */
lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */
lfi->lvl[seg][ref][mode] = lvl_mode;
/* LAST, GOLDEN, ALT */
for(ref = 1; ref < MAX_REF_FRAMES; ref++)
if (mbmi->mode == B_PRED)
filter_level += mbd->mode_lf_deltas[0];
}
else
{
int lvl_ref = lvl_seg;
/* Zero motion mode */
if (mbmi->mode == ZEROMV)
filter_level += mbd->mode_lf_deltas[1];
/* Apply delta for reference frame */
lvl_ref += mbd->ref_lf_deltas[ref];
/* Split MB motion mode */
else if (mbmi->mode == SPLITMV)
filter_level += mbd->mode_lf_deltas[3];
/* Apply delta for Inter modes */
for (mode = 1; mode < 4; mode++)
{
lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
/* All other inter motion modes (Nearest, Near, New) */
else
filter_level += mbd->mode_lf_deltas[2];
}
lfi->lvl[seg][ref][mode] = lvl_mode;
}
}
/* Range check */
if (filter_level > MAX_LOOP_FILTER)
filter_level = MAX_LOOP_FILTER;
else if (filter_level < 0)
filter_level = 0;
}
return filter_level;
}
void vp8_loop_filter_frame
(
VP8_COMMON *cm,
MACROBLOCKD *mbd
MACROBLOCKD *mbd,
int default_filt_lvl
)
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
loop_filter_info_n *lfi_n = &cm->lf_info;
loop_filter_info lfi;
loop_filter_info *lfi = cm->lf_info;
FRAME_TYPE frame_type = cm->frame_type;
int mb_row;
int mb_col;
int filter_level;
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
int i;
unsigned char *y_ptr, *u_ptr, *v_ptr;
/* Point at base of Mb MODE_INFO list */
const MODE_INFO *mode_info_context = cm->mi;
#if CONFIG_OPENCL && ENABLE_CL_LOOPFILTER
if ( cl_initialized == CL_SUCCESS ){
vp8_loop_filter_frame_cl(cm,mbd,default_filt_lvl);
return;
}
#endif
mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */
/* Note the baseline filter values for each segment */
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
/* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
else
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
baseline_filter_level[i] = default_filt_lvl;
}
/* Initialize the loop filter for this frame. */
vp8_loop_filter_frame_init(cm, mbd, cm->filter_level);
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
/* Set up the buffer pointers */
y_ptr = post->y_buffer;
@@ -314,108 +363,102 @@ void vp8_loop_filter_frame
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
filter_level = baseline_filter_level[Segment];
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
/* Distance of Mb to the various image edges.
* These specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment
*/
filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);
if (filter_level)
{
if (cm->filter_type == NORMAL_LOOPFILTER)
{
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
if (mbd->mode_info_context->mbmi.dc_diff > 0)
cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
}
else
{
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
}
if (mbd->mode_info_context->mbmi.dc_diff > 0)
cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
}
y_ptr += 16;
u_ptr += 8;
v_ptr += 8;
mode_info_context++; /* step to next MB */
mbd->mode_info_context++; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
u_ptr += post->uv_stride * 8 - post->uv_width;
v_ptr += post->uv_stride * 8 - post->uv_width;
mode_info_context++; /* Skip border mb */
mbd->mode_info_context++; /* Skip border mb */
}
}
/* Encoder only... */
void vp8_loop_filter_frame_yonly
(
VP8_COMMON *cm,
MACROBLOCKD *mbd,
int default_filt_lvl
int default_filt_lvl,
int sharpness_lvl
)
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
int i;
unsigned char *y_ptr;
int mb_row;
int mb_col;
loop_filter_info_n *lfi_n = &cm->lf_info;
loop_filter_info lfi;
loop_filter_info *lfi = cm->lf_info;
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
FRAME_TYPE frame_type = cm->frame_type;
/* Point at base of Mb MODE_INFO list */
const MODE_INFO *mode_info_context = cm->mi;
(void) sharpness_lvl;
#if 0
if(default_filt_lvl == 0) /* no filter applied */
return;
#endif
/*MODE_INFO * this_mb_mode_info = cm->mi;*/ /* Point at base of Mb MODE_INFO list */
mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */
/* Note the baseline filter values for each segment */
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
/* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
else
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
baseline_filter_level[i] = default_filt_lvl;
}
/* Initialize the loop filter for this frame. */
vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl);
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
/* Set up the buffer pointers */
y_ptr = post->y_buffer;
@@ -425,106 +468,72 @@ void vp8_loop_filter_frame_yonly
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
filter_level = baseline_filter_level[Segment];
const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
/* Apply any context driven MB level adjustment */
filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);
if (filter_level)
{
if (cm->filter_type == NORMAL_LOOPFILTER)
{
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (mbd->mode_info_context->mbmi.dc_diff > 0)
cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
}
else
{
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
}
if (mbd->mode_info_context->mbmi.dc_diff > 0)
cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
}
y_ptr += 16;
mode_info_context ++; /* step to next MB */
mbd->mode_info_context ++; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
mode_info_context ++; /* Skip border mb */
mbd->mode_info_context ++; /* Skip border mb */
}
}
/* Encoder only... */
void vp8_loop_filter_partial_frame
(
VP8_COMMON *cm,
MACROBLOCKD *mbd,
int default_filt_lvl
int default_filt_lvl,
int sharpness_lvl,
int Fraction
)
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
int i;
unsigned char *y_ptr;
int mb_row;
int mb_col;
/*int mb_rows = post->y_height >> 4;*/
int mb_cols = post->y_width >> 4;
int linestocopy, i;
loop_filter_info_n *lfi_n = &cm->lf_info;
loop_filter_info lfi;
int linestocopy;
loop_filter_info *lfi = cm->lf_info;
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
FRAME_TYPE frame_type = cm->frame_type;
const MODE_INFO *mode_info_context;
(void) sharpness_lvl;
int lvl_seg[MAX_MB_SEGMENTS];
/*MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1);*/ /* Point at base of Mb MODE_INFO list */
mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); /* Point at base of Mb MODE_INFO list */
mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
/* 3 is a magic number. 4 is probably magic too */
linestocopy = (post->y_height >> (4 + 3));
linestocopy = (post->y_height >> (4 + Fraction));
if (linestocopy < 1)
linestocopy = 1;
@@ -532,27 +541,32 @@ void vp8_loop_filter_partial_frame
linestocopy <<= 4;
/* Note the baseline filter values for each segment */
/* See vp8_loop_filter_frame_init. Rather than call that for each change
* to default_filt_lvl, copy the relevant calculation here.
*/
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{ /* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
{
lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
}
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
/* Delta Value */
else
{
lvl_seg[i] = default_filt_lvl
+ mbd->segment_feature_data[MB_LVL_ALT_LF][i];
lvl_seg[i] = (lvl_seg[i] > 0) ?
((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0;
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
else
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
baseline_filter_level[i] = default_filt_lvl;
}
/* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
/* Set up the buffer pointers */
y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
@@ -562,64 +576,28 @@ void vp8_loop_filter_partial_frame
{
for (mb_col = 0; mb_col < mb_cols; mb_col++)
{
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
if (alt_flt_enabled)
filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
else
filter_level = default_filt_lvl;
int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
filter_level = baseline_filter_level[Segment];
if (filter_level)
{
if (cm->filter_type == NORMAL_LOOPFILTER)
{
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (mbd->mode_info_context->mbmi.dc_diff > 0)
cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
}
else
{
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
}
if (mbd->mode_info_context->mbmi.dc_diff > 0)
cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0);
}
y_ptr += 16;
mode_info_context += 1; /* step to next MB */
mbd->mode_info_context += 1; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
mode_info_context += 1; /* Skip border mb */
mbd->mode_info_context += 1; /* Skip border mb */
}
}

View File

@@ -13,7 +13,6 @@
#define loopfilter_h
#include "vpx_ports/mem.h"
#include "vpx_config.h"
#define MAX_LOOP_FILTER 63
@@ -23,45 +22,26 @@ typedef enum
SIMPLE_LOOPFILTER = 1
} LOOPFILTERTYPE;
#if ARCH_ARM
#define SIMD_WIDTH 1
#else
#define SIMD_WIDTH 16
#endif
/* Need to align this structure so when it is declared and
/* FRK
* Need to align this structure so when it is declared and
* passed it can be loaded into vector registers.
*/
typedef struct
{
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
unsigned char lvl[4][4][4];
unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
unsigned char mode_lf_lut[10];
} loop_filter_info_n;
typedef struct
{
const unsigned char * mblim;
const unsigned char * blim;
const unsigned char * lim;
const unsigned char * hev_thr;
DECLARE_ALIGNED(16, signed char, lim[16]);
DECLARE_ALIGNED(16, signed char, flim[16]);
DECLARE_ALIGNED(16, signed char, thr[16]);
DECLARE_ALIGNED(16, signed char, mbflim[16]);
} loop_filter_info;
#define prototype_loopfilter(sym) \
void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
const unsigned char *limit, const unsigned char *thresh, int count)
void sym(unsigned char *src, int pitch, const signed char *flimit,\
const signed char *limit, const signed char *thresh, int count)
#define prototype_loopfilter_block(sym) \
void sym(unsigned char *y, unsigned char *u, unsigned char *v,\
int ystride, int uv_stride, loop_filter_info *lfi)
#define prototype_simple_loopfilter(sym) \
void sym(unsigned char *y, int ystride, const unsigned char *blimit)
int ystride, int uv_stride, loop_filter_info *lfi, int simpler)
#if ARCH_X86 || ARCH_X86_64
#include "x86/loopfilter_x86.h"
@@ -91,39 +71,38 @@ extern prototype_loopfilter_block(vp8_lf_normal_mb_h);
#endif
extern prototype_loopfilter_block(vp8_lf_normal_b_h);
#ifndef vp8_lf_simple_mb_v
#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_c
#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_c
#endif
extern prototype_simple_loopfilter(vp8_lf_simple_mb_v);
extern prototype_loopfilter_block(vp8_lf_simple_mb_v);
#ifndef vp8_lf_simple_b_v
#define vp8_lf_simple_b_v vp8_loop_filter_bvs_c
#endif
extern prototype_simple_loopfilter(vp8_lf_simple_b_v);
extern prototype_loopfilter_block(vp8_lf_simple_b_v);
#ifndef vp8_lf_simple_mb_h
#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_c
#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_c
#endif
extern prototype_simple_loopfilter(vp8_lf_simple_mb_h);
extern prototype_loopfilter_block(vp8_lf_simple_mb_h);
#ifndef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_c
#endif
extern prototype_simple_loopfilter(vp8_lf_simple_b_h);
extern prototype_loopfilter_block(vp8_lf_simple_b_h);
typedef prototype_loopfilter_block((*vp8_lf_block_fn_t));
typedef prototype_simple_loopfilter((*vp8_slf_block_fn_t));
typedef struct
{
vp8_lf_block_fn_t normal_mb_v;
vp8_lf_block_fn_t normal_b_v;
vp8_lf_block_fn_t normal_mb_h;
vp8_lf_block_fn_t normal_b_h;
vp8_slf_block_fn_t simple_mb_v;
vp8_slf_block_fn_t simple_b_v;
vp8_slf_block_fn_t simple_mb_h;
vp8_slf_block_fn_t simple_b_h;
vp8_lf_block_fn_t simple_mb_v;
vp8_lf_block_fn_t simple_b_v;
vp8_lf_block_fn_t simple_mb_h;
vp8_lf_block_fn_t simple_b_h;
} vp8_loopfilter_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
@@ -136,33 +115,10 @@ typedef void loop_filter_uvfunction
(
unsigned char *u, /* source pointer */
int p, /* pitch */
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
unsigned char *v
);
/* assorted loopfilter functions which get used elsewhere */
struct VP8Common;
struct MacroBlockD;
void vp8_loop_filter_init(struct VP8Common *cm);
void vp8_loop_filter_frame_init(struct VP8Common *cm,
struct MacroBlockD *mbd,
int default_filt_lvl);
void vp8_loop_filter_frame(struct VP8Common *cm, struct MacroBlockD *mbd);
void vp8_loop_filter_partial_frame(struct VP8Common *cm,
struct MacroBlockD *mbd,
int default_filt_lvl);
void vp8_loop_filter_frame_yonly(struct VP8Common *cm,
struct MacroBlockD *mbd,
int default_filt_lvl);
void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
int sharpness_lvl);
#endif

View File

@@ -24,9 +24,8 @@ static __inline signed char vp8_signed_char_clamp(int t)
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_filter_mask(uc limit, uc blimit,
uc p3, uc p2, uc p1, uc p0,
uc q0, uc q1, uc q2, uc q3)
static __inline signed char vp8_filter_mask(signed char limit, signed char flimit,
uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3)
{
signed char mask = 0;
mask |= (abs(p3 - p2) > limit) * -1;
@@ -35,13 +34,13 @@ static __inline signed char vp8_filter_mask(uc limit, uc blimit,
mask |= (abs(q1 - q0) > limit) * -1;
mask |= (abs(q2 - q1) > limit) * -1;
mask |= (abs(q3 - q2) > limit) * -1;
mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1;
mask = ~mask;
return mask;
}
/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
{
signed char hev = 0;
hev |= (abs(p1 - p0) > thresh) * -1;
@@ -49,9 +48,7 @@ static __inline signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
return hev;
}
static __inline void vp8_filter(signed char mask, uc hev, uc *op1,
uc *op0, uc *oq0, uc *oq1)
static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1)
{
signed char ps0, qs0;
signed char ps1, qs1;
@@ -96,13 +93,14 @@ static __inline void vp8_filter(signed char mask, uc hev, uc *op1,
*op1 = u ^ 0x80;
}
void vp8_loop_filter_horizontal_edge_c
(
unsigned char *s,
int p, /* pitch */
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count
)
{
@@ -115,11 +113,11 @@ void vp8_loop_filter_horizontal_edge_c
*/
do
{
mask = vp8_filter_mask(limit[0], blimit[0],
mask = vp8_filter_mask(limit[i], flimit[i],
s[-4*p], s[-3*p], s[-2*p], s[-1*p],
s[0*p], s[1*p], s[2*p], s[3*p]);
hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
@@ -132,9 +130,9 @@ void vp8_loop_filter_vertical_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count
)
{
@@ -147,10 +145,10 @@ void vp8_loop_filter_vertical_edge_c
*/
do
{
mask = vp8_filter_mask(limit[0], blimit[0],
mask = vp8_filter_mask(limit[i], flimit[i],
s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);
@@ -159,7 +157,7 @@ void vp8_loop_filter_vertical_edge_c
while (++i < count * 8);
}
static __inline void vp8_mbfilter(signed char mask, uc hev,
static __inline void vp8_mbfilter(signed char mask, signed char hev,
uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
{
signed char s, u;
@@ -218,9 +216,9 @@ void vp8_mbloop_filter_horizontal_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count
)
{
@@ -234,11 +232,11 @@ void vp8_mbloop_filter_horizontal_edge_c
do
{
mask = vp8_filter_mask(limit[0], blimit[0],
mask = vp8_filter_mask(limit[i], flimit[i],
s[-4*p], s[-3*p], s[-2*p], s[-1*p],
s[0*p], s[1*p], s[2*p], s[3*p]);
hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
@@ -253,9 +251,9 @@ void vp8_mbloop_filter_vertical_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count
)
{
@@ -266,10 +264,10 @@ void vp8_mbloop_filter_vertical_edge_c
do
{
mask = vp8_filter_mask(limit[0], blimit[0],
mask = vp8_filter_mask(limit[i], flimit[i],
s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
@@ -280,13 +278,13 @@ void vp8_mbloop_filter_vertical_edge_c
}
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
static __inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1)
{
/* Why does this cause problems for win32?
* error C2143: syntax error : missing ';' before 'type'
* (void) limit;
*/
signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1;
signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1;
return mask;
}
@@ -319,37 +317,47 @@ void vp8_loop_filter_simple_horizontal_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count
)
{
signed char mask = 0;
int i = 0;
(void) thresh;
do
{
mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
/*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);*/
mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
++s;
}
while (++i < 16);
while (++i < count * 8);
}
void vp8_loop_filter_simple_vertical_edge_c
(
unsigned char *s,
int p,
const unsigned char *blimit
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count
)
{
signed char mask = 0;
int i = 0;
(void) thresh;
do
{
mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
/*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);*/
mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2], s[-1], s[0], s[1]);
vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
s += p;
}
while (++i < 16);
while (++i < count * 8);
}

View File

@@ -11,6 +11,12 @@
#include "blockd.h"
#include "stdio.h"
#include "vpx_config.h"
#if CONFIG_OPENCL
#include "opencl/vp8_opencl.h"
#endif
typedef enum
{
PRED = 0,
@@ -20,7 +26,6 @@ typedef enum
static void setup_block
(
BLOCKD *b,
int mv_stride,
unsigned char **base,
int Stride,
int offset,
@@ -49,81 +54,176 @@ static void setup_macroblock(MACROBLOCKD *x, BLOCKSET bs)
int block;
unsigned char **y, **u, **v;
unsigned char **buf_base;
int y_off, u_off, v_off;
if (bs == DEST)
{
buf_base = &x->dst.buffer_alloc;
y_off = x->dst.y_buffer - x->dst.buffer_alloc;
u_off = x->dst.u_buffer - x->dst.buffer_alloc;
v_off = x->dst.v_buffer - x->dst.buffer_alloc;
y = &x->dst.y_buffer;
u = &x->dst.u_buffer;
v = &x->dst.v_buffer;
y_off = 0;
//y = buf_base;
//y_off = x->dst.y_buffer - x->dst.buffer_alloc;
u = buf_base;
v = buf_base;
u_off = x->dst.u_buffer - x->dst.buffer_alloc;
v_off = x->dst.v_buffer - x->dst.buffer_alloc;
}
else
{
buf_base = &x->pre.buffer_alloc;
y = &x->pre.y_buffer;
u = &x->pre.u_buffer;
v = &x->pre.v_buffer;
y_off = u_off = v_off = 0;
//y = buf_base;
//y_off = x->pre.y_buffer - x->pre.buffer_alloc;
//u = buf_base;
//u_off = x->pre.u_buffer - x->pre.buffer_alloc;
//v = buf_base;
//v_off = x->pre.v_buffer - x->pre.buffer_alloc;
}
for (block = 0; block < 16; block++) /* y blocks */
{
setup_block(&x->block[block], x->dst.y_stride, y, x->dst.y_stride,
(block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4, bs);
setup_block(&x->block[block], y, x->dst.y_stride,
y_off + ((block >> 2) * 4 * x->dst.y_stride + (block & 3) * 4), bs);
}
for (block = 16; block < 20; block++) /* U and V blocks */
{
setup_block(&x->block[block], x->dst.uv_stride, u, x->dst.uv_stride,
((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
int block_off = ((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4;
setup_block(&x->block[block+4], x->dst.uv_stride, v, x->dst.uv_stride,
((block - 16) >> 1) * 4 * x->dst.uv_stride + (block & 1) * 4, bs);
setup_block(&x->block[block], u, x->dst.uv_stride,
u_off + block_off, bs);
setup_block(&x->block[block+4], v, x->dst.uv_stride,
v_off + block_off, bs);
}
}
void vp8_setup_block_dptrs(MACROBLOCKD *x)
{
int r, c;
unsigned int offset;
#if CONFIG_OPENCL && !ONE_CQ_PER_MB
cl_command_queue y_cq, u_cq, v_cq;
int err;
if (cl_initialized == CL_SUCCESS){
//Create command queue for Y/U/V Planes
y_cq = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err);
if (!y_cq || err != CL_SUCCESS) {
printf("Error: Failed to create a command queue!\n");
cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
}
u_cq = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err);
if (!u_cq || err != CL_SUCCESS) {
printf("Error: Failed to create a command queue!\n");
cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
}
v_cq = clCreateCommandQueue(cl_data.context, cl_data.device_id, 0, &err);
if (!v_cq || err != CL_SUCCESS) {
printf("Error: Failed to create a command queue!\n");
cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
}
}
#endif
/* 16 Y blocks */
for (r = 0; r < 4; r++)
{
for (c = 0; c < 4; c++)
{
x->block[r*4+c].diff = &x->diff[r * 4 * 16 + c * 4];
x->block[r*4+c].predictor = x->predictor + r * 4 * 16 + c * 4;
offset = r * 4 * 16 + c * 4;
x->block[r*4+c].diff_offset = offset;
x->block[r*4+c].predictor_offset = offset;
#if CONFIG_OPENCL && !ONE_CQ_PER_MB
if (cl_initialized == CL_SUCCESS)
x->block[r*4+c].cl_commands = y_cq;
#endif
}
}
/* 4 U Blocks */
for (r = 0; r < 2; r++)
{
for (c = 0; c < 2; c++)
{
x->block[16+r*2+c].diff = &x->diff[256 + r * 4 * 8 + c * 4];
x->block[16+r*2+c].predictor = x->predictor + 256 + r * 4 * 8 + c * 4;
offset = 256 + r * 4 * 8 + c * 4;
x->block[16+r*2+c].diff_offset = offset;
x->block[16+r*2+c].predictor_offset = offset;
#if CONFIG_OPENCL && !ONE_CQ_PER_MB
if (cl_initialized == CL_SUCCESS)
x->block[16+r*2+c].cl_commands = u_cq;
#endif
}
}
/* 4 V Blocks */
for (r = 0; r < 2; r++)
{
for (c = 0; c < 2; c++)
{
x->block[20+r*2+c].diff = &x->diff[320+ r * 4 * 8 + c * 4];
x->block[20+r*2+c].predictor = x->predictor + 320 + r * 4 * 8 + c * 4;
offset = 320+ r * 4 * 8 + c * 4;
x->block[20+r*2+c].diff_offset = offset;
x->block[20+r*2+c].predictor_offset = offset;
#if CONFIG_OPENCL && !ONE_CQ_PER_MB
if (cl_initialized == CL_SUCCESS)
x->block[20+r*2+c].cl_commands = v_cq;
#endif
}
}
x->block[24].diff = &x->diff[384];
x->block[24].diff_offset = 384;
for (r = 0; r < 25; r++)
{
x->block[r].qcoeff = x->qcoeff + r * 16;
x->block[r].dqcoeff = x->dqcoeff + r * 16;
x->block[r].qcoeff_base = x->qcoeff;
x->block[r].qcoeff_offset = r * 16;
x->block[r].dqcoeff_base = x->dqcoeff;
x->block[r].dqcoeff_offset = r * 16;
x->block[r].predictor_base = x->predictor;
x->block[r].diff_base = x->diff;
x->block[r].eobs_base = x->eobs;
#if CONFIG_OPENCL
if (cl_initialized == CL_SUCCESS){
/* Copy command queue reference from macroblock */
#if ONE_CQ_PER_MB
x->block[r].cl_commands = x->cl_commands;
#endif
/* Set up CL memory buffers as appropriate */
x->block[r].cl_diff_mem = x->cl_diff_mem;
x->block[r].cl_dqcoeff_mem = x->cl_dqcoeff_mem;
x->block[r].cl_eobs_mem = x->cl_eobs_mem;
x->block[r].cl_predictor_mem = x->cl_predictor_mem;
x->block[r].cl_qcoeff_mem = x->cl_qcoeff_mem;
}
//Copy filter type to block.
x->block[r].sixtap_filter = x->sixtap_filter;
#endif
}
}
void vp8_build_block_doffsets(MACROBLOCKD *x)
{
/* handle the destination pitch features */
setup_macroblock(x, DEST);
setup_macroblock(x, PRED);

View File

@@ -11,7 +11,6 @@
#ifndef __INC_MV_H
#define __INC_MV_H
#include "vpx/vpx_integer.h"
typedef struct
{
@@ -19,10 +18,4 @@ typedef struct
short col;
} MV;
typedef union
{
uint32_t as_int;
MV as_mv;
} int_mv; /* facilitates faster equality tests and copies */
#endif

View File

@@ -109,7 +109,6 @@ extern "C"
int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0
int Sharpness; // parameter used for sharpening output: recommendation 0:
int cpu_used;
unsigned int rc_max_intra_bitrate_pct;
// mode ->
//(0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing
@@ -140,9 +139,8 @@ extern "C"
int end_usage; // vbr or cbr
// buffer targeting aggressiveness
// shoot to keep buffer full at all times by undershooting a bit 95 recommended
int under_shoot_pct;
int over_shoot_pct;
// buffering parameters
int starting_buffer_level; // in seconds
@@ -184,11 +182,8 @@ extern "C"
int token_partitions; // how many token partitions to create for multi core decoding
int encode_breakout; // early breakout encode threshold : for video conf recommend 800
unsigned int error_resilient_mode; // Bitfield defining the error
// resiliency features to enable. Can provide
// decodable frames after losses in previous
// frames and decodable partitions after
// losses in the same frame.
int error_resilient_mode; // if running over udp networks provides decodable frames after a
// dropped packet
int arnr_max_frames;
int arnr_strength ;
@@ -211,8 +206,8 @@ extern "C"
// receive a frames worth of data caller can assume that a copy of this frame is made
// and not just a copy of the pointer..
int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp);
int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, int64_t *time_stamp, int64_t *time_end, int flush);
int vp8_receive_raw_frame(VP8_PTR comp, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time_stamp);
int vp8_get_compressed_data(VP8_PTR comp, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush);
int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags);
int vp8_use_as_reference(VP8_PTR comp, int ref_frame_flags);

View File

@@ -19,9 +19,7 @@
#include "entropy.h"
#include "idct.h"
#include "recon.h"
#if CONFIG_POSTPROC
#include "postproc.h"
#endif
/*#ifdef PACKET_TESTING*/
#include "header.h"
@@ -37,15 +35,13 @@ void vp8_initialize_common(void);
#define NUM_YV12_BUFFERS 4
#define MAX_PARTITIONS 9
typedef struct frame_contexts
{
vp8_prob bmode_prob [VP8_BINTRAMODES-1];
vp8_prob ymode_prob [VP8_YMODES-1]; /* interframe intra mode probs */
vp8_prob uv_mode_prob [VP8_UV_MODES-1];
vp8_prob sub_mv_ref_prob [VP8_SUBMVREFS-1];
vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [ENTROPY_NODES];
vp8_prob coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1];
MV_CONTEXT mvc[2];
MV_CONTEXT pre_mvc[2]; /* not to caculate the mvcost for the frame if mvc doesn't change. */
} FRAME_CONTEXT;
@@ -77,9 +73,7 @@ typedef struct VP8_COMMON_RTCD
vp8_recon_rtcd_vtable_t recon;
vp8_subpix_rtcd_vtable_t subpix;
vp8_loopfilter_rtcd_vtable_t loopfilter;
#if CONFIG_POSTPROC
vp8_postproc_rtcd_vtable_t postproc;
#endif
int flags;
#else
int unused;
@@ -87,7 +81,6 @@ typedef struct VP8_COMMON_RTCD
} VP8_COMMON_RTCD;
typedef struct VP8Common
{
struct vpx_internal_error_info error;
@@ -112,8 +105,7 @@ typedef struct VP8Common
YV12_BUFFER_CONFIG post_proc_buffer;
YV12_BUFFER_CONFIG temp_scale_frame;
FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
FRAME_TYPE last_frame_type; /* Save last frame's frame type for loopfilter init checking and motion search. */
FRAME_TYPE frame_type;
int show_frame;
@@ -127,7 +119,7 @@ typedef struct VP8Common
/* profile settings */
int mb_no_coeff_skip;
int no_lpf;
int use_bilinear_mc_filter;
int simpler_lpf;
int full_pixel;
int base_qindex;
@@ -147,15 +139,16 @@ typedef struct VP8Common
MODE_INFO *mip; /* Base of allocated array */
MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */
MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
INTERPOLATIONFILTERTYPE mcomp_filter_type;
LOOPFILTERTYPE last_filter_type;
LOOPFILTERTYPE filter_type;
loop_filter_info_n lf_info;
loop_filter_info lf_info[MAX_LOOP_FILTER+1];
prototype_loopfilter_block((*lf_mbv));
prototype_loopfilter_block((*lf_mbh));
prototype_loopfilter_block((*lf_bv));
prototype_loopfilter_block((*lf_bh));
int filter_level;
int last_sharpness_level;
int sharpness_level;
@@ -202,12 +195,13 @@ typedef struct VP8Common
#if CONFIG_RUNTIME_CPU_DETECT
VP8_COMMON_RTCD rtcd;
#endif
#if CONFIG_MULTITHREAD
int processor_core_count;
#endif
#if CONFIG_POSTPROC
struct postproc_state postproc_state;
#endif
} VP8_COMMON;
int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level);
void vp8_init_loop_filter(VP8_COMMON *cm);
void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type);
extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val);
#endif

View File

@@ -18,12 +18,10 @@
extern "C"
{
#endif
#include "vpx/vpx_codec.h"
#include "type_aliases.h"
#include "vpx_scale/yv12config.h"
#include "ppflags.h"
#include "vpx_ports/mem.h"
#include "vpx/vpx_codec.h"
typedef void *VP8D_PTR;
typedef struct
@@ -33,8 +31,6 @@ extern "C"
int Version;
int postprocess;
int max_threads;
int error_concealment;
int input_partition;
} VP8D_CONFIG;
typedef enum
{
@@ -54,11 +50,11 @@ extern "C"
int vp8dx_get_setting(VP8D_PTR comp, VP8D_SETTING oxst);
int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, int64_t time_stamp);
int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags);
int vp8dx_receive_compressed_data(VP8D_PTR comp, unsigned long size, const unsigned char *dest, INT64 time_stamp);
int vp8dx_get_raw_frame(VP8D_PTR comp, YV12_BUFFER_CONFIG *sd, INT64 *time_stamp, INT64 *time_end_stamp, vp8_ppflags_t *flags);
vpx_codec_err_t vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
vpx_codec_err_t vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
int vp8dx_get_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
int vp8dx_set_reference(VP8D_PTR comp, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd);
VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf);

View File

@@ -0,0 +1,233 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "../../decoder/onyxd_int.h"
#include "../../../vpx_ports/config.h"
#include "../../common/idct.h"
#include "blockd_cl.h"
#include "../../decoder/opencl/dequantize_cl.h"
int vp8_cl_mb_prep(MACROBLOCKD *x, int flags){
int err;
if (cl_initialized != CL_SUCCESS){
return cl_initialized;
}
//Copy all blockd.cl_*_mem objects
if (flags & DIFF)
VP8_CL_SET_BUF(x->cl_commands, x->cl_diff_mem, sizeof(cl_short)*400, x->diff,
,err
);
if (flags & PREDICTOR)
VP8_CL_SET_BUF(x->cl_commands, x->cl_predictor_mem, sizeof(cl_uchar)*384, x->predictor,
,err
);
if (flags & QCOEFF)
VP8_CL_SET_BUF(x->cl_commands, x->cl_qcoeff_mem, sizeof(cl_short)*400, x->qcoeff,
,err
);
if (flags & DQCOEFF)
VP8_CL_SET_BUF(x->cl_commands, x->cl_dqcoeff_mem, sizeof(cl_short)*400, x->dqcoeff,
,err
);
if (flags & EOBS)
VP8_CL_SET_BUF(x->cl_commands, x->cl_eobs_mem, sizeof(cl_char)*25, x->eobs,
,err
);
if (flags & PRE_BUF){
VP8_CL_SET_BUF(x->cl_commands, x->pre.buffer_mem, x->pre.buffer_size, x->pre.buffer_alloc,
,err
);
}
if (flags & DST_BUF){
VP8_CL_SET_BUF(x->cl_commands, x->dst.buffer_mem, x->dst.buffer_size, x->dst.buffer_alloc,
,err
);
}
return CL_SUCCESS;
}
int vp8_cl_mb_finish(MACROBLOCKD *x, int flags){
int err;
if (cl_initialized != CL_SUCCESS){
return cl_initialized;
}
if (flags & DIFF){
err = clEnqueueReadBuffer(x->cl_commands, x->cl_diff_mem, CL_FALSE, 0, sizeof(cl_short)*400, x->diff, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
if (flags & PREDICTOR){
err = clEnqueueReadBuffer(x->cl_commands, x->cl_predictor_mem, CL_FALSE, 0, sizeof(cl_uchar)*384, x->predictor, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
if (flags & QCOEFF){
err = clEnqueueReadBuffer(x->cl_commands, x->cl_qcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, x->qcoeff, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
if (flags & DQCOEFF){
err = clEnqueueReadBuffer(x->cl_commands, x->cl_dqcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, x->dqcoeff, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
if (flags & EOBS){
err = clEnqueueReadBuffer(x->cl_commands, x->cl_eobs_mem, CL_FALSE, 0, sizeof(cl_char)*25, x->eobs, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
if (flags & PRE_BUF){
err = clEnqueueReadBuffer(x->cl_commands, x->pre.buffer_mem, CL_FALSE,
0, x->pre.buffer_size, x->pre.buffer_alloc, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
if (flags & DST_BUF){
err = clEnqueueReadBuffer(x->cl_commands, x->dst.buffer_mem, CL_FALSE,
0, x->dst.buffer_size, x->dst.buffer_alloc, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( x->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
return CL_SUCCESS;
}
int vp8_cl_block_prep(BLOCKD *b, int flags){
int err;
if (cl_initialized != CL_SUCCESS){
return cl_initialized;
}
//Copy all blockd.cl_*_mem objects
if (flags & DIFF)
VP8_CL_SET_BUF(b->cl_commands, b->cl_diff_mem, sizeof(cl_short)*400, b->diff_base,
,err
);
if (flags & PREDICTOR)
VP8_CL_SET_BUF(b->cl_commands, b->cl_predictor_mem, sizeof(cl_uchar)*384, b->predictor_base,
,err
);
if (flags & QCOEFF)
VP8_CL_SET_BUF(b->cl_commands, b->cl_qcoeff_mem, sizeof(cl_short)*400, b->qcoeff_base,
,err
);
if (flags & DQCOEFF)
VP8_CL_SET_BUF(b->cl_commands, b->cl_dqcoeff_mem, sizeof(cl_short)*400, b->dqcoeff_base,
,err
);
if (flags & EOBS)
VP8_CL_SET_BUF(b->cl_commands, b->cl_eobs_mem, sizeof(cl_char)*25, b->eobs_base,
,err
);
if (flags & DEQUANT)
VP8_CL_SET_BUF(b->cl_commands, b->cl_dequant_mem, sizeof(cl_short)*16 ,b->dequant,
,err
);
return CL_SUCCESS;
}
int vp8_cl_block_finish(BLOCKD *b, int flags){
int err;
if (cl_initialized != CL_SUCCESS){
return cl_initialized;
}
if (flags & DIFF){
err = clEnqueueReadBuffer(b->cl_commands, b->cl_diff_mem, CL_FALSE, 0, sizeof(cl_short)*400, b->diff_base, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
if (flags & PREDICTOR){
err = clEnqueueReadBuffer(b->cl_commands, b->cl_predictor_mem, CL_FALSE, 0, sizeof(cl_uchar)*384, b->predictor_base, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
if (flags & QCOEFF){
err = clEnqueueReadBuffer(b->cl_commands, b->cl_qcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, b->qcoeff_base, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
if (flags & DQCOEFF){
err = clEnqueueReadBuffer(b->cl_commands, b->cl_dqcoeff_mem, CL_FALSE, 0, sizeof(cl_short)*400, b->dqcoeff_base, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
if (flags & EOBS){
err = clEnqueueReadBuffer(b->cl_commands, b->cl_eobs_mem, CL_FALSE, 0, sizeof(cl_char)*25, b->eobs_base, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
if (flags & DEQUANT){
err = clEnqueueReadBuffer(b->cl_commands, b->cl_dequant_mem, CL_FALSE, 0, sizeof(cl_short)*16 ,b->dequant, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to read from GPU!\n",
, err
);
}
return CL_SUCCESS;
}

View File

@@ -0,0 +1,64 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef BLOCKD_OPENCL_H
#define BLOCKD_OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#include "vp8_opencl.h"
#include "../blockd.h"
#define DIFF 0x0001
#define PREDICTOR 0x0002
#define QCOEFF 0x0004
#define DQCOEFF 0x0008
#define EOBS 0x0010
#define DEQUANT 0x0020
#define PRE_BUF 0x0040
#define DST_BUF 0x0080
#define BLOCK_COPY_ALL 0xffff
/*
#define BLOCK_MEM_SIZE 6
enum {
DIFF_MEM = 0,
PRED_MEM = 1,
QCOEFF_MEM = 2,
DQCOEFF_MEM = 3,
EOBS_MEM = 4,
DEQUANT_MEM = 5
} BLOCK_MEM_TYPES;
struct cl_block_mem{
cl_mem gpu_mem;
size_t size;
void *host_mem;
};
typedef struct cl_block_mem block_mem;
*/
extern int vp8_cl_block_finish(BLOCKD *b, int flags);
extern int vp8_cl_block_prep(BLOCKD *b, int flags);
extern int vp8_cl_mb_prep(MACROBLOCKD *x, int flags);
extern int vp8_cl_mb_finish(MACROBLOCKD *x, int flags);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,106 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vp8_opencl.h"
#include <stdio.h>
CL_FUNCTIONS cl;
void *dll = NULL;
int cl_loaded = VP8_CL_NOT_INITIALIZED;
int close_cl(){
int ret = dlclose(dll);
if (ret != 0)
fprintf(stderr, "Error closing OpenCL library: %s", dlerror());
return ret;
}
int load_cl(char *lib_name){
//printf("Loading OpenCL library\n");
dll = dlopen(lib_name, RTLD_NOW|RTLD_LOCAL);
if (dll != NULL){
//printf("Found CL library\n");
} else {
//printf("Didn't find CL library\n");
return VP8_CL_TRIED_BUT_FAILED;
}
CL_LOAD_FN("clGetPlatformIDs", cl.getPlatformIDs);
CL_LOAD_FN("clGetPlatformInfo", cl.getPlatformInfo);
CL_LOAD_FN("clGetDeviceIDs", cl.getDeviceIDs);
CL_LOAD_FN("clGetDeviceInfo", cl.getDeviceInfo);
CL_LOAD_FN("clCreateContext", cl.createContext);
// CL_LOAD_FN("clCreateContextFromType", cl.createContextFromType);
// CL_LOAD_FN("clRetainContext", cl.retainContext);
CL_LOAD_FN("clReleaseContext", cl.releaseContext);
// CL_LOAD_FN("clGetContextInfo", cl.getContextInfo);
CL_LOAD_FN("clCreateCommandQueue", cl.createCommandQueue);
// CL_LOAD_FN("clRetainCommandQueue", cl.retainCommandQueue);
CL_LOAD_FN("clReleaseCommandQueue", cl.releaseCommandQueue);
// CL_LOAD_FN("clGetCommandQueueInfo", cl.getCommandQueue);
CL_LOAD_FN("clCreateBuffer", cl.createBuffer);
// CL_LOAD_FN("clCreateImage2D", cl.createImage2D);
// CL_LOAD_FN("clCreateImage3D", cl.createImage3D);
// CL_LOAD_FN("clRetainMemObject", cl.retainMemObject);
CL_LOAD_FN("clReleaseMemObject", cl.releaseMemObject);
// CL_LOAD_FN("clGetSupportedImageFormats", cl.getSupportedImageFormats);
// CL_LOAD_FN("clGetMemObjectInfo", cl.getMemObjectInfo);
// CL_LOAD_FN("clGetImageInfo", cl.getImageInfo);
// CL_LOAD_FN("clCreateSampler", cl.createSampler);
// CL_LOAD_FN("clRetainSampler", cl.retainSampler);
// CL_LOAD_FN("clReleaseSampler", cl.releaseSampler);
// CL_LOAD_FN("clGetSamplerInfo", cl.getSamplerInfo);
CL_LOAD_FN("clCreateProgramWithSource", cl.createProgramWithSource);
// CL_LOAD_FN("clCreateProgramWithBinary", cl.createProgramWithBinary);
// CL_LOAD_FN("clRetainProgram", cl.retainProgram);
CL_LOAD_FN("clReleaseProgram", cl.releaseProgram);
CL_LOAD_FN("clBuildProgram", cl.buildProgram);
// CL_LOAD_FN("clUnloadCompiler", cl.unloadCompiler);
CL_LOAD_FN("clGetProgramInfo", cl.getProgramInfo);
CL_LOAD_FN("clGetProgramBuildInfo", cl.getProgramBuildInfo);
CL_LOAD_FN("clCreateKernel", cl.createKernel);
// CL_LOAD_FN("clCreateKernelsInProgram", cl.createKernelsInProgram);
// CL_LOAD_FN("clRetainKernel", cl.retainKernel);
CL_LOAD_FN("clReleaseKernel", cl.releaseKernel);
CL_LOAD_FN("clSetKernelArg", cl.setKernelArg);
// CL_LOAD_FN("clGetKernelInfo", cl.getKernelInfo);
CL_LOAD_FN("clGetKernelWorkGroupInfo", cl.getKernelWorkGroupInfo);
// CL_LOAD_FN("clWaitForEvents", cl.waitForEvents);
// CL_LOAD_FN("clGetEventInfo", cl.getEventInfo);
// CL_LOAD_FN("clRetainEvent", cl.retainEvent);
// CL_LOAD_FN("clReleaseEvent", cl.releaseEvent);
// CL_LOAD_FN("clGetEventProfilingInfo", cl.getEventProfilingInfo);
CL_LOAD_FN("clFlush", cl.flush);
CL_LOAD_FN("clFinish", cl.finish);
CL_LOAD_FN("clEnqueueReadBuffer", cl.enqueueReadBuffer);
CL_LOAD_FN("clEnqueueWriteBuffer", cl.enqueueWriteBuffer);
CL_LOAD_FN("clEnqueueCopyBuffer", cl.enqueueCopyBuffer);
// CL_LOAD_FN("clEnqueueReadImage", cl.enqueueReadImage);
// CL_LOAD_FN("clEnqueueWriteImage", cl.enqueueWriteImage);
// CL_LOAD_FN("clEnqueueCopyImage", cl.enqueueCopyImage);
// CL_LOAD_FN("clEnqueueCopyImageToBuffer", cl.enqueueCopyImageToBuffer);
// CL_LOAD_FN("clEnqueueCopyBufferToImage", cl.enqueueCopyBufferToImage);
// CL_LOAD_FN("clEnqueueMapBuffer", cl.enqueueMapBuffer);
// CL_LOAD_FN("clEnqueueMapImage", cl.enqueueMapImage);
// CL_LOAD_FN("clEnqueueUnmapMemObject", cl.enqueueUnmapMemObject);
CL_LOAD_FN("clEnqueueNDRangeKernel", cl.enqueueNDRAngeKernel);
// CL_LOAD_FN("clEnqueueTask", cl.enqueueTask);
// CL_LOAD_FN("clEnqueueNativeKernel", cl.enqueueNativeKernel);
// CL_LOAD_FN("clEnqueueMarker", cl.enqueueMarker);
// CL_LOAD_FN("clEnqueueWaitForEvents", cl.enqueueWaitForEvents);
CL_LOAD_FN("clEnqueueBarrier", cl.enqueueBarrier);
// CL_LOAD_FN("clGetExtensionFunctionAddress", cl.getExtensionFunctionAddress);
return CL_SUCCESS;
}

View File

@@ -0,0 +1,253 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef DYNAMIC_CL_H
#define DYNAMIC_CL_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include <dlfcn.h>
int load_cl(char *lib_name);
int close_cl();
extern int cl_loaded;
typedef cl_int(*fn_clGetPlatformIDs_t)(cl_uint, cl_platform_id *, cl_uint *);
typedef cl_int(*fn_clGetPlatformInfo_t)(cl_platform_id, cl_platform_info, size_t, void *, size_t *);
typedef cl_int(*fn_clGetDeviceIDs_t)(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *);
typedef cl_int(*fn_clGetDeviceInfo_t)(cl_device_id, cl_device_info, size_t, void *, size_t *);
typedef cl_context(*fn_clCreateContext_t)(const cl_context_properties *, cl_uint, const cl_device_id *, void (*pfn_notify)(const char *, const void *, size_t, void *), void *, cl_int *);
typedef cl_context(*fn_clCreateContextFromType_t)(const cl_context_properties *, cl_device_type, void (*pfn_notify)(const char *, const void *, size_t, void *), void *, cl_int *);
typedef cl_int(*fn_clRetainContext_t)(cl_context);
typedef cl_int(*fn_clReleaseContext_t)(cl_context);
typedef cl_int(*fn_clGetContextInfo_t)(cl_context, cl_context_info, size_t, void *, size_t *);
typedef cl_command_queue(*fn_clCreateCommandQueue_t)(cl_context, cl_device_id, cl_command_queue_properties, cl_int *);
typedef cl_int(*fn_clRetainCommandQueue_t)(cl_command_queue);
typedef cl_int(*fn_clReleaseCommandQueue_t)(cl_command_queue);
typedef cl_int(*fn_clGetCommandQueueInfo_t)(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *);
typedef cl_mem(*fn_clCreateBuffer_t)(cl_context, cl_mem_flags, size_t, void *, cl_int *);
typedef cl_mem(*fn_clCreateImage2D_t)(cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, void *, cl_int *);
typedef cl_mem(*fn_clCreateImage3D_t)(cl_context, cl_mem_flags, const cl_image_format *, size_t, size_t, size_t, size_t, size_t, void *, cl_int *);
typedef cl_int(*fn_clRetainMemObject_t)(cl_mem);
typedef cl_int(*fn_clReleaseMemObject_t)(cl_mem);
typedef cl_int(*fn_clGetSupportedImageFormats_t)(cl_context, cl_mem_flags, cl_mem_object_type, cl_uint, cl_image_format *, cl_uint *);
typedef cl_int(*fn_clGetMemObjectInfo_t)(cl_mem, cl_mem_info, size_t, void *, size_t *);
typedef cl_int(*fn_clGetImageInfo_t)(cl_mem, cl_image_info, size_t, void *, size_t *);
typedef cl_sampler(*fn_clCreateSampler_t)(cl_context, cl_bool, cl_addressing_mode, cl_filter_mode, cl_int *);
typedef cl_int(*fn_clRetainSampler_t)(cl_sampler);
typedef cl_int(*fn_clReleaseSampler_t)(cl_sampler);
typedef cl_int(*fn_clGetSamplerInfo_t)(cl_sampler, cl_sampler_info, size_t, void *, size_t *);
typedef cl_program(*fn_clCreateProgramWithSource_t)(cl_context, cl_uint, const char **, const size_t *, cl_int *);
typedef cl_program(*fn_clCreateProgramWithBinary_t)(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *);
typedef cl_int(*fn_clRetainProgram_t)(cl_program);
typedef cl_int(*fn_clReleaseProgram_t)(cl_program);
typedef cl_int(*fn_clBuildProgram_t)(cl_program, cl_uint, const cl_device_id *, const char *, void (*pfn_notify)(cl_program,void*), void *);
typedef cl_int(*fn_clUnloadCompiler_t)(void);
typedef cl_int(*fn_clGetProgramInfo_t)(cl_program, cl_program_info, size_t, void *, size_t *);
typedef cl_int(*fn_clGetProgramBuildInfo_t)(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *);
typedef cl_kernel(*fn_clCreateKernel_t)(cl_program, const char *, cl_int *);
typedef cl_int(*fn_clCreateKernelsInProgram_t)(cl_program, cl_uint, cl_kernel *, cl_uint *);
typedef cl_int(*fn_clRetainKernel_t)(cl_kernel);
typedef cl_int(*fn_clReleaseKernel_t)(cl_kernel);
typedef cl_int(*fn_clSetKernelArg_t)(cl_kernel, cl_uint, size_t, const void *);
typedef cl_int(*fn_clGetKernelInfo_t)(cl_kernel, cl_kernel_info, size_t, void *, size_t *);
typedef cl_int(*fn_clGetKernelWorkGroupInfo_t)(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *);
typedef cl_int(*fn_clWaitForEvents_t)(cl_uint, const cl_event *);
typedef cl_int(*fn_clGetEventInfo_t)(cl_event, cl_event_info, size_t, void *, size_t *);
typedef cl_int(*fn_clRetainEvent_t)(cl_event);
typedef cl_int(*fn_clReleaseEvent_t)(cl_event);
typedef cl_int(*fn_clGetEventProfilingInfo_t)(cl_event, cl_profiling_info, size_t, void *, size_t *);
typedef cl_int(*fn_clFlush_t)(cl_command_queue);
typedef cl_int(*fn_clFinish_t)(cl_command_queue);
typedef cl_int(*fn_clEnqueueReadBuffer_t)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
typedef cl_int(*fn_clEnqueueWriteBuffer_t)(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
typedef cl_int(*fn_clEnqueueCopyBuffer_t)(cl_command_queue, cl_mem, cl_mem, size_t, size_t, size_t, cl_uint, const cl_event *, cl_event *);
typedef cl_int(*fn_clEnqueueReadImage_t)(cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *);
typedef cl_int(*fn_clEnqueueWriteImage_t)(cl_command_queue, cl_mem, cl_bool, const size_t *, const size_t *, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *);
typedef cl_int(*fn_clEnqueueCopyImage_t)(cl_command_queue, cl_mem, cl_mem, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
typedef cl_int(*fn_clEnqueueCopyImageToBuffer_t)(cl_command_queue, cl_mem, cl_mem, const size_t *, const size_t *, size_t, cl_uint, const cl_event *, cl_event *);
typedef cl_int(*fn_clEnqueueCopyBufferToImage_t)(cl_command_queue, cl_mem, cl_mem, size_t, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
typedef void*(*fn_clEnqueueMapBuffer_t)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, size_t, size_t, cl_uint, const cl_event *, cl_event *, cl_int *);
typedef void*(*fn_clEnqueueMapImage_t)(cl_command_queue, cl_mem, cl_bool, cl_map_flags, const size_t *, const size_t *, size_t *, size_t *, cl_uint, const cl_event *, cl_event *, cl_int *);
typedef cl_int(*fn_clEnqueueUnmapMemObject_t)(cl_command_queue, cl_mem, void *, cl_uint, const cl_event *, cl_event *);
typedef cl_int(*fn_clEnqueueNDRangeKernel_t)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *);
typedef cl_int(*fn_clEnqueueTask_t)(cl_command_queue, cl_kernel, cl_uint, const cl_event *, cl_event *);
typedef cl_int(*fn_clEnqueueNativeKernel_t)(cl_command_queue, void (*user_func)(void *), void *, size_t, cl_uint, const cl_mem *, const void **, cl_uint, const cl_event *, cl_event *);
typedef cl_int(*fn_clEnqueueMarker_t)(cl_command_queue, cl_event *);
typedef cl_int(*fn_clEnqueueWaitForEvents_t)(cl_command_queue, cl_uint, const cl_event *);
typedef cl_int(*fn_clEnqueueBarrier_t)(cl_command_queue);
typedef void*(*fn_clGetExtensionFunctionAddress_t)(const char *);
typedef struct CL_FUNCTIONS {
fn_clGetPlatformIDs_t getPlatformIDs;
fn_clGetPlatformInfo_t getPlatformInfo;
fn_clGetDeviceIDs_t getDeviceIDs;
fn_clGetDeviceInfo_t getDeviceInfo;
fn_clCreateContext_t createContext;
fn_clCreateContextFromType_t createContextFromType;
fn_clRetainContext_t retainContext;
fn_clReleaseContext_t releaseContext;
fn_clGetContextInfo_t getContextInfo;
fn_clCreateCommandQueue_t createCommandQueue;
fn_clRetainCommandQueue_t retainCommandQueue;
fn_clReleaseCommandQueue_t releaseCommandQueue;
fn_clGetCommandQueueInfo_t getCommandQueue;
fn_clCreateBuffer_t createBuffer;
fn_clCreateImage2D_t createImage2D;
fn_clCreateImage3D_t createImage3D;
fn_clRetainMemObject_t retainMemObject;
fn_clReleaseMemObject_t releaseMemObject;
fn_clGetSupportedImageFormats_t getSupportedImageFormats;
fn_clGetMemObjectInfo_t getMemObjectInfo;
fn_clGetImageInfo_t getImageInfo;
fn_clCreateSampler_t createSampler;
fn_clRetainSampler_t retainSampler;
fn_clReleaseSampler_t releaseSampler;
fn_clGetSamplerInfo_t getSamplerInfo;
fn_clCreateProgramWithSource_t createProgramWithSource;
fn_clCreateProgramWithBinary_t createProgramWithBinary;
fn_clRetainProgram_t retainProgram;
fn_clReleaseProgram_t releaseProgram;
fn_clBuildProgram_t buildProgram;
fn_clUnloadCompiler_t unloadCompiler;
fn_clGetProgramInfo_t getProgramInfo;
fn_clGetProgramBuildInfo_t getProgramBuildInfo;
fn_clCreateKernel_t createKernel;
fn_clCreateKernelsInProgram_t createKernelsInProgram;
fn_clRetainKernel_t retainKernel;
fn_clReleaseKernel_t releaseKernel;
fn_clSetKernelArg_t setKernelArg;
fn_clGetKernelInfo_t getKernelInfo;
fn_clGetKernelWorkGroupInfo_t getKernelWorkGroupInfo;
fn_clWaitForEvents_t waitForEvents;
fn_clGetEventInfo_t getEventInfo;
fn_clRetainEvent_t retainEvent;
fn_clReleaseEvent_t releaseEvent;
fn_clGetEventProfilingInfo_t getEventProfilingInfo;
fn_clFlush_t flush;
fn_clFinish_t finish;
fn_clEnqueueReadBuffer_t enqueueReadBuffer;
fn_clEnqueueWriteBuffer_t enqueueWriteBuffer;
fn_clEnqueueCopyBuffer_t enqueueCopyBuffer;
fn_clEnqueueReadImage_t enqueueReadImage;
fn_clEnqueueWriteImage_t enqueueWriteImage;
fn_clEnqueueCopyImage_t enqueueCopyImage;
fn_clEnqueueCopyImageToBuffer_t enqueueCopyImageToBuffer;
fn_clEnqueueCopyBufferToImage_t enqueueCopyBufferToImage;
fn_clEnqueueMapBuffer_t enqueueMapBuffer;
fn_clEnqueueMapImage_t enqueueMapImage;
fn_clEnqueueUnmapMemObject_t enqueueUnmapMemObject;
fn_clEnqueueNDRangeKernel_t enqueueNDRAngeKernel;
fn_clEnqueueTask_t enqueueTask;
fn_clEnqueueNativeKernel_t enqueueNativeKernel;
fn_clEnqueueMarker_t enqueueMarker;
fn_clEnqueueWaitForEvents_t enqueueWaitForEvents;
fn_clEnqueueBarrier_t enqueueBarrier;
fn_clGetExtensionFunctionAddress_t getExtensionFunctionAddress;
} CL_FUNCTIONS;
extern CL_FUNCTIONS cl;
#define clGetPlatformIDs cl.getPlatformIDs
#define clGetPlatformInfo cl.getPlatformInfo
#define clGetDeviceIDs cl.getDeviceIDs
#define clGetDeviceInfo cl.getDeviceInfo
#define clCreateContext cl.createContext
#define clCreateContextFromType cl.createContextFromType
#define clRetainContext cl.retainContext
#define clReleaseContext cl.releaseContext
#define clGetContextInfo cl.getContextInfo
#define clCreateCommandQueue cl.createCommandQueue
#define clRetainCommandQueue cl.retainCommandQueue
#define clReleaseCommandQueue cl.releaseCommandQueue
#define clGetCommandQueueInfo cl.getCommandQueue
#define clCreateBuffer cl.createBuffer
#define clCreateSubBuffer cl.createSubBuffer
#define clCreateImage2D cl.createImage2D
#define clCreateImage3D cl.createImage3D
#define clRetainMemObject cl.retainMemObject
#define clReleaseMemObject cl.releaseMemObject
#define clGetSupportedImageFormats cl.getSupportedImageFormats
#define clGetMemObjectInfo cl.getMemObjectInfo
#define clGetImageInfo cl.getImageInfo
#define clSetMemObjectDestructorCallback cl.setMemObjectDestructorCallback
#define clCreateSampler cl.createSampler
#define clRetainSampler cl.retainSampler
#define clReleaseSampler cl.releaseSampler
#define clGetSamplerInfo cl.getSamplerInfo
#define clCreateProgramWithSource cl.createProgramWithSource
#define clCreateProgramWithBinary cl.createProgramWithBinary
#define clRetainProgram cl.retainProgram
#define clReleaseProgram cl.releaseProgram
#define clBuildProgram cl.buildProgram
#define clUnloadCompiler cl.unloadCompiler
#define clGetProgramInfo cl.getProgramInfo
#define clGetProgramBuildInfo cl.getProgramBuildInfo
#define clCreateKernel cl.createKernel
#define clCreateKernelsInProgram cl.createKernelsInProgram
#define clRetainKernel cl.retainKernel
#define clReleaseKernel cl.releaseKernel
#define clSetKernelArg cl.setKernelArg
#define clGetKernelInfo cl.getKernelInfo
#define clGetKernelWorkGroupInfo cl.getKernelWorkGroupInfo
#define clWaitForEvents cl.waitForEvents
#define clGetEventInfo cl.getEventInfo
#define clCreateUserEvent cl.createUserEvent
#define clRetainEvent cl.retainEvent
#define clReleaseEvent cl.releaseEvent
#define clSetUserEventStatus cl.setUserEventStatus
#define clSetEventCallback cl.setEventCallback
#define clGetEventProfilingInfo cl.getEventProfilingInfo
#define clFlush cl.flush
#define clFinish cl.finish
#define clEnqueueReadBuffer cl.enqueueReadBuffer
#define clEnqueueReadBufferRect cl.enqueueReadBufferRect
#define clEnqueueWriteBuffer cl.enqueueWriteBuffer
#define clEnqueueWriteBufferRect cl.enqueueWriteBufferRect
#define clEnqueueCopyBuffer cl.enqueueCopyBuffer
#define clEnqueueCopyBufferRect cl.enqueueCopyBufferRect
#define clEnqueueReadImage cl.enqueueReadImage
#define clEnqueueWriteImage cl.enqueueWriteImage
#define clEnqueueCopyImage cl.enqueueCopyImage
#define clEnqueueCopyImageToBuffer cl.enqueueCopyImageToBuffer
#define clEnqueueCopyBufferToImage cl.enqueueCopyBufferToImage
#define clEnqueueMapBuffer cl.enqueueMapBuffer
#define clEnqueueMapImage cl.enqueueMapImage
#define clEnqueueUnmapMemObject cl.enqueueUnmapMemObject
#define clEnqueueNDRangeKernel cl.enqueueNDRAngeKernel
#define clEnqueueTask cl.enqueueTask
#define clEnqueueNativeKernel cl.enqueueNativeKernel
#define clEnqueueMarker cl.enqueueMarker
#define clEnqueueWaitForEvents cl.enqueueWaitForEvents
#define clEnqueueBarrier cl.enqueueBarrier
#define clGetExtensionFunctionAddress cl.getExtensionFunctionAddress
#define CL_LOAD_FN(name, ref) \
ref = dlsym(dll,name); \
if (ref == NULL){ \
dlclose(dll); \
return CL_INVALID_PLATFORM; \
}
#ifdef __cplusplus
}
#endif
#endif /* DYNAMIC_CL_H */

View File

@@ -0,0 +1,824 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
//ACW: Remove me after debugging.
#include <stdio.h>
#include <string.h>
#include "vp8_opencl.h"
#include "filter_cl.h"
#include "../blockd.h"
#define SIXTAP_FILTER_LEN 6
const char *filterCompileOptions = "-Ivp8/common/opencl -DVP8_FILTER_WEIGHT=128 -DVP8_FILTER_SHIFT=7 -DFILTER_OFFSET";
const char *filter_cl_file_name = "vp8/common/opencl/filter_cl.cl";
#define STATIC_MEM 1
#if STATIC_MEM
static cl_mem int_mem = NULL;
#endif
void cl_destroy_filter(){
if (cl_data.filter_program)
clReleaseProgram(cl_data.filter_program);
//VP8_CL_RELEASE_KERNEL(cl_data.vp8_block_variation_kernel);
#if !TWO_PASS_SIXTAP
VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict8x8_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict8x4_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_sixtap_predict16x16_kernel);
#else
VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_first_pass_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_second_pass_kernel);
#endif
//VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict4x4_kernel);
//VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict8x4_kernel);
//VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict8x8_kernel);
//VP8_CL_RELEASE_KERNEL(cl_data.vp8_bilinear_predict16x16_kernel);
#if MEM_COPY_KERNEL
VP8_CL_RELEASE_KERNEL(cl_data.vp8_memcpy_kernel);
#endif
VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_bil_first_pass_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_filter_block2d_bil_second_pass_kernel);
#if STATIC_MEM
if (int_mem != NULL)
clReleaseMemObject(int_mem);
int_mem = NULL;
#endif
cl_data.filter_program = NULL;
}
int cl_init_filter() {
int err;
// Create the filter compute program from the file-defined source code
if ( cl_load_program(&cl_data.filter_program, filter_cl_file_name,
filterCompileOptions) != CL_SUCCESS )
return VP8_CL_TRIED_BUT_FAILED;
// Create the compute kernel in the program we wish to run
#if TWO_PASS_SIXTAP
VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_first_pass_kernel,"vp8_filter_block2d_first_pass_kernel");
VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_second_pass_kernel,"vp8_filter_block2d_second_pass_kernel");
VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_first_pass_kernel,vp8_filter_block2d_first_pass_kernel_size);
VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_second_pass_kernel,vp8_filter_block2d_second_pass_kernel_size);
#else
VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict_kernel,"vp8_sixtap_predict_kernel");
VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict_kernel,vp8_sixtap_predict_kernel_size);
VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict8x8_kernel,"vp8_sixtap_predict8x8_kernel");
VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict8x8_kernel,vp8_sixtap_predict8x8_kernel_size);
VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict8x4_kernel,"vp8_sixtap_predict8x4_kernel");
VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict8x4_kernel,vp8_sixtap_predict8x4_kernel_size);
VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_sixtap_predict16x16_kernel,"vp8_sixtap_predict16x16_kernel");
VP8_CL_CALC_LOCAL_SIZE(vp8_sixtap_predict16x16_kernel,vp8_sixtap_predict16x16_kernel_size);
#endif
//VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_bil_first_pass_kernel,vp8_filter_block2d_bil_first_pass_kernel_size);
//VP8_CL_CALC_LOCAL_SIZE(vp8_filter_block2d_bil_second_pass_kernel,vp8_filter_block2d_bil_second_pass_kernel_size);
VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_bil_first_pass_kernel,"vp8_filter_block2d_bil_first_pass_kernel");
VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_filter_block2d_bil_second_pass_kernel,"vp8_filter_block2d_bil_second_pass_kernel");
//VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict4x4_kernel,"vp8_bilinear_predict4x4_kernel");
//VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict8x4_kernel,"vp8_bilinear_predict8x4_kernel");
//VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict8x8_kernel,"vp8_bilinear_predict8x8_kernel");
//VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_bilinear_predict16x16_kernel,"vp8_bilinear_predict16x16_kernel");
#if MEM_COPY_KERNEL
VP8_CL_CREATE_KERNEL(cl_data,filter_program,vp8_memcpy_kernel,"vp8_memcpy_kernel");
VP8_CL_CALC_LOCAL_SIZE(vp8_memcpy_kernel,vp8_memcpy_kernel_size);
#endif
#if STATIC_MEM
VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,err);
#endif
return CL_SUCCESS;
}
void vp8_filter_block2d_first_pass_cl(
cl_command_queue cq,
cl_mem src_mem,
int src_offset,
cl_mem int_mem,
unsigned int src_pixels_per_line,
unsigned int int_height,
unsigned int int_width,
int xoffset
){
int err;
size_t global = int_width*int_height;
size_t local = cl_data.vp8_filter_block2d_first_pass_kernel_size;
if (local > global)
local = global;
err = clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 0, sizeof (cl_mem), &src_mem);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 1, sizeof (int), &src_offset);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 2, sizeof (cl_mem), &int_mem);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 3, sizeof (cl_uint), &src_pixels_per_line);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 4, sizeof (cl_uint), &int_height);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 5, sizeof (cl_int), &int_width);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_first_pass_kernel, 6, sizeof (int), &xoffset);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
,
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_first_pass_kernel, 1, NULL, &global, &local , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);,
);
}
void vp8_filter_block2d_second_pass_cl(
cl_command_queue cq,
cl_mem int_mem,
int int_offset,
cl_mem dst_mem,
int dst_offset,
int dst_pitch,
unsigned int output_height,
unsigned int output_width,
int yoffset
){
int err;
size_t global = output_width*output_height;
size_t local = cl_data.vp8_filter_block2d_second_pass_kernel_size;
if (local > global){
//printf("Local is now %ld\n",global);
local = global;
}
/* Set kernel arguments */
err = clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 0, sizeof (cl_mem), &int_mem);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 1, sizeof (int), &int_offset);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 2, sizeof (cl_mem), &dst_mem);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 3, sizeof (int), &dst_offset);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 4, sizeof (int), &dst_pitch);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 5, sizeof (int), &output_width);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 6, sizeof (int), &output_width);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 7, sizeof (int), &output_height);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 8, sizeof (int), &output_width);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_second_pass_kernel, 9, sizeof (int), &yoffset);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
,
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_second_pass_kernel, 1, NULL, &global, &local , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);,
);
}
void vp8_sixtap_single_pass(
cl_command_queue cq,
cl_kernel kernel,
size_t local,
size_t global,
cl_mem src_mem,
cl_mem dst_mem,
unsigned char *src_base,
int src_offset,
size_t src_len,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_base,
int dst_offset,
int dst_pitch,
size_t dst_len
){
int err;
#if !STATIC_MEM
cl_mem int_mem;
#endif
int free_src = 0, free_dst = 0;
if (local > global){
local = global;
}
/* Make space for kernel input/output data.
* Initialize the buffer as well if needed.
*/
if (src_mem == NULL){
VP8_CL_CREATE_BUF( cq, src_mem,, sizeof (unsigned char) * src_len, src_base-2,,);
src_offset = 2;
free_src = 1;
} else {
src_offset -= 2*src_pixels_per_line;
}
if (dst_mem == NULL){
VP8_CL_CREATE_BUF( cq, dst_mem,, sizeof (unsigned char) * dst_len + dst_offset, dst_base,, );
free_dst = 1;
}
#if !STATIC_MEM
CL_CREATE_BUF( cq, int_mem,, sizeof(cl_int)*FData_height*FData_width, NULL,, );
#endif
err = clSetKernelArg(kernel, 0, sizeof (cl_mem), &src_mem);
err |= clSetKernelArg(kernel, 1, sizeof (int), &src_offset);
err |= clSetKernelArg(kernel, 2, sizeof (cl_int), &src_pixels_per_line);
err |= clSetKernelArg(kernel, 3, sizeof (cl_int), &xoffset);
err |= clSetKernelArg(kernel, 4, sizeof (cl_int), &yoffset);
err |= clSetKernelArg(kernel, 5, sizeof (cl_mem), &dst_mem);
err |= clSetKernelArg(kernel, 6, sizeof (cl_int), &dst_offset);
err |= clSetKernelArg(kernel, 7, sizeof (int), &dst_pitch);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
,
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel( cq, kernel, 1, NULL, &global, &local , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);,
);
if (free_src == 1)
clReleaseMemObject(src_mem);
if (free_dst == 1){
/* Read back the result data from the device */
err = clEnqueueReadBuffer(cq, dst_mem, CL_FALSE, 0, sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to read output array!\n",
,
);
clReleaseMemObject(dst_mem);
}
}
void vp8_sixtap_run_cl(
cl_command_queue cq,
cl_mem src_mem,
cl_mem dst_mem,
unsigned char *src_base,
int src_offset,
size_t src_len,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_base,
int dst_offset,
int dst_pitch,
size_t dst_len,
unsigned int FData_height,
unsigned int FData_width,
unsigned int output_height,
unsigned int output_width,
int int_offset
)
{
int err;
#if !STATIC_MEM
cl_mem int_mem;
#endif
int free_src = 0, free_dst = 0;
/* Make space for kernel input/output data.
* Initialize the buffer as well if needed.
*/
if (src_mem == NULL){
VP8_CL_CREATE_BUF( cq, src_mem,, sizeof (unsigned char) * src_len, src_base-2,,);
src_offset = 2;
free_src = 1;
} else {
src_offset -= 2*src_pixels_per_line;
}
if (dst_mem == NULL){
VP8_CL_CREATE_BUF( cq, dst_mem,, sizeof (unsigned char) * dst_len + dst_offset, dst_base,, );
free_dst = 1;
}
#if !STATIC_MEM
CL_CREATE_BUF( cq, int_mem,, sizeof(cl_int)*FData_height*FData_width, NULL,, );
#endif
vp8_filter_block2d_first_pass_cl(
cq, src_mem, src_offset, int_mem, src_pixels_per_line,
FData_height, FData_width, xoffset
);
vp8_filter_block2d_second_pass_cl(cq,int_mem,int_offset,dst_mem,dst_offset,dst_pitch,
output_height,output_width,yoffset);
if (free_src == 1)
clReleaseMemObject(src_mem);
if (free_dst == 1){
/* Read back the result data from the device */
err = clEnqueueReadBuffer(cq, dst_mem, CL_FALSE, 0, sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to read output array!\n",
,
);
clReleaseMemObject(dst_mem);
}
#if !STATIC_MEM
clReleaseMemObject(int_mem);
#endif
}
void vp8_sixtap_predict4x4_cl
(
cl_command_queue cq,
unsigned char *src_base,
cl_mem src_mem,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_base,
cl_mem dst_mem,
int dst_offset,
int dst_pitch
) {
int output_width=4, output_height=4, FData_height=9, FData_width=4;
//Size of output to transfer
int dst_len = DST_LEN(dst_pitch,output_height,output_width);
int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line);
#if TWO_PASS_SIXTAP
int int_offset = 8;
unsigned char *src_ptr = src_base + src_offset;
vp8_sixtap_run_cl(cq, src_mem, dst_mem,
(src_ptr-2*src_pixels_per_line),src_offset, src_len,
src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset,
dst_pitch,dst_len,FData_height,FData_width,output_height,
output_width,int_offset
);
#else
vp8_sixtap_single_pass(
cq,
cl_data.vp8_sixtap_predict_kernel,
cl_data.vp8_sixtap_predict_kernel_size,
FData_height*FData_width,
src_mem,
dst_mem,
src_base,
src_offset,
src_len,
src_pixels_per_line,
xoffset,
yoffset,
dst_base,
dst_offset,
dst_pitch,
dst_len
);
#endif
return;
}
void vp8_sixtap_predict8x8_cl
(
cl_command_queue cq,
unsigned char *src_base,
cl_mem src_mem,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_base,
cl_mem dst_mem,
int dst_offset,
int dst_pitch
) {
int output_width=8, output_height=8, FData_height=13, FData_width=8;
//Size of output to transfer
int dst_len = DST_LEN(dst_pitch,output_height,output_width);
int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line);
#if TWO_PASS_SIXTAP
int int_offset = 16;
unsigned char *src_ptr = src_base + src_offset;
vp8_sixtap_run_cl(cq, src_mem, dst_mem,
(src_ptr-2*src_pixels_per_line),src_offset, src_len,
src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset,
dst_pitch,dst_len,FData_height,FData_width,output_height,
output_width,int_offset
);
#else
vp8_sixtap_single_pass(
cq,
cl_data.vp8_sixtap_predict8x8_kernel,
cl_data.vp8_sixtap_predict8x8_kernel_size,
FData_height*FData_width,
src_mem,
dst_mem,
src_base,
src_offset,
src_len,
src_pixels_per_line,
xoffset,
yoffset,
dst_base,
dst_offset,
dst_pitch,
dst_len
);
#endif
return;
}
void vp8_sixtap_predict8x4_cl
(
cl_command_queue cq,
unsigned char *src_base,
cl_mem src_mem,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_base,
cl_mem dst_mem,
int dst_offset,
int dst_pitch
) {
int output_width=8, output_height=4, FData_height=9, FData_width=8;
//Size of output to transfer
int dst_len = DST_LEN(dst_pitch,output_height,output_width);
int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line);
#if TWO_PASS_SIXTAP
int int_offset = 16;
unsigned char *src_ptr = src_base + src_offset;
vp8_sixtap_run_cl(cq, src_mem, dst_mem,
(src_ptr-2*src_pixels_per_line),src_offset, src_len,
src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset,
dst_pitch,dst_len,FData_height,FData_width,output_height,
output_width,int_offset
);
#else
vp8_sixtap_single_pass(
cq,
cl_data.vp8_sixtap_predict8x4_kernel,
cl_data.vp8_sixtap_predict8x4_kernel_size,
FData_height*FData_width,
src_mem,
dst_mem,
src_base,
src_offset,
src_len,
src_pixels_per_line,
xoffset,
yoffset,
dst_base,
dst_offset,
dst_pitch,
dst_len
);
#endif
return;
}
void vp8_sixtap_predict16x16_cl
(
cl_command_queue cq,
unsigned char *src_base,
cl_mem src_mem,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_base,
cl_mem dst_mem,
int dst_offset,
int dst_pitch
) {
int output_width=16, output_height=16, FData_height=21, FData_width=16;
//Size of output to transfer
int dst_len = DST_LEN(dst_pitch,output_height,output_width);
int src_len = SIXTAP_SRC_LEN(FData_width,FData_height,src_pixels_per_line);
#if TWO_PASS_SIXTAP
int int_offset = 32;
unsigned char *src_ptr = src_base + src_offset;
vp8_sixtap_run_cl(cq, src_mem, dst_mem,
(src_ptr-2*src_pixels_per_line),src_offset, src_len,
src_pixels_per_line, xoffset,yoffset,dst_base,dst_offset,
dst_pitch,dst_len,FData_height,FData_width,output_height,
output_width,int_offset
);
#else
vp8_sixtap_single_pass(
cq,
cl_data.vp8_sixtap_predict16x16_kernel,
cl_data.vp8_sixtap_predict16x16_kernel_size,
FData_height*FData_width,
src_mem,
dst_mem,
src_base,
src_offset,
src_len,
src_pixels_per_line,
xoffset,
yoffset,
dst_base,
dst_offset,
dst_pitch,
dst_len
);
#endif
return;
}
void vp8_filter_block2d_bil_first_pass_cl(
cl_command_queue cq,
unsigned char *src_base,
cl_mem src_mem,
int src_offset,
cl_mem int_mem,
int src_pixels_per_line,
int height,
int width,
int xoffset
)
{
int err;
size_t global = width*height;
int free_src = 0;
if (src_mem == NULL){
int src_len = BIL_SRC_LEN(width,height,src_pixels_per_line);
/*Make space for kernel input/output data. Initialize the buffer as well if needed. */
VP8_CL_CREATE_BUF(cq, src_mem, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,
sizeof (unsigned char) * src_len, src_base+src_offset,,
);
src_offset = 0; //Set to zero as long as src_mem starts at base+offset
free_src = 1;
}
err = clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 0, sizeof (cl_mem), &src_mem);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 1, sizeof (int), &src_offset);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 2, sizeof (cl_mem), &int_mem);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 3, sizeof (int), &src_pixels_per_line);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 4, sizeof (int), &height);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 5, sizeof (int), &width);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_first_pass_kernel, 6, sizeof (int), &xoffset);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
,
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_bil_first_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);,
);
if (free_src == 1)
clReleaseMemObject(src_mem);
}
void vp8_filter_block2d_bil_second_pass_cl(
cl_command_queue cq,
cl_mem int_mem,
unsigned char *dst_base,
cl_mem dst_mem,
int dst_offset,
int dst_pitch,
int height,
int width,
int yoffset
)
{
int err;
size_t global = width*height;
//Size of output data
int dst_len = DST_LEN(dst_pitch,height,width);
int free_dst = 0;
if (dst_mem == NULL){
VP8_CL_CREATE_BUF(cq, dst_mem, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR,
sizeof (unsigned char) * dst_len + dst_offset, dst_base,,
);
free_dst = 1;
}
err = clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 0, sizeof (cl_mem), &int_mem);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 1, sizeof (cl_mem), &dst_mem);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 2, sizeof (int), &dst_offset);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 3, sizeof (int), &dst_pitch);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 4, sizeof (int), &height);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 5, sizeof (int), &width);
err |= clSetKernelArg(cl_data.vp8_filter_block2d_bil_second_pass_kernel, 6, sizeof (int), &yoffset);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
,
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel( cq, cl_data.vp8_filter_block2d_bil_second_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);,
);
if (free_dst == 1){
/* Read back the result data from the device */
err = clEnqueueReadBuffer(cq, dst_mem, CL_FALSE, 0, sizeof (unsigned char) * dst_len + dst_offset, dst_base, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to read output array!\n",
,
);
clReleaseMemObject(dst_mem);
}
}
void vp8_bilinear_predict4x4_cl
(
cl_command_queue cq,
unsigned char *src_base,
cl_mem src_mem,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_base,
cl_mem dst_mem,
int dst_offset,
int dst_pitch
) {
const int height = 4, width = 4;
#if !STATIC_MEM
int err;
cl_mem int_mem = NULL;
VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,);
#endif
/* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
/* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
#if !STATIC_MEM
clReleaseMemObject(int_mem);
#endif
}
void vp8_bilinear_predict8x8_cl
(
cl_command_queue cq,
unsigned char *src_base,
cl_mem src_mem,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_base,
cl_mem dst_mem,
int dst_offset,
int dst_pitch
) {
const int height = 8, width = 8;
#if !STATIC_MEM
int err;
cl_mem int_mem = NULL;
VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,);
#endif
/* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
/* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
#if !STATIC_MEM
clReleaseMemObject(int_mem);
#endif
}
void vp8_bilinear_predict8x4_cl
(
cl_command_queue cq,
unsigned char *src_base,
cl_mem src_mem,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_base,
cl_mem dst_mem,
int dst_offset,
int dst_pitch
) {
const int height = 4, width = 8;
#if !STATIC_MEM
int err;
cl_mem int_mem = NULL;
VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,);
#endif
/* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
/* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
#if !STATIC_MEM
clReleaseMemObject(int_mem);
#endif
}
void vp8_bilinear_predict16x16_cl
(
cl_command_queue cq,
unsigned char *src_base,
cl_mem src_mem,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
unsigned char *dst_base,
cl_mem dst_mem,
int dst_offset,
int dst_pitch
) {
const int height = 16, width = 16;
#if !STATIC_MEM
int err;
cl_mem int_mem = NULL;
VP8_CL_CREATE_BUF(NULL, int_mem, NULL, sizeof(cl_int)*21*16, NULL, ,);
#endif
/* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass_cl(cq, src_base, src_mem, src_offset, int_mem, src_pixels_per_line, height + 1, width, xoffset);
/* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass_cl(cq, int_mem, dst_base, dst_mem, dst_offset, dst_pitch, height, width, yoffset);
#if !STATIC_MEM
clReleaseMemObject(int_mem);
#endif
}

View File

@@ -0,0 +1,562 @@
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#pragma OPENCL EXTENSION cl_amd_printf : enable
__constant int bilinear_filters[8][2] = {
{ 128, 0},
{ 112, 16},
{ 96, 32},
{ 80, 48},
{ 64, 64},
{ 48, 80},
{ 32, 96},
{ 16, 112}
};
__constant short sub_pel_filters[8][8] = {
//These were originally 8x6, but are padded for vector ops
{ 0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positions are just as per alpha -0.5 bicubic */
{ 0, -6, 123, 12, -1, 0, 0, 0},
{ 2, -11, 108, 36, -8, 1, 0, 0}, /* New 1/4 pel 6 tap filter */
{ 0, -9, 93, 50, -6, 0, 0, 0},
{ 3, -16, 77, 77, -16, 3, 0, 0}, /* New 1/2 pel 6 tap filter */
{ 0, -6, 50, 93, -9, 0, 0, 0},
{ 1, -8, 36, 108, -11, 2, 0, 0}, /* New 1/4 pel 6 tap filter */
{ 0, -1, 12, 123, -6, 0, 0, 0},
};
kernel void vp8_filter_block2d_first_pass_kernel(
__global unsigned char *src_base,
int src_offset,
__global int *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_height,
unsigned int output_width,
int filter_offset
){
uint tid = get_global_id(0);
global unsigned char *src_ptr = &src_base[src_offset];
//Note that src_offset will be reset later, which is why we use it now
int Temp;
__constant short *vp8_filter = sub_pel_filters[filter_offset];
if (tid < (output_width*output_height)){
src_offset = tid + (tid/output_width * (src_pixels_per_line - output_width));
Temp = (int)(src_ptr[src_offset - 2] * vp8_filter[0]) +
(int)(src_ptr[src_offset - 1] * vp8_filter[1]) +
(int)(src_ptr[src_offset] * vp8_filter[2]) +
(int)(src_ptr[src_offset + 1] * vp8_filter[3]) +
(int)(src_ptr[src_offset + 2] * vp8_filter[4]) +
(int)(src_ptr[src_offset + 3] * vp8_filter[5]) +
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
/* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
if (Temp < 0)
Temp = 0;
else if ( Temp > 255 )
Temp = 255;
output_ptr[tid] = Temp;
}
}
kernel void vp8_filter_block2d_second_pass_kernel
(
__global int *src_base,
int src_offset,
__global unsigned char *output_base,
int output_offset,
int output_pitch,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
int filter_offset
) {
uint i = get_global_id(0);
global int *src_ptr = &src_base[src_offset];
global unsigned char *output_ptr = &output_base[output_offset];
int out_offset; //Not same as output_offset...
int Temp;
int PS2 = 2*(int)pixel_step;
int PS3 = 3*(int)pixel_step;
unsigned int src_increment = src_pixels_per_line - output_width;
__constant short *vp8_filter = sub_pel_filters[filter_offset];
if (i < (output_width * output_height)){
out_offset = i/output_width;
src_offset = out_offset;
src_offset = i + (src_offset * src_increment);
out_offset = i%output_width + (out_offset * output_pitch);
/* Apply filter */
Temp = ((int)src_ptr[src_offset - PS2] * vp8_filter[0]) +
((int)src_ptr[src_offset -(int)pixel_step] * vp8_filter[1]) +
((int)src_ptr[src_offset] * vp8_filter[2]) +
((int)src_ptr[src_offset + pixel_step] * vp8_filter[3]) +
((int)src_ptr[src_offset + PS2] * vp8_filter[4]) +
((int)src_ptr[src_offset + PS3] * vp8_filter[5]) +
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
/* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
if (Temp < 0)
Temp = 0;
else if (Temp > 255)
Temp = 255;
output_ptr[out_offset] = (unsigned char)Temp;
}
}
kernel void vp8_filter_block2d_bil_first_pass_kernel(
__global unsigned char *src_base,
int src_offset,
__global int *output_ptr,
unsigned int src_pixels_per_line,
unsigned int output_height,
unsigned int output_width,
int filter_offset
)
{
uint tid = get_global_id(0);
if (tid < output_width * output_height){
global unsigned char *src_ptr = &src_base[src_offset];
unsigned int i, j;
__constant int *vp8_filter = bilinear_filters[filter_offset];
unsigned int out_row,out_offset;
int src_increment = src_pixels_per_line - output_width;
i = tid / output_width;
j = tid % output_width;
src_offset = i*(output_width+src_increment) + j;
out_row = output_width * i;
out_offset = out_row + j;
/* Apply bilinear filter */
output_ptr[out_offset] = (((int)src_ptr[src_offset] * vp8_filter[0]) +
((int)src_ptr[src_offset+1] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
}
}
kernel void vp8_filter_block2d_bil_second_pass_kernel
(
__global int *src_ptr,
__global unsigned char *output_base,
int output_offset,
int output_pitch,
unsigned int output_height,
unsigned int output_width,
int filter_offset
)
{
uint tid = get_global_id(0);
if (tid < output_width * output_height){
global unsigned char *output_ptr = &output_base[output_offset];
unsigned int i, j;
int Temp;
__constant int *vp8_filter = bilinear_filters[filter_offset];
int out_offset;
int src_offset;
i = tid / output_width;
j = tid % output_width;
src_offset = i*(output_width) + j;
out_offset = i*output_pitch + j;
/* Apply filter */
Temp = ((int)src_ptr[src_offset] * vp8_filter[0]) +
((int)src_ptr[src_offset+output_width] * vp8_filter[1]) +
(VP8_FILTER_WEIGHT / 2);
output_ptr[out_offset++] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
}
}
//Called from reconinter_cl.c
kernel void vp8_memcpy_kernel(
global unsigned char *src_base,
int src_offset,
int src_stride,
global unsigned char *dst_base,
int dst_offset,
int dst_stride,
int num_bytes,
int num_iter
){
int i,r;
global unsigned char *src = &src_base[src_offset];
global unsigned char *dst = &dst_base[dst_offset];
src_offset = dst_offset = 0;
r = get_global_id(1);
if (r < get_global_size(1)){
i = get_global_id(0);
if (i < get_global_size(0)){
src_offset = r*src_stride + i;
dst_offset = r*dst_stride + i;
dst[dst_offset] = src[src_offset];
}
}
}
//Not used currently.
void vp8_memset_short(
global short *mem,
int offset,
short newval,
unsigned int size
)
{
int tid = get_global_id(0);
if (tid < (size/2)){
mem[offset+tid/2] = newval;
}
}
__kernel void vp8_bilinear_predict4x4_kernel
(
__global unsigned char *src_base,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
__global unsigned char *dst_base,
int dst_offset,
int dst_pitch,
__global int *int_mem
)
{
int Height = 4, Width = 4;
/* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
/* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
}
__kernel void vp8_bilinear_predict8x8_kernel
(
__global unsigned char *src_base,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
__global unsigned char *dst_base,
int dst_offset,
int dst_pitch,
__global int *int_mem
)
{
int Height = 8, Width = 8;
/* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
/* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
}
__kernel void vp8_bilinear_predict8x4_kernel
(
__global unsigned char *src_base,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
__global unsigned char *dst_base,
int dst_offset,
int dst_pitch,
__global int *int_mem
)
{
int Height = 4, Width = 8;
/* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
/* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
}
__kernel void vp8_bilinear_predict16x16_kernel
(
__global unsigned char *src_base,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
__global unsigned char *dst_base,
int dst_offset,
int dst_pitch,
__global int *int_mem
)
{
int Height = 16, Width = 16;
/* First filter 1-D horizontally... */
vp8_filter_block2d_bil_first_pass_kernel(src_base, src_offset, int_mem, src_pixels_per_line, Height + 1, Width, xoffset);
/* then 1-D vertically... */
vp8_filter_block2d_bil_second_pass_kernel(int_mem, dst_base, dst_offset, dst_pitch, Height, Width, yoffset);
}
void vp8_filter_block2d_first_pass(
global unsigned char *src_base,
int src_offset,
local int *output_ptr,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
int filter_offset
){
uint tid = get_global_id(0);
uint i = tid;
int nthreads = get_global_size(0);
int ngroups = nthreads / get_local_size(0);
global unsigned char *src_ptr = &src_base[src_offset];
//Note that src_offset will be reset later, which is why we capture it now
int Temp;
__constant short *vp8_filter = sub_pel_filters[filter_offset];
if (tid < (output_width*output_height)){
short filter0 = vp8_filter[0];
short filter1 = vp8_filter[1];
short filter2 = vp8_filter[2];
short filter3 = vp8_filter[3];
short filter4 = vp8_filter[4];
short filter5 = vp8_filter[5];
if (ngroups > 1){
//This is generally only true on Apple CPU-CL, which gives a group
//size of 1, regardless of the CPU core count.
for (i=0; i < output_width*output_height; i++){
src_offset = i + (i/output_width * (src_pixels_per_line - output_width));
Temp = (int)(src_ptr[src_offset - 2] * filter0) +
(int)(src_ptr[src_offset - 1] * filter1) +
(int)(src_ptr[src_offset] * filter2) +
(int)(src_ptr[src_offset + 1] * filter3) +
(int)(src_ptr[src_offset + 2] * filter4) +
(int)(src_ptr[src_offset + 3] * filter5) +
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
/* Normalize back to 0-255 */
Temp >>= VP8_FILTER_SHIFT;
if (Temp < 0)
Temp = 0;
else if ( Temp > 255 )
Temp = 255;
output_ptr[i] = Temp;
}
} else {
src_offset = i + (i/output_width * (src_pixels_per_line - output_width));
Temp = (int)(src_ptr[src_offset - 2] * filter0) +
(int)(src_ptr[src_offset - 1] * filter1) +
(int)(src_ptr[src_offset] * filter2) +
(int)(src_ptr[src_offset + 1] * filter3) +
(int)(src_ptr[src_offset + 2] * filter4) +
(int)(src_ptr[src_offset + 3] * filter5) +
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
/* Normalize back to 0-255 */
Temp >>= VP8_FILTER_SHIFT;
if (Temp < 0)
Temp = 0;
else if ( Temp > 255 )
Temp = 255;
output_ptr[i] = Temp;
}
}
//Add a fence so that no 2nd pass stuff starts before 1st pass writes are done.
barrier(CLK_LOCAL_MEM_FENCE);
}
void vp8_filter_block2d_second_pass
(
local int *src_ptr,
global unsigned char *output_base,
int output_offset,
int output_pitch,
unsigned int src_pixels_per_line,
unsigned int pixel_step,
unsigned int output_height,
unsigned int output_width,
int filter_offset
) {
global unsigned char *output_ptr = &output_base[output_offset];
int out_offset; //Not same as output_offset...
int src_offset;
int Temp;
int PS2 = 2*(int)pixel_step;
int PS3 = 3*(int)pixel_step;
unsigned int src_increment = src_pixels_per_line - output_width;
uint i = get_global_id(0);
__constant short *vp8_filter = sub_pel_filters[filter_offset];
if (i < (output_width * output_height)){
out_offset = i/output_width;
src_offset = out_offset;
src_offset = i + (src_offset * src_increment);
out_offset = i%output_width + (out_offset * output_pitch);
/* Apply filter */
Temp = ((int)src_ptr[src_offset - PS2] * vp8_filter[0]) +
((int)src_ptr[src_offset -(int)pixel_step] * vp8_filter[1]) +
((int)src_ptr[src_offset] * vp8_filter[2]) +
((int)src_ptr[src_offset + pixel_step] * vp8_filter[3]) +
((int)src_ptr[src_offset + PS2] * vp8_filter[4]) +
((int)src_ptr[src_offset + PS3] * vp8_filter[5]) +
(VP8_FILTER_WEIGHT >> 1); /* Rounding */
/* Normalize back to 0-255 */
Temp = Temp >> VP8_FILTER_SHIFT;
if (Temp < 0)
Temp = 0;
else if (Temp > 255)
Temp = 255;
output_ptr[out_offset] = (unsigned char)Temp;
}
}
__kernel void vp8_sixtap_predict_kernel
(
__global unsigned char *src_ptr,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
__global unsigned char *dst_ptr,
int dst_offset,
int dst_pitch
)
{
local int FData[9*4];
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 9, 4, xoffset);
/* then filter vertically... */
vp8_filter_block2d_second_pass(&FData[8], dst_ptr, dst_offset, dst_pitch, 4, 4, 4, 4, yoffset);
}
__kernel void vp8_sixtap_predict8x8_kernel
(
__global unsigned char *src_ptr,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
__global unsigned char *dst_ptr,
int dst_offset,
int dst_pitch
)
{
local int FData[13*16]; /* Temp data bufffer used in filtering */
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 13, 8, xoffset);
/* then filter vertically... */
vp8_filter_block2d_second_pass(&FData[16], dst_ptr, dst_offset, dst_pitch, 8, 8, 8, 8, yoffset);
}
__kernel void vp8_sixtap_predict8x4_kernel
(
__global unsigned char *src_ptr,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
__global unsigned char *dst_ptr,
int dst_offset,
int dst_pitch
)
{
local int FData[13*16]; /* Temp data buffer used in filtering */
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 9, 8, xoffset);
/* then filter verticaly... */
vp8_filter_block2d_second_pass(&FData[16], dst_ptr, dst_offset, dst_pitch, 8, 8, 4, 8, yoffset);
}
__kernel void vp8_sixtap_predict16x16_kernel
(
__global unsigned char *src_ptr,
int src_offset,
int src_pixels_per_line,
int xoffset,
int yoffset,
__global unsigned char *dst_ptr,
int dst_offset,
int dst_pitch
)
{
local int FData[21*24]; /* Temp data buffer used in filtering */
/* First filter 1-D horizontally... */
vp8_filter_block2d_first_pass(src_ptr, src_offset, FData, src_pixels_per_line, 1, 21, 16, xoffset);
/* then filter verticaly... */
vp8_filter_block2d_second_pass(&FData[32], dst_ptr, dst_offset, dst_pitch, 16, 16, 16, 16, yoffset);
return;
}

View File

@@ -0,0 +1,74 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef FILTER_CL_H_
#define FILTER_CL_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "vp8_opencl.h"
#define VP8_FILTER_WEIGHT 128
#define VP8_FILTER_SHIFT 7
#define REGISTER_FILTER 1
#define CLAMP(x,min,max) if (x < min) x = min; else if ( x > max ) x = max;
#define PRE_CALC_PIXEL_STEPS 1
#define PRE_CALC_SRC_INCREMENT 1
#if PRE_CALC_PIXEL_STEPS
#define PS2 two_pixel_steps
#define PS3 three_pixel_steps
#else
#define PS2 2*(int)pixel_step
#define PS3 3*(int)pixel_step
#endif
#if REGISTER_FILTER
#define FILTER0 filter0
#define FILTER1 filter1
#define FILTER2 filter2
#define FILTER3 filter3
#define FILTER4 filter4
#define FILTER5 filter5
#else
#define FILTER0 vp8_filter[0]
#define FILTER1 vp8_filter[1]
#define FILTER2 vp8_filter[2]
#define FILTER3 vp8_filter[3]
#define FILTER4 vp8_filter[4]
#define FILTER5 vp8_filter[5]
#endif
#if PRE_CALC_SRC_INCREMENT
#define SRC_INCREMENT src_increment
#else
#define SRC_INCREMENT (src_pixels_per_line - output_width)
#endif
#define FILTER_OFFSET //Filter data stored as CL constant memory
#define FILTER_REF sub_pel_filters[filter_offset]
extern const char *filterCompileOptions;
extern const char *filter_cl_file_name;
//Copy the -2*pixel_step (and ps*3) bytes because the filter algorithm
//accesses negative indexes
#define SIXTAP_SRC_LEN(out_width,out_height,src_px) ((out_width)*(out_height) + (((out_width)*(out_height)-1)/(out_width))*(src_px - out_width) + 5)
#define BIL_SRC_LEN(out_width,out_height,src_px) ((out_height) * src_px + out_width)
#define DST_LEN(dst_pitch,dst_height,dst_width) (dst_pitch * (dst_height) + (dst_width))
#ifdef __cplusplus
}
#endif
#endif /* FILTER_CL_H_ */

View File

@@ -0,0 +1,45 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef IDCT_OPENCL_H
#define IDCT_OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#include "vp8_opencl.h"
#include "vp8/common/blockd.h"
#define prototype_second_order_cl(sym) \
void sym(BLOCKD *b)
#define prototype_idct_cl(sym) \
void sym(BLOCKD *b, int pitch)
#define prototype_idct_scalar_add_cl(sym) \
void sym(BLOCKD *b, cl_int use_diff, int diff_offset, int qcoeff_offset, \
int pred_offset, unsigned char *output, cl_mem out_mem, int out_offset, size_t out_size, \
int pitch, int stride)\
extern prototype_idct_cl(vp8_short_idct4x4llm_1_cl);
extern prototype_idct_cl(vp8_short_idct4x4llm_cl);
extern prototype_idct_scalar_add_cl(vp8_dc_only_idct_add_cl);
extern prototype_second_order_cl(vp8_short_inv_walsh4x4_1_cl);
extern prototype_second_order_cl(vp8_short_inv_walsh4x4_cl);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,325 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
//ACW: Remove me after debugging.
#include <stdio.h>
#include <string.h>
#include "idct_cl.h"
#include "idctllm_cl.h"
#include "blockd_cl.h"
void cl_destroy_idct(){
if (cl_data.idct_program)
clReleaseProgram(cl_data.idct_program);
cl_data.idct_program = NULL;
VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_1_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_dc_only_idct_add_kernel);
//VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_idct4x4llm_1_kernel);
//VP8_CL_RELEASE_KERNEL(cl_data.vp8_short_idct4x4llm_kernel);
}
int cl_init_idct() {
int err;
// Create the filter compute program from the file-defined source code
if (cl_load_program(&cl_data.idct_program, idctllm_cl_file_name,
idctCompileOptions) != CL_SUCCESS)
return VP8_CL_TRIED_BUT_FAILED;
// Create the compute kernel in the program we wish to run
VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_1_kernel,"vp8_short_inv_walsh4x4_1_kernel");
VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_1st_pass_kernel,"vp8_short_inv_walsh4x4_1st_pass_kernel");
VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_inv_walsh4x4_2nd_pass_kernel,"vp8_short_inv_walsh4x4_2nd_pass_kernel");
VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_dc_only_idct_add_kernel,"vp8_dc_only_idct_add_kernel");
////idct4x4llm kernels are only useful for the encoder
//VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_idct4x4llm_1_kernel,"vp8_short_idct4x4llm_1_kernel");
//VP8_CL_CREATE_KERNEL(cl_data,idct_program,vp8_short_idct4x4llm_kernel,"vp8_short_idct4x4llm_kernel");
return CL_SUCCESS;
}
#define max(x,y) (x > y ? x: y)
//#define NO_CL
/* Only useful for encoder... Untested... */
void vp8_short_idct4x4llm_cl(BLOCKD *b, int pitch)
{
int err;
short *input = b->dqcoeff_base + b->dqcoeff_offset;
short *output = &b->diff_base[b->diff_offset];
cl_mem src_mem, dst_mem;
//1 instance for now. This should be split into 2-pass * 4 thread.
size_t global = 1;
if (cl_initialized != CL_SUCCESS){
vp8_short_idct4x4llm_c(input,output,pitch);
return;
}
VP8_CL_CREATE_BUF(b->cl_commands, src_mem,,
sizeof(short)*16, input,
vp8_short_idct4x4llm_c(input,output,pitch),
);
VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
sizeof(short)*(4+(pitch/2)*3), output,
vp8_short_idct4x4llm_c(input,output,pitch),
);
//Set arguments and run kernel
err = 0;
err = clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 0, sizeof (cl_mem), &src_mem);
err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 1, sizeof (cl_mem), &dst_mem);
err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_kernel, 2, sizeof (int), &pitch);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
vp8_short_idct4x4llm_c(input,output,pitch),
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_idct4x4llm_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);
vp8_short_idct4x4llm_c(input,output,pitch),
);
/* Read back the result data from the device */
err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_FALSE, 0, sizeof(short)*(4+pitch/2*3), output, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
"Error: Failed to read output array!\n",
vp8_short_idct4x4llm_c(input,output,pitch),
);
clReleaseMemObject(src_mem);
clReleaseMemObject(dst_mem);
return;
}
/* Only useful for encoder... Untested... */
void vp8_short_idct4x4llm_1_cl(BLOCKD *b, int pitch)
{
int err;
size_t global = 4;
short *input = b->dqcoeff_base + b->dqcoeff_offset;
short *output = &b->diff_base[b->diff_offset];
cl_mem src_mem, dst_mem;
if (cl_initialized != CL_SUCCESS){
vp8_short_idct4x4llm_1_c(input,output,pitch);
return;
}
printf("vp8_short_idct4x4llm_1_cl\n");
VP8_CL_CREATE_BUF(b->cl_commands, src_mem,,
sizeof(short), input,
vp8_short_idct4x4llm_1_c(input,output,pitch),
);
VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
sizeof(short)*(4+(pitch/2)*3), output,
vp8_short_idct4x4llm_1_c(input,output,pitch),
);
//Set arguments and run kernel
err = 0;
err = clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 0, sizeof (cl_mem), &src_mem);
err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 1, sizeof (cl_mem), &dst_mem);
err |= clSetKernelArg(cl_data.vp8_short_idct4x4llm_1_kernel, 2, sizeof (int), &pitch);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
vp8_short_idct4x4llm_1_c(input,output,pitch),
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_idct4x4llm_1_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);
vp8_short_idct4x4llm_1_c(input,output,pitch),
);
/* Read back the result data from the device */
err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_FALSE, 0, sizeof(short)*(4+pitch/2*3), output, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
"Error: Failed to read output array!\n",
vp8_short_idct4x4llm_1_c(input,output,pitch),
);
clReleaseMemObject(src_mem);
clReleaseMemObject(dst_mem);
return;
}
void vp8_dc_only_idct_add_cl(BLOCKD *b, cl_int use_diff, int diff_offset,
int qcoeff_offset, int pred_offset,
unsigned char *dst_base, cl_mem dst_mem, int dst_offset, size_t dest_size,
int pitch, int stride
)
{
int err;
size_t global = 16;
int free_mem = 0;
//cl_mem dest_mem = NULL;
if (dst_mem == NULL){
VP8_CL_CREATE_BUF(b->cl_commands, dst_mem,,
dest_size, dst_base,,
);
free_mem = 1;
}
//Set arguments and run kernel
err = clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 0, sizeof (cl_mem), &b->cl_predictor_mem);
err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 1, sizeof (int), &pred_offset);
err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 2, sizeof (cl_mem), &dst_mem);
err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 3, sizeof (int), &dst_offset);
err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 4, sizeof (int), &pitch);
err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 5, sizeof (int), &stride);
err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 6, sizeof (cl_int), &use_diff);
err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 7, sizeof (cl_mem), &b->cl_diff_mem);
err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 8, sizeof (int), &diff_offset);
err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 9, sizeof (cl_mem), &b->cl_qcoeff_mem);
err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 10, sizeof (int), &qcoeff_offset);
err |= clSetKernelArg(cl_data.vp8_dc_only_idct_add_kernel, 11, sizeof (cl_mem), &b->cl_dequant_mem);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",,
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_dc_only_idct_add_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);,
);
if (free_mem == 1){
/* Read back the result data from the device */
err = clEnqueueReadBuffer(b->cl_commands, dst_mem, CL_FALSE, 0,
dest_size, dst_base, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS(b->cl_commands, err != CL_SUCCESS,
"Error: Failed to read output array!\n",,
);
clReleaseMemObject(dst_mem);
}
return;
}
void vp8_short_inv_walsh4x4_cl(BLOCKD *b)
{
int err;
size_t global = 4;
if (cl_initialized != CL_SUCCESS){
vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset,&b->diff_base[b->diff_offset]);
return;
}
//Set arguments and run kernel
err = 0;
err = clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 0, sizeof (cl_mem), &b->cl_dqcoeff_mem);
err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 1, sizeof(int), &b->dqcoeff_offset);
err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 2, sizeof (cl_mem), &b->cl_diff_mem);
err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 3, sizeof(int), &b->diff_offset);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_inv_walsh4x4_1st_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);
vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
);
//Second pass
//Set arguments and run kernel
err = 0;
err = clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel, 0, sizeof (cl_mem), &b->cl_diff_mem);
err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel, 1, sizeof(int), &b->diff_offset);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_inv_walsh4x4_2nd_pass_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);
vp8_short_inv_walsh4x4_c(b->dqcoeff_base+b->dqcoeff_offset, &b->diff_base[b->diff_offset]),
);
return;
}
void vp8_short_inv_walsh4x4_1_cl(BLOCKD *b)
{
int err;
size_t global = 4;
if (cl_initialized != CL_SUCCESS){
vp8_short_inv_walsh4x4_1_c(b->dqcoeff_base + b->dqcoeff_offset,
&b->diff_base[b->diff_offset]);
return;
}
//Set arguments and run kernel
err = 0;
err = clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 0, sizeof (cl_mem), &b->cl_dqcoeff_mem);
err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 1, sizeof (int), &b->dqcoeff_offset);
err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 2, sizeof (cl_mem), &b->cl_diff_mem);
err |= clSetKernelArg(cl_data.vp8_short_inv_walsh4x4_1_kernel, 3, sizeof (int), &b->diff_offset);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
vp8_short_inv_walsh4x4_1_c(b->dqcoeff_base + b->dqcoeff_offset,
&b->diff_base[b->diff_offset]),
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel(b->cl_commands, cl_data.vp8_short_inv_walsh4x4_1_kernel, 1, NULL, &global, NULL , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( b->cl_commands, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);
vp8_short_inv_walsh4x4_1_c(b->dqcoeff_base + b->dqcoeff_offset,
&b->diff_base[b->diff_offset]),
);
return;
}

View File

@@ -0,0 +1,309 @@
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#pragma OPENCL EXTENSION cl_amd_printf : enable
__constant int cospi8sqrt2minus1 = 20091;
__constant int sinpi8sqrt2 = 35468;
__constant int rounding = 0;
kernel void vp8_short_idct4x4llm_1st_pass_kernel(global short*,global short *,int);
kernel void vp8_short_idct4x4llm_2nd_pass_kernel(global short*,int);
__kernel void vp8_short_idct4x4llm_kernel(
__global short *input,
__global short *output,
int pitch
){
vp8_short_idct4x4llm_1st_pass_kernel(input,output,pitch);
vp8_short_idct4x4llm_2nd_pass_kernel(output,pitch);
}
__kernel void vp8_short_idct4x4llm_1st_pass_kernel(
__global short *ip,
__global short *op,
int pitch
)
{
int i;
int a1, b1, c1, d1;
int temp1, temp2;
int shortpitch = pitch >> 1;
for (i = 0; i < 4; i++)
{
a1 = ip[0] + ip[8];
b1 = ip[0] - ip[8];
temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
c1 = temp1 - temp2;
temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
d1 = temp1 + temp2;
op[shortpitch*0] = a1 + d1;
op[shortpitch*3] = a1 - d1;
op[shortpitch*1] = b1 + c1;
op[shortpitch*2] = b1 - c1;
ip++;
op++;
}
return;
}
__kernel void vp8_short_idct4x4llm_2nd_pass_kernel(
__global short *output,
int pitch
)
{
int i;
int a1, b1, c1, d1;
int temp1, temp2;
int shortpitch = pitch >> 1;
__global short *ip = output;
__global short *op = output;
for (i = 0; i < 4; i++)
{
a1 = ip[0] + ip[2];
b1 = ip[0] - ip[2];
temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
c1 = temp1 - temp2;
temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
d1 = temp1 + temp2;
op[0] = (a1 + d1 + 4) >> 3;
op[3] = (a1 - d1 + 4) >> 3;
op[1] = (b1 + c1 + 4) >> 3;
op[2] = (b1 - c1 + 4) >> 3;
ip += shortpitch;
op += shortpitch;
}
return;
}
__kernel void vp8_short_idct4x4llm_1_kernel(
__global short *input,
__global short *output,
int pitch
)
{
int a1;
int out_offset;
int shortpitch = pitch >> 1;
//short4 a;
a1 = ((input[0] + 4) >> 3);
//a = a1;
int tid = get_global_id(0);
if (tid < 4){
out_offset = shortpitch * tid;
//vstore4(a,0,&output[out_offset];
output[out_offset] = a1;
output[out_offset+1] = a1;
output[out_offset+2] = a1;
output[out_offset+3] = a1;
}
}
__kernel void vp8_dc_only_idct_add_kernel(
__global unsigned char *pred_base,
int pred_offset,
__global unsigned char *dst_base,
int dst_offset,
int pitch,
int stride,
int use_diff,
global short *diff_base,
int diff_offset,
global short *qcoeff_base,
int qcoeff_offset,
global short *dequant
)
{
int r, c;
//int pred_offset;
global unsigned char *pred_ptr = &pred_base[pred_offset];
global unsigned char *dst_ptr = &dst_base[dst_offset];
int tid = get_global_id(0);
int a1;
if (tid < 16){
if (use_diff == 1){
a1 = diff_base[diff_offset];
} else {
a1 = qcoeff_base[qcoeff_offset] * dequant[0];
}
a1 = (a1 + 4)>>3;
r = tid / 4;
c = tid % 4;
pred_offset = r * pitch;
dst_offset += r * stride;
int a = a1 + pred_ptr[pred_offset + c] ;
if (a < 0)
a = 0;
else if (a > 255)
a = 255;
dst_base[dst_offset + c] = (unsigned char) a ;
}
}
__kernel void vp8_short_inv_walsh4x4_1st_pass_kernel(
__global short *src_base,
int src_offset,
__global short *output_base,
int out_offset
)
{
__global short *input = src_base + src_offset;
__global short *output = output_base + src_offset;
int tid = get_global_id(0);
#define VEC_WALSH 0
#if VEC_WALSH
//4-short vectors to calculate things in
short4 a,b,c,d, a2v, b2v, c2v, d2v, a1t, b1t, c1t, d1t;
short16 out;
if (tid == 0){
//first pass loop in vector form
a = vload4(0,input) + vload4(3,input);
b = vload4(1,input) + vload4(2,input);
c = vload4(1,input) - vload4(2,input);
d = vload4(0,input) - vload4(3,input);
vstore4(a + b, 0, output);
vstore4(c + d, 1, output);
vstore4(a - b, 2, output);
vstore4(d - c, 3, output);
return;
//2nd pass
a = (short4)(output[0], output[4], output[8], output[12]);
b = (short4)(output[1], output[5], output[9], output[13]);
c = (short4)(output[1], output[5], output[9], output[13]);
d = (short4)(output[0], output[4], output[8], output[12]);
a1t = (short4)(output[3], output[7], output[11], output[15]);
b1t = (short4)(output[2], output[6], output[10], output[14]);
c1t = (short4)(output[2], output[6], output[10], output[14]);
d1t = (short4)(output[3], output[7], output[11], output[15]);
a = a + a1t + (short)3;
b = b + b1t;
c = c - c1t;
d = d - d1t + (short)3;
a2v = (a + b) >> (short)3;
b2v = (c + d) >> (short)3;
c2v = (a - b) >> (short)3;
d2v = (d - c) >> (short)3;
out.s048c = a2v;
out.s159d = b2v;
out.s26ae = c2v;
out.s37bf = d2v;
vstore16(out,0,output);
}
#else
int i;
int a1, b1, c1, d1;
int a2, b2, c2, d2;
global short *ip = input;
global short *op = output;
int offset;
if (tid < 4){
offset = tid;
a1 = ip[offset] + ip[offset + 12];
b1 = ip[offset + 4] + ip[offset + 8];
c1 = ip[offset + 4] - ip[offset + 8];
d1 = ip[offset] - ip[offset + 12];
op[offset] = a1 + b1;
op[offset + 4] = c1 + d1;
op[offset + 8] = a1 - b1;
op[offset + 12] = d1 - c1;
}
#endif
}
__kernel void vp8_short_inv_walsh4x4_2nd_pass_kernel(
__global short *output_base,
int out_offset
)
{
int i;
int a1, b1, c1, d1;
int a2, b2, c2, d2;
__global short *output = output_base + out_offset;
int tid = get_global_id(0);
int offset = 0;
if (tid < 4){
offset = 4*tid;
a1 = output[offset] + output[offset + 3];
b1 = output[offset + 1] + output[offset + 2];
c1 = output[offset + 1] - output[offset + 2];
d1 = output[offset + 0] - output[offset + 3];
a2 = a1 + b1;
b2 = c1 + d1;
c2 = a1 - b1;
d2 = d1 - c1;
output[offset + 0] = (a2 + 3) >> 3;
output[offset + 1] = (b2 + 3) >> 3;
output[offset + 2] = (c2 + 3) >> 3;
output[offset + 3] = (d2 + 3) >> 3;
}
}
__kernel void vp8_short_inv_walsh4x4_1_kernel(
__global short *src_data,
int src_offset,
__global short *dst_data,
int dst_offset
){
int a1;
int tid = get_global_id(0);
//short16 a;
int i;
short4 a;
__global short *input = src_data + src_offset;
__global short *output = dst_data + dst_offset;
if (tid < 4)
{
a1 = ((input[0] + 3) >> 3);
a = (short)a1; //Set all elements of vector to a1
vstore4(a, tid, output);
}
}

View File

@@ -0,0 +1,26 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_config.h"
#include "vp8_opencl.h"
#include "vp8/common/blockd.h"
#define CLAMP(x,min,max) if (x < min) x = min; else if ( x > max ) x = max;
//External functions that are fallbacks if CL is unavailable
extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
extern void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride);
extern void vp8_short_inv_walsh4x4_c(short *input, short *output);
extern void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
const char *idctCompileOptions = "-Ivp8/common/opencl";
const char *idctllm_cl_file_name = "vp8/common/opencl/idctllm_cl.cl";

View File

@@ -0,0 +1,427 @@
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#pragma OPENCL EXTENSION cl_amd_printf : enable
typedef unsigned char uc;
typedef signed char sc;
__inline signed char vp8_filter_mask(sc, sc, uc, uc, uc, uc, uc, uc, uc, uc);
__inline signed char vp8_simple_filter_mask(signed char, signed char, uc, uc, uc, uc);
__inline signed char vp8_hevmask(signed char, uc, uc, uc, uc);
__inline signed char vp8_signed_char_clamp(int);
__inline void vp8_mbfilter(signed char mask,signed char hev,global uc *op2,
global uc *op1,global uc *op0,global uc *oq0,global uc *oq1,global uc *oq2);
void vp8_simple_filter(signed char mask,global uc *base, int op1_off,int op0_off,int oq0_off,int oq1_off);
typedef struct
{
signed char lim[16];
signed char flim[16];
signed char thr[16];
signed char mbflim[16];
signed char mbthr[16];
signed char uvlim[16];
signed char uvflim[16];
signed char uvthr[16];
signed char uvmbflim[16];
signed char uvmbthr[16];
} loop_filter_info;
void vp8_filter(
signed char mask,
signed char hev,
global uc *base,
int op1_off,
int op0_off,
int oq0_off,
int oq1_off
)
{
global uc *op1 = &base[op1_off];
global uc *op0 = &base[op0_off];
global uc *oq0 = &base[oq0_off];
global uc *oq1 = &base[oq1_off];
signed char ps0, qs0;
signed char ps1, qs1;
signed char vp8_filter, Filter1, Filter2;
signed char u;
ps1 = (signed char) * op1 ^ 0x80;
ps0 = (signed char) * op0 ^ 0x80;
qs0 = (signed char) * oq0 ^ 0x80;
qs1 = (signed char) * oq1 ^ 0x80;
/* add outer taps if we have high edge variance */
vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
vp8_filter &= hev;
/* inner taps */
vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
vp8_filter &= mask;
/* save bottom 3 bits so that we round one side +4 and the other +3
* if it equals 4 we'll set to adjust by -1 to account for the fact
* we'd round 3 the other way
*/
Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
Filter1 >>= 3;
Filter2 >>= 3;
u = vp8_signed_char_clamp(qs0 - Filter1);
*oq0 = u ^ 0x80;
u = vp8_signed_char_clamp(ps0 + Filter2);
*op0 = u ^ 0x80;
vp8_filter = Filter1;
/* outer tap adjustments */
vp8_filter += 1;
vp8_filter >>= 1;
vp8_filter &= ~hev;
u = vp8_signed_char_clamp(qs1 - vp8_filter);
*oq1 = u ^ 0x80;
u = vp8_signed_char_clamp(ps1 + vp8_filter);
*op1 = u ^ 0x80;
}
kernel void vp8_loop_filter_horizontal_edge_kernel
(
global unsigned char *s_base,
int s_off,
int p, /* pitch */
global signed char *flimit,
global signed char *limit,
global signed char *thresh,
int off_stride
)
{
int hev = 0; /* high edge variance */
signed char mask = 0;
int i = get_global_id(0);
if (i < get_global_size(0)){
s_off += i;
mask = vp8_filter_mask(limit[i], flimit[i], s_base[s_off - 4*p],
s_base[s_off - 3*p], s_base[s_off - 2*p], s_base[s_off - p],
s_base[s_off], s_base[s_off + p], s_base[s_off + 2*p],
s_base[s_off + 3*p]);
hev = vp8_hevmask(thresh[i], s_base[s_off - 2*p], s_base[s_off - p],
s_base[s_off], s_base[s_off+p]);
vp8_filter(mask, hev, s_base, s_off - 2 * p, s_off - p, s_off,
s_off + p);
}
}
kernel void vp8_loop_filter_vertical_edge_kernel
(
global unsigned char *s_base,
int s_off,
int p,
global signed char *flimit,
global signed char *limit,
global signed char *thresh,
int off_stride
)
{
int hev = 0; /* high edge variance */
signed char mask = 0;
int i = get_global_id(0);
if ( i < get_global_size(0) ){
s_off += p * i;
mask = vp8_filter_mask(limit[i], flimit[i],
s_base[s_off-4], s_base[s_off-3], s_base[s_off-2],
s_base[s_off-1], s_base[s_off], s_base[s_off+1],
s_base[s_off+2], s_base[s_off+3]);
hev = vp8_hevmask(thresh[i], s_base[s_off-2], s_base[s_off-1],
s_base[s_off], s_base[s_off+1]);
vp8_filter(mask, hev, s_base, s_off - 2, s_off - 1, s_off, s_off + 1);
}
}
kernel void vp8_mbloop_filter_horizontal_edge_kernel
(
global unsigned char *s_base,
int s_off,
int p,
global signed char *flimit,
global signed char *limit,
global signed char *thresh,
int off_stride
)
{
global uc *s = s_base+s_off;
signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = get_global_id(0);
if (i < get_global_size(0)){
s += i;
mask = vp8_filter_mask(limit[i], flimit[i],
s[-4*p], s[-3*p], s[-2*p], s[-1*p],
s[0*p], s[1*p], s[2*p], s[3*p]);
hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
}
}
kernel void vp8_mbloop_filter_vertical_edge_kernel
(
global unsigned char *s_base,
int s_off,
int p,
global signed char *flimit,
global signed char *limit,
global signed char *thresh,
int off_stride
)
{
global uc *s = s_base + s_off;
signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = get_global_id(0);
if (i < get_global_size(0)){
s += p * i;
mask = vp8_filter_mask(limit[i], flimit[i],
s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
}
}
kernel void vp8_loop_filter_simple_horizontal_edge_kernel
(
global unsigned char *s_base,
int s_off,
int p,
global const signed char *flimit,
global const signed char *limit,
global const signed char *thresh,
int off_stride
)
{
signed char mask = 0;
int i = get_global_id(0);
(void) thresh;
if (i < get_global_size(0))
{
s_off += i;
mask = vp8_simple_filter_mask(limit[i], flimit[i], s_base[s_off-2*p], s_base[s_off-p], s_base[s_off], s_base[s_off+p]);
vp8_simple_filter(mask, s_base, s_off - 2 * p, s_off - 1 * p, s_off, s_off + 1 * p);
}
}
kernel void vp8_loop_filter_simple_vertical_edge_kernel
(
global unsigned char *s_base,
int s_off,
int p,
global signed char *flimit,
global signed char *limit,
global signed char *thresh,
int off_stride
)
{
signed char mask = 0;
int i = get_global_id(0);
(void) thresh;
if (i < get_global_size(0)){
s_off += p * i;
mask = vp8_simple_filter_mask(limit[i], flimit[i], s_base[s_off-2], s_base[s_off-1], s_base[s_off], s_base[s_off+1]);
vp8_simple_filter(mask, s_base, s_off - 2, s_off - 1, s_off, s_off + 1);
}
}
//Inline and non-kernel functions follow.
__inline void vp8_mbfilter(
signed char mask,
signed char hev,
global uc *op2,
global uc *op1,
global uc *op0,
global uc *oq0,
global uc *oq1,
global uc *oq2
)
{
signed char s, u;
signed char vp8_filter, Filter1, Filter2;
signed char ps2 = (signed char) * op2 ^ 0x80;
signed char ps1 = (signed char) * op1 ^ 0x80;
signed char ps0 = (signed char) * op0 ^ 0x80;
signed char qs0 = (signed char) * oq0 ^ 0x80;
signed char qs1 = (signed char) * oq1 ^ 0x80;
signed char qs2 = (signed char) * oq2 ^ 0x80;
/* add outer taps if we have high edge variance */
vp8_filter = vp8_signed_char_clamp(ps1 - qs1);
vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (qs0 - ps0));
vp8_filter &= mask;
Filter2 = vp8_filter;
Filter2 &= hev;
/* save bottom 3 bits so that we round one side +4 and the other +3 */
Filter1 = vp8_signed_char_clamp(Filter2 + 4);
Filter2 = vp8_signed_char_clamp(Filter2 + 3);
Filter1 >>= 3;
Filter2 >>= 3;
qs0 = vp8_signed_char_clamp(qs0 - Filter1);
ps0 = vp8_signed_char_clamp(ps0 + Filter2);
/* only apply wider filter if not high edge variance */
vp8_filter &= ~hev;
Filter2 = vp8_filter;
/* roughly 3/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
s = vp8_signed_char_clamp(qs0 - u);
*oq0 = s ^ 0x80;
s = vp8_signed_char_clamp(ps0 + u);
*op0 = s ^ 0x80;
/* roughly 2/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
s = vp8_signed_char_clamp(qs1 - u);
*oq1 = s ^ 0x80;
s = vp8_signed_char_clamp(ps1 + u);
*op1 = s ^ 0x80;
/* roughly 1/7th difference across boundary */
u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
s = vp8_signed_char_clamp(qs2 - u);
*oq2 = s ^ 0x80;
s = vp8_signed_char_clamp(ps2 + u);
*op2 = s ^ 0x80;
}
__inline signed char vp8_signed_char_clamp(int t)
{
t = (t < -128 ? -128 : t);
t = (t > 127 ? 127 : t);
return (signed char) t;
}
/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
__inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
{
signed char hev = 0;
hev |= (abs(p1 - p0) > thresh) * -1;
hev |= (abs(q1 - q0) > thresh) * -1;
return hev;
}
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
__inline signed char vp8_filter_mask(
signed char limit,
signed char flimit,
uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3)
{
signed char mask = 0;
mask |= (abs(p3 - p2) > limit) * -1;
mask |= (abs(p2 - p1) > limit) * -1;
mask |= (abs(p1 - p0) > limit) * -1;
mask |= (abs(q1 - q0) > limit) * -1;
mask |= (abs(q2 - q1) > limit) * -1;
mask |= (abs(q3 - q2) > limit) * -1;
mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1;
mask = ~mask;
return mask;
}
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
__inline signed char vp8_simple_filter_mask(
signed char limit,
signed char flimit,
uc p1,
uc p0,
uc q0,
uc q1
)
{
signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1;
return mask;
}
void vp8_simple_filter(
signed char mask,
global uc *base,
int op1_off,
int op0_off,
int oq0_off,
int oq1_off
)
{
global uc *op1 = base + op1_off;
global uc *op0 = base + op0_off;
global uc *oq0 = base + oq0_off;
global uc *oq1 = base + oq1_off;
signed char vp8_filter, Filter1, Filter2;
signed char p1 = (signed char) * op1 ^ 0x80;
signed char p0 = (signed char) * op0 ^ 0x80;
signed char q0 = (signed char) * oq0 ^ 0x80;
signed char q1 = (signed char) * oq1 ^ 0x80;
signed char u;
vp8_filter = vp8_signed_char_clamp(p1 - q1);
vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * (q0 - p0));
vp8_filter &= mask;
/* save bottom 3 bits so that we round one side +4 and the other +3 */
Filter1 = vp8_signed_char_clamp(vp8_filter + 4);
Filter1 >>= 3;
u = vp8_signed_char_clamp(q0 - Filter1);
*oq0 = u ^ 0x80;
Filter2 = vp8_signed_char_clamp(vp8_filter + 3);
Filter2 >>= 3;
u = vp8_signed_char_clamp(p0 + Filter2);
*op0 = u ^ 0x80;
}

View File

@@ -0,0 +1,457 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "../../../vpx_ports/config.h"
#include "loopfilter_cl.h"
#include "../onyxc_int.h"
#include "vpx_config.h"
#include "vp8_opencl.h"
#include "blockd_cl.h"
const char *loopFilterCompileOptions = "-Ivp8/common/opencl";
const char *loop_filter_cl_file_name = "vp8/common/opencl/loopfilter.cl";
typedef unsigned char uc;
extern void vp8_loop_filter_frame
(
VP8_COMMON *cm,
MACROBLOCKD *mbd,
int default_filt_lvl
);
prototype_loopfilter_cl(vp8_loop_filter_horizontal_edge_cl);
prototype_loopfilter_cl(vp8_loop_filter_vertical_edge_cl);
prototype_loopfilter_cl(vp8_mbloop_filter_horizontal_edge_cl);
prototype_loopfilter_cl(vp8_mbloop_filter_vertical_edge_cl);
prototype_loopfilter_cl(vp8_loop_filter_simple_horizontal_edge_cl);
prototype_loopfilter_cl(vp8_loop_filter_simple_vertical_edge_cl);
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_cl(
MACROBLOCKD *x,
cl_mem buf_base,
int y_off,
int u_off,
int v_off,
int y_stride,
int uv_stride,
loop_filter_info *lfi,
int simpler_lpf
)
{
(void) simpler_lpf;
vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);
vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, u_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);
vp8_mbloop_filter_horizontal_edge_cl(x, buf_base, v_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);
}
void vp8_loop_filter_mbhs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) simpler_lpf;
vp8_mbloop_filter_vertical_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);
vp8_mbloop_filter_vertical_edge_cl(x, buf_base, u_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);
vp8_mbloop_filter_vertical_edge_cl(x, buf_base, v_off, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1, 1);
}
void vp8_loop_filter_mbvs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2, 1);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) simpler_lpf;
vp8_loop_filter_horizontal_edge_cl(x, buf_base, y_off + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
vp8_loop_filter_horizontal_edge_cl(x, buf_base, y_off + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
vp8_loop_filter_horizontal_edge_cl(x, buf_base, y_off + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
vp8_loop_filter_horizontal_edge_cl(x, buf_base, u_off + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
vp8_loop_filter_horizontal_edge_cl(x, buf_base, v_off + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
}
void vp8_loop_filter_bhs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
vp8_loop_filter_simple_horizontal_edge_cl(x, buf_base, y_off + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) simpler_lpf;
vp8_loop_filter_vertical_edge_cl(x, buf_base, y_off + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
vp8_loop_filter_vertical_edge_cl(x, buf_base, y_off + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
vp8_loop_filter_vertical_edge_cl(x, buf_base, y_off + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
vp8_loop_filter_vertical_edge_cl(x, buf_base, u_off + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
vp8_loop_filter_vertical_edge_cl(x, buf_base, v_off + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1, 1);
}
void vp8_loop_filter_bvs_cl(MACROBLOCKD *x, cl_mem buf_base, int y_off, int u_off, int v_off,
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void) uv_stride;
(void) simpler_lpf;
vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
vp8_loop_filter_simple_vertical_edge_cl(x, buf_base, y_off + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2, 1);
}
void vp8_init_loop_filter_cl(VP8_COMMON *cm)
{
loop_filter_info *lfi = cm->lf_info;
int sharpness_lvl = cm->sharpness_level;
int frame_type = cm->frame_type;
int i, j;
int block_inside_limit = 0;
int HEVThresh;
const int yhedge_boost = 2;
/* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
{
int filt_lvl = i;
if (frame_type == KEY_FRAME)
{
if (filt_lvl >= 40)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
}
else
{
if (filt_lvl >= 40)
HEVThresh = 3;
else if (filt_lvl >= 20)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
}
/* Set loop filter paramaeters that control sharpness. */
block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
if (sharpness_lvl > 0)
{
if (block_inside_limit > (9 - sharpness_lvl))
block_inside_limit = (9 - sharpness_lvl);
}
if (block_inside_limit < 1)
block_inside_limit = 1;
for (j = 0; j < 16; j++)
{
lfi[i].lim[j] = block_inside_limit;
lfi[i].mbflim[j] = filt_lvl + yhedge_boost;
lfi[i].flim[j] = filt_lvl;
lfi[i].thr[j] = HEVThresh;
}
}
}
/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding
* each frame. Check last_frame_type to skip the function most of times.
*/
void vp8_frame_init_loop_filter_cl(loop_filter_info *lfi, int frame_type)
{
int HEVThresh;
int i, j;
/* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
{
int filt_lvl = i;
if (frame_type == KEY_FRAME)
{
if (filt_lvl >= 40)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
}
else
{
if (filt_lvl >= 40)
HEVThresh = 3;
else if (filt_lvl >= 20)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
}
for (j = 0; j < 16; j++)
{
lfi[i].thr[j] = HEVThresh;
}
}
}
//This might not need to be copied from loopfilter.c
void vp8_adjust_mb_lf_value_cl(MACROBLOCKD *mbd, int *filter_level)
{
MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi;
if (mbd->mode_ref_lf_delta_enabled)
{
/* Apply delta for reference frame */
*filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
/* Apply delta for mode */
if (mbmi->ref_frame == INTRA_FRAME)
{
/* Only the split mode BPRED has a further special case */
if (mbmi->mode == B_PRED)
*filter_level += mbd->mode_lf_deltas[0];
}
else
{
/* Zero motion mode */
if (mbmi->mode == ZEROMV)
*filter_level += mbd->mode_lf_deltas[1];
/* Split MB motion mode */
else if (mbmi->mode == SPLITMV)
*filter_level += mbd->mode_lf_deltas[3];
/* All other inter motion modes (Nearest, Near, New) */
else
*filter_level += mbd->mode_lf_deltas[2];
}
/* Range check */
if (*filter_level > MAX_LOOP_FILTER)
*filter_level = MAX_LOOP_FILTER;
else if (*filter_level < 0)
*filter_level = 0;
}
}
//Start of externally callable functions.
int cl_init_loop_filter() {
int err;
// Create the filter compute program from the file-defined source code
if ( cl_load_program(&cl_data.loop_filter_program, loop_filter_cl_file_name,
loopFilterCompileOptions) != CL_SUCCESS )
return VP8_CL_TRIED_BUT_FAILED;
// Create the compute kernels in the program we wish to run
VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_horizontal_edge_kernel,"vp8_loop_filter_horizontal_edge_kernel");
VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_vertical_edge_kernel,"vp8_loop_filter_vertical_edge_kernel");
VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_mbloop_filter_horizontal_edge_kernel,"vp8_mbloop_filter_horizontal_edge_kernel");
VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_mbloop_filter_vertical_edge_kernel,"vp8_mbloop_filter_vertical_edge_kernel");
VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_simple_horizontal_edge_kernel,"vp8_loop_filter_simple_horizontal_edge_kernel");
VP8_CL_CREATE_KERNEL(cl_data,loop_filter_program,vp8_loop_filter_simple_vertical_edge_kernel,"vp8_loop_filter_simple_vertical_edge_kernel");
return CL_SUCCESS;
}
void cl_destroy_loop_filter(){
if (cl_data.loop_filter_program)
clReleaseProgram(cl_data.loop_filter_program);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_horizontal_edge_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_vertical_edge_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_mbloop_filter_horizontal_edge_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_mbloop_filter_vertical_edge_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_simple_horizontal_edge_kernel);
VP8_CL_RELEASE_KERNEL(cl_data.vp8_loop_filter_simple_vertical_edge_kernel);
cl_data.loop_filter_program = NULL;
}
void vp8_loop_filter_set_baselines_cl(MACROBLOCKD *mbd, int default_filt_lvl, int *baseline_filter_level){
int alt_flt_enabled = mbd->segmentation_enabled;
int i;
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
/* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
else
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
baseline_filter_level[i] = default_filt_lvl;
}
}
void vp8_loop_filter_frame_cl
(
VP8_COMMON *cm,
MACROBLOCKD *mbd,
int default_filt_lvl
)
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
loop_filter_info *lfi = cm->lf_info;
FRAME_TYPE frame_type = cm->frame_type;
LOOPFILTERTYPE filter_type = cm->filter_type;
int mb_row;
int mb_col;
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
int err;
unsigned char *buf_base;
int y_off, u_off, v_off;
//unsigned char *y_ptr, *u_ptr, *v_ptr;
mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */
/* Note the baseline filter values for each segment */
vp8_loop_filter_set_baselines_cl(mbd, default_filt_lvl, baseline_filter_level);
/* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter_cl(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter_cl(lfi, frame_type);
/* Set up the buffer pointers */
buf_base = post->buffer_alloc;
y_off = post->y_buffer - buf_base;
u_off = post->u_buffer - buf_base;
v_off = post->v_buffer - buf_base;
VP8_CL_SET_BUF(mbd->cl_commands, post->buffer_mem, post->buffer_size, post->buffer_alloc,
vp8_loop_filter_frame(cm,mbd,default_filt_lvl),);
/* vp8_filter each macro block */
for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
filter_level = baseline_filter_level[Segment];
/* Distance of Mb to the various image edges.
* These specified to 8th pel as they are always compared to values
* that are in 1/8th pel units. Apply any context driven MB level
* adjustment
*/
filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);
if (filter_level)
{
if (mb_col > 0){
if (filter_type == NORMAL_LOOPFILTER)
vp8_loop_filter_mbv_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
else
vp8_loop_filter_mbvs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
}
if (mbd->mode_info_context->mbmi.dc_diff > 0){
if (filter_type == NORMAL_LOOPFILTER)
vp8_loop_filter_bv_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
else
vp8_loop_filter_bvs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
}
/* don't apply across umv border */
if (mb_row > 0){
if (filter_type == NORMAL_LOOPFILTER)
vp8_loop_filter_mbh_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
else
vp8_loop_filter_mbhs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
}
if (mbd->mode_info_context->mbmi.dc_diff > 0){
if (filter_type == NORMAL_LOOPFILTER)
vp8_loop_filter_bh_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
else
vp8_loop_filter_bhs_cl(mbd, post->buffer_mem, y_off, u_off, v_off, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
}
}
y_off += 16;
u_off += 8;
v_off += 8;
mbd->mode_info_context++; /* step to next MB */
}
y_off += post->y_stride * 16 - post->y_width;
u_off += post->uv_stride * 8 - post->uv_width;
v_off += post->uv_stride * 8 - post->uv_width;
mbd->mode_info_context++; /* Skip border mb */
}
//Retrieve buffer contents
err = clEnqueueReadBuffer(mbd->cl_commands, post->buffer_mem, CL_FALSE, 0, post->buffer_size, post->buffer_alloc, 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS(mbd->cl_commands, err != CL_SUCCESS,
"Error: Failed to read loop filter output!\n",
,
);
VP8_CL_FINISH(mbd->cl_commands);
}

View File

@@ -0,0 +1,48 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef loopfilter_cl_h
#define loopfilter_cl_h
#include "../../../vpx_ports/mem.h"
#include "../onyxc_int.h"
#include "blockd_cl.h"
#include "../loopfilter.h"
#define prototype_loopfilter_cl(sym) \
void sym(MACROBLOCKD*, cl_mem src_base, int src_offset, \
int pitch, const signed char *flimit, \
const signed char *limit, const signed char *thresh, int count, int block_cnt)
#define prototype_loopfilter_block_cl(sym) \
void sym(MACROBLOCKD*, unsigned char *y, unsigned char *u, unsigned char *v,\
int ystride, int uv_stride, loop_filter_info *lfi, int simpler)
extern void vp8_loop_filter_frame_cl
(
VP8_COMMON *cm,
MACROBLOCKD *mbd,
int default_filt_lvl
);
extern prototype_loopfilter_block_cl(vp8_lf_normal_mb_v_cl);
extern prototype_loopfilter_block_cl(vp8_lf_normal_b_v_cl);
extern prototype_loopfilter_block_cl(vp8_lf_normal_mb_h_cl);
extern prototype_loopfilter_block_cl(vp8_lf_normal_b_h_cl);
extern prototype_loopfilter_block_cl(vp8_lf_simple_mb_v_cl);
extern prototype_loopfilter_block_cl(vp8_lf_simple_b_v_cl);
extern prototype_loopfilter_block_cl(vp8_lf_simple_mb_h_cl);
extern prototype_loopfilter_block_cl(vp8_lf_simple_b_h_cl);
typedef prototype_loopfilter_block_cl((*vp8_lf_block_cl_fn_t));
#endif

View File

@@ -0,0 +1,187 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdlib.h>
#include <stdio.h>
#include "vpx_ports/config.h"
#include "vp8_opencl.h"
#include "blockd_cl.h"
//#include "loopfilter_cl.h"
//#include "../onyxc_int.h"
typedef unsigned char uc;
static void vp8_loop_filter_cl_run(
cl_command_queue cq,
cl_kernel kernel,
cl_mem buf_mem,
int s_off,
int p,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count,
int block_cnt
){
size_t global[] = {count,block_cnt};
int err;
cl_mem flimit_mem;
cl_mem limit_mem;
cl_mem thresh_mem;
VP8_CL_CREATE_BUF(cq, flimit_mem, , sizeof(uc)*16, flimit,, );
VP8_CL_CREATE_BUF(cq, limit_mem, , sizeof(uc)*16, limit,, );
VP8_CL_CREATE_BUF(cq, thresh_mem, , sizeof(uc)*16, thresh,, );
err = 0;
err = clSetKernelArg(kernel, 0, sizeof (cl_mem), &buf_mem);
err |= clSetKernelArg(kernel, 1, sizeof (cl_int), &s_off);
err |= clSetKernelArg(kernel, 2, sizeof (cl_int), &p);
err |= clSetKernelArg(kernel, 3, sizeof (cl_mem), &flimit_mem);
err |= clSetKernelArg(kernel, 4, sizeof (cl_mem), &limit_mem);
err |= clSetKernelArg(kernel, 5, sizeof (cl_mem), &thresh_mem);
err |= clSetKernelArg(kernel, 6, sizeof (cl_int), &block_cnt);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",,
);
/* Execute the kernel */
err = clEnqueueNDRangeKernel(cq, kernel, 2, NULL, global, NULL , 0, NULL, NULL);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
printf("err = %d\n",err);,
);
clReleaseMemObject(flimit_mem);
clReleaseMemObject(limit_mem);
clReleaseMemObject(thresh_mem);
VP8_CL_FINISH(cq);
}
void vp8_loop_filter_horizontal_edge_cl
(
MACROBLOCKD *x,
cl_mem s_base,
int s_off,
int p, /* pitch */
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count,
int block_cnt
)
{
vp8_loop_filter_cl_run(x->cl_commands,
cl_data.vp8_loop_filter_horizontal_edge_kernel, s_base, s_off,
p, flimit, limit, thresh, count*8, block_cnt
);
}
void vp8_loop_filter_vertical_edge_cl
(
MACROBLOCKD *x,
cl_mem s_base,
int s_off,
int p,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count,
int block_cnt
)
{
vp8_loop_filter_cl_run(x->cl_commands,
cl_data.vp8_loop_filter_vertical_edge_kernel, s_base, s_off,
p, flimit, limit, thresh, count*8, block_cnt
);
}
void vp8_mbloop_filter_horizontal_edge_cl
(
MACROBLOCKD *x,
cl_mem s_base,
int s_off,
int p,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count,
int block_cnt
)
{
vp8_loop_filter_cl_run(x->cl_commands,
cl_data.vp8_mbloop_filter_horizontal_edge_kernel, s_base, s_off,
p, flimit, limit, thresh, count*8, block_cnt
);
}
void vp8_mbloop_filter_vertical_edge_cl
(
MACROBLOCKD *x,
cl_mem s_base,
int s_off,
int p,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count,
int block_cnt
)
{
vp8_loop_filter_cl_run(x->cl_commands,
cl_data.vp8_mbloop_filter_vertical_edge_kernel, s_base, s_off,
p, flimit, limit, thresh, count*8, block_cnt
);
}
void vp8_loop_filter_simple_horizontal_edge_cl
(
MACROBLOCKD *x,
cl_mem s_base,
int s_off,
int p,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count,
int block_cnt
)
{
vp8_loop_filter_cl_run(x->cl_commands,
cl_data.vp8_loop_filter_simple_horizontal_edge_kernel, s_base, s_off,
p, flimit, limit, thresh, count*8, block_cnt
);
}
void vp8_loop_filter_simple_vertical_edge_cl
(
MACROBLOCKD *x,
cl_mem s_base,
int s_off,
int p,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count,
int block_cnt
)
{
vp8_loop_filter_cl_run(x->cl_commands,
cl_data.vp8_loop_filter_simple_vertical_edge_kernel, s_base, s_off,
p, flimit, limit, thresh, count*8, block_cnt
);
}

View File

@@ -0,0 +1,41 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "../subpixel.h"
#include "subpixel_cl.h"
#include "../onyxc_int.h"
#include "vp8_opencl.h"
#if HAVE_DLOPEN
#include "dynamic_cl.h"
#endif
void vp8_arch_opencl_common_init(VP8_COMMON *ctx)
{
#if HAVE_DLOPEN
#if WIN32 //Windows .dll has no lib prefix and no extension
cl_loaded = load_cl("OpenCL");
#else //But *nix needs full name
cl_loaded = load_cl("libOpenCL.so");
#endif
if (cl_loaded == CL_SUCCESS)
cl_initialized = cl_common_init();
else
cl_initialized = VP8_CL_TRIED_BUT_FAILED;
#else //!HAVE_DLOPEN (e.g. Apple)
cl_initialized = cl_common_init();
#endif
}

View File

@@ -0,0 +1,641 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
//for the decoder, all subpixel prediction is done in this file.
//
//Need to determine some sort of mechanism for easily determining SIXTAP/BILINEAR
//and what arguments to feed into the kernels. These kernels SHOULD be 2-pass,
//and ideally there'd be a data structure that determined what static arguments
//to pass in.
//
//Also, the only external functions being called here are the subpixel prediction
//functions. Hopefully this means no worrying about when to copy data back/forth.
#include "../../../vpx_ports/config.h"
//#include "../recon.h"
#include "../subpixel.h"
//#include "../blockd.h"
//#include "../reconinter.h"
#if CONFIG_RUNTIME_CPU_DETECT
//#include "../onyxc_int.h"
#endif
#include "vp8_opencl.h"
#include "filter_cl.h"
#include "reconinter_cl.h"
#include "blockd_cl.h"
#include <stdio.h>
/* use this define on systems where unaligned int reads and writes are
* not allowed, i.e. ARM architectures
*/
/*#define MUST_BE_ALIGNED*/
static const int bbb[4] = {0, 2, 8, 10};
static void vp8_memcpy(
unsigned char *src_base,
int src_offset,
int src_stride,
unsigned char *dst_base,
int dst_offset,
int dst_stride,
int num_bytes,
int num_iter
){
int i,r;
unsigned char *src = &src_base[src_offset];
unsigned char *dst = &dst_base[dst_offset];
src_offset = dst_offset = 0;
for (r = 0; r < num_iter; r++){
for (i = 0; i < num_bytes; i++){
src_offset = r*src_stride + i;
dst_offset = r*dst_stride + i;
dst[dst_offset] = src[src_offset];
}
}
}
static void vp8_copy_mem_cl(
cl_command_queue cq,
cl_mem src_mem,
int *src_offsets,
int src_stride,
cl_mem dst_mem,
int *dst_offsets,
int dst_stride,
int num_bytes,
int num_iter,
int num_blocks
){
int err,block;
#if MEM_COPY_KERNEL
size_t global[3] = {num_bytes, num_iter, num_blocks};
size_t local[3];
local[0] = global[0];
local[1] = global[1];
local[2] = global[2];
err = clSetKernelArg(cl_data.vp8_memcpy_kernel, 0, sizeof (cl_mem), &src_mem);
err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 2, sizeof (int), &src_stride);
err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 3, sizeof (cl_mem), &dst_mem);
err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 5, sizeof (int), &dst_stride);
err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 6, sizeof (int), &num_bytes);
err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 7, sizeof (int), &num_iter);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
return,
);
for (block = 0; block < num_blocks; block++){
/* Set kernel arguments */
err = clSetKernelArg(cl_data.vp8_memcpy_kernel, 1, sizeof (int), &src_offsets[block]);
err |= clSetKernelArg(cl_data.vp8_memcpy_kernel, 4, sizeof (int), &dst_offsets[block]);
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to set kernel arguments!\n",
return,
);
/* Execute the kernel */
if (num_bytes * num_iter > cl_data.vp8_memcpy_kernel_size){
err = clEnqueueNDRangeKernel( cq, cl_data.vp8_memcpy_kernel, 2, NULL, global, NULL , 0, NULL, NULL);
} else {
err = clEnqueueNDRangeKernel( cq, cl_data.vp8_memcpy_kernel, 2, NULL, global, local , 0, NULL, NULL);
}
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS,
"Error: Failed to execute kernel!\n",
return,
);
}
#else
int iter;
for (block=0; block < num_blocks; block++){
for (iter = 0; iter < num_iter; iter++){
err = clEnqueueCopyBuffer(cq, src_mem, dst_mem,
src_offsets[block]+iter*src_stride,
dst_offsets[block]+iter*dst_stride,
num_bytes, 0, NULL, NULL
);
VP8_CL_CHECK_SUCCESS(cq, err != CL_SUCCESS, "Error copying between buffers\n",
,
);
}
}
#endif
}
static void vp8_build_inter_predictors_b_cl(MACROBLOCKD *x, BLOCKD *d, int pitch)
{
unsigned char *ptr_base = *(d->base_pre);
int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
vp8_subpix_cl_fn_t sppf;
int pre_dist = *d->base_pre - x->pre.buffer_alloc;
cl_mem pre_mem = x->pre.buffer_mem;
int pre_off = pre_dist+ptr_offset;
if (d->sixtap_filter == CL_TRUE)
sppf = vp8_sixtap_predict4x4_cl;
else
sppf = vp8_bilinear_predict4x4_cl;
//ptr_base a.k.a. d->base_pre is the start of the
//Macroblock's y_buffer, u_buffer, or v_buffer
if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
{
sppf(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch);
}
else
{
vp8_copy_mem_cl(d->cl_commands, pre_mem, &pre_off, d->pre_stride,d->cl_predictor_mem, &d->predictor_offset,pitch,4,4,1);
}
}
static void vp8_build_inter_predictors4b_cl(MACROBLOCKD *x, BLOCKD *d, int pitch)
{
unsigned char *ptr_base = *(d->base_pre);
int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
int pre_dist = *d->base_pre - x->pre.buffer_alloc;
cl_mem pre_mem = x->pre.buffer_mem;
int pre_off = pre_dist + ptr_offset;
//If there's motion in the bottom 8 subpixels, need to do subpixel prediction
if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
{
if (d->sixtap_filter == CL_TRUE)
vp8_sixtap_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch);
else
vp8_bilinear_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch);
}
//Otherwise copy memory directly from src to dest
else
{
vp8_copy_mem_cl(d->cl_commands, pre_mem, &pre_off, d->pre_stride, d->cl_predictor_mem, &d->predictor_offset, pitch, 8, 8, 1);
}
}
static void vp8_build_inter_predictors2b_cl(MACROBLOCKD *x, BLOCKD *d, int pitch)
{
unsigned char *ptr_base = *(d->base_pre);
int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
int pre_dist = *d->base_pre - x->pre.buffer_alloc;
cl_mem pre_mem = x->pre.buffer_mem;
int pre_off = pre_dist+ptr_offset;
if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
{
if (d->sixtap_filter == CL_TRUE)
vp8_sixtap_predict8x4_cl(d->cl_commands,ptr_base,pre_mem,pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch);
else
vp8_bilinear_predict8x4_cl(d->cl_commands,ptr_base,pre_mem,pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, d->predictor_base, d->cl_predictor_mem, d->predictor_offset, pitch);
}
else
{
vp8_copy_mem_cl(d->cl_commands, pre_mem, &pre_off, d->pre_stride, d->cl_predictor_mem, &d->predictor_offset, pitch, 8, 4, 1);
}
}
void vp8_build_inter_predictors_mbuv_cl(MACROBLOCKD *x)
{
int i;
vp8_cl_mb_prep(x, PREDICTOR|PRE_BUF);
#if !ONE_CQ_PER_MB
VP8_CL_FINISH(x->cl_commands);
#endif
if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
x->mode_info_context->mbmi.mode != SPLITMV)
{
unsigned char *pred_base = x->predictor;
int upred_offset = 256;
int vpred_offset = 320;
int mv_row = x->block[16].bmi.mv.as_mv.row;
int mv_col = x->block[16].bmi.mv.as_mv.col;
int offset;
unsigned char *pre_base = x->pre.buffer_alloc;
cl_mem pre_mem = x->pre.buffer_mem;
int upre_off = x->pre.u_buffer - pre_base;
int vpre_off = x->pre.v_buffer - pre_base;
int pre_stride = x->block[16].pre_stride;
offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
if ((mv_row | mv_col) & 7)
{
if (cl_initialized == CL_SUCCESS && x->sixtap_filter == CL_TRUE){
vp8_sixtap_predict8x8_cl(x->block[16].cl_commands,pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
vp8_sixtap_predict8x8_cl(x->block[20].cl_commands,pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
}
else{
vp8_bilinear_predict8x8_cl(x->block[16].cl_commands,pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
vp8_bilinear_predict8x8_cl(x->block[20].cl_commands,pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
}
}
else
{
int pre_offsets[2] = {upre_off+offset, vpre_off+offset};
int pred_offsets[2] = {upred_offset,vpred_offset};
vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, pre_offsets, pre_stride, x->cl_predictor_mem, pred_offsets, 8, 8, 8, 2);
}
}
else
{
// Can probably batch these operations as well, but not tested in decoder
// (or at least the test videos I've been using.
for (i = 16; i < 24; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
vp8_build_inter_predictors2b_cl(x, d0, 8);
else
{
vp8_build_inter_predictors_b_cl(x, d0, 8);
vp8_build_inter_predictors_b_cl(x, d1, 8);
}
}
}
#if !ONE_CQ_PER_MB
VP8_CL_FINISH(x->block[0].cl_commands);
VP8_CL_FINISH(x->block[16].cl_commands);
VP8_CL_FINISH(x->block[20].cl_commands);
#endif
vp8_cl_mb_finish(x, PREDICTOR);
}
void vp8_build_inter_predictors_mb_cl(MACROBLOCKD *x)
{
//If CL is running in encoder, need to call following before proceeding.
//vp8_cl_mb_prep(x, PRE_BUF);
#if !ONE_CQ_PER_MB
VP8_CL_FINISH(x->cl_commands);
#endif
if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
x->mode_info_context->mbmi.mode != SPLITMV)
{
int offset;
unsigned char *pred_base = x->predictor;
int upred_offset = 256;
int vpred_offset = 320;
int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
int pre_stride = x->block[0].pre_stride;
unsigned char *pre_base = x->pre.buffer_alloc;
cl_mem pre_mem = x->pre.buffer_mem;
int ypre_off = x->pre.y_buffer - pre_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
int upre_off = x->pre.u_buffer - pre_base;
int vpre_off = x->pre.v_buffer - pre_base;
if ((mv_row | mv_col) & 7)
{
if (cl_initialized == CL_SUCCESS && x->sixtap_filter == CL_TRUE){
vp8_sixtap_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, 0, 16);
}
else
vp8_bilinear_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, 0, 16);
}
else
{
//16x16 copy
int pred_off = 0;
vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &ypre_off, pre_stride, x->cl_predictor_mem, &pred_off, 16, 16, 16, 1);
}
mv_row = x->block[16].bmi.mv.as_mv.row;
mv_col = x->block[16].bmi.mv.as_mv.col;
pre_stride >>= 1;
offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
if ((mv_row | mv_col) & 7)
{
if (x->sixtap_filter == CL_TRUE){
vp8_sixtap_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
vp8_sixtap_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
}
else {
vp8_bilinear_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, upred_offset, 8);
vp8_bilinear_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, pred_base, x->cl_predictor_mem, vpred_offset, 8);
}
}
else
{
int pre_off = upre_off + offset;
vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, &pre_off, pre_stride, x->cl_predictor_mem, &upred_offset, 8, 8, 8, 1);
pre_off = vpre_off + offset;
vp8_copy_mem_cl(x->block[20].cl_commands, pre_mem, &pre_off, pre_stride, x->cl_predictor_mem, &vpred_offset, 8, 8, 8, 1);
}
}
else
{
int i;
if (x->mode_info_context->mbmi.partitioning < 3)
{
for (i = 0; i < 4; i++)
{
BLOCKD *d = &x->block[bbb[i]];
vp8_build_inter_predictors4b_cl(x, d, 16);
}
}
else
{
/* This loop can be done in any order... No dependencies.*/
/* Also, d0/d1 can be decoded simultaneously */
for (i = 0; i < 16; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
vp8_build_inter_predictors2b_cl(x, d0, 16);
else
{
vp8_build_inter_predictors_b_cl(x, d0, 16);
vp8_build_inter_predictors_b_cl(x, d1, 16);
}
}
}
/* Another case of re-orderable/batchable loop */
for (i = 16; i < 24; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
vp8_build_inter_predictors2b_cl(x, d0, 8);
else
{
vp8_build_inter_predictors_b_cl(x, d0, 8);
vp8_build_inter_predictors_b_cl(x, d1, 8);
}
}
}
#if !ONE_CQ_PER_MB
VP8_CL_FINISH(x->block[0].cl_commands);
VP8_CL_FINISH(x->block[16].cl_commands);
VP8_CL_FINISH(x->block[20].cl_commands);
#endif
vp8_cl_mb_finish(x, PREDICTOR);
}
/* The following functions are written for skip_recon_mb() to call. Since there is no recon in this
* situation, we can write the result directly to dst buffer instead of writing it to predictor
* buffer and then copying it to dst buffer.
*/
static void vp8_build_inter_predictors_b_s_cl(MACROBLOCKD *x, BLOCKD *d, int dst_offset)
{
unsigned char *ptr_base = *(d->base_pre);
int dst_stride = d->dst_stride;
int pre_stride = d->pre_stride;
int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
vp8_subpix_cl_fn_t sppf;
int pre_dist = *d->base_pre - x->pre.buffer_alloc;
cl_mem pre_mem = x->pre.buffer_mem;
cl_mem dst_mem = x->dst.buffer_mem;
if (d->sixtap_filter == CL_TRUE){
sppf = vp8_sixtap_predict4x4_cl;
} else
sppf = vp8_bilinear_predict4x4_cl;
if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
{
sppf(d->cl_commands, ptr_base, pre_mem, pre_dist+ptr_offset, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, NULL, dst_mem, dst_offset, dst_stride);
}
else
{
int pre_off = pre_dist+ptr_offset;
vp8_copy_mem_cl(d->cl_commands, pre_mem,&pre_off,pre_stride, dst_mem, &dst_offset,dst_stride,4,4,1);
}
}
void vp8_build_inter_predictors_mb_s_cl(MACROBLOCKD *x)
{
cl_mem dst_mem = NULL;
cl_mem pre_mem = x->pre.buffer_mem;
unsigned char *dst_base = x->dst.buffer_alloc;
int ydst_off = x->dst.y_buffer - dst_base;
int udst_off = x->dst.u_buffer - dst_base;
int vdst_off = x->dst.v_buffer - dst_base;
dst_mem = x->dst.buffer_mem;
vp8_cl_mb_prep(x, DST_BUF);
#if !ONE_CQ_PER_MB
VP8_CL_FINISH(x->cl_commands);
#endif
if (x->mode_info_context->mbmi.mode != SPLITMV)
{
int offset;
unsigned char *pre_base = x->pre.buffer_alloc;
int ypre_off = x->pre.y_buffer - pre_base;
int upre_off = x->pre.u_buffer - pre_base;
int vpre_off = x->pre.v_buffer - pre_base;
int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
int pre_stride = x->dst.y_stride;
int ptr_offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
if ((mv_row | mv_col) & 7)
{
if (x->sixtap_filter == CL_TRUE){
vp8_sixtap_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off+ptr_offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
}
else
vp8_bilinear_predict16x16_cl(x->block[0].cl_commands, pre_base, pre_mem, ypre_off+ptr_offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
}
else
{
int pre_off = ypre_off+ptr_offset;
vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 16, 16, 1);
}
mv_row = x->block[16].bmi.mv.as_mv.row;
mv_col = x->block[16].bmi.mv.as_mv.col;
pre_stride >>= 1;
offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
if ((mv_row | mv_col) & 7)
{
if (x->sixtap_filter == CL_TRUE){
vp8_sixtap_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, udst_off, x->dst.uv_stride);
vp8_sixtap_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, vdst_off, x->dst.uv_stride);
} else {
vp8_bilinear_predict8x8_cl(x->block[16].cl_commands, pre_base, pre_mem, upre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, udst_off, x->dst.uv_stride);
vp8_bilinear_predict8x8_cl(x->block[20].cl_commands, pre_base, pre_mem, vpre_off+offset, pre_stride, mv_col & 7, mv_row & 7, dst_base, dst_mem, vdst_off, x->dst.uv_stride);
}
}
else
{
int pre_offsets[2] = {upre_off+offset, vpre_off+offset};
int dst_offsets[2] = {udst_off,vdst_off};
vp8_copy_mem_cl(x->block[16].cl_commands, pre_mem, pre_offsets, pre_stride, dst_mem, dst_offsets, x->dst.uv_stride, 8, 8, 2);
}
}
else
{
/* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later,
* if sth is wrong, go back to what it is in build_inter_predictors_mb.
*
* ACW: Not sure who the above comment belongs to, but it is
* accurate for the decoder. Verified by reverse trace of source
*/
int i;
if (x->mode_info_context->mbmi.partitioning < 3)
{
for (i = 0; i < 4; i++)
{
BLOCKD *d = &x->block[bbb[i]];
{
unsigned char *ptr_base = *(d->base_pre);
int pre_off = ptr_base - x->pre.buffer_alloc;
int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
pre_off += ptr_offset;
if ( (d->bmi.mv.as_mv.row | d->bmi.mv.as_mv.col) & 7)
{
if (x->sixtap_filter == CL_TRUE)
vp8_sixtap_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
else
vp8_bilinear_predict8x8_cl(d->cl_commands, ptr_base, pre_mem, pre_off, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
}
else
{
vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, d->pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 8, 8, 1);
}
}
}
}
else
{
for (i = 0; i < 16; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
{
/*vp8_build_inter_predictors2b(x, d0, 16);*/
unsigned char *ptr_base = *(d0->base_pre);
int pre_off = ptr_base - x->pre.buffer_alloc;
int ptr_offset = d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
pre_off += ptr_offset;
if ( (d0->bmi.mv.as_mv.row | d0->bmi.mv.as_mv.col) & 7)
{
if (d0->sixtap_filter == CL_TRUE)
vp8_sixtap_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
else
vp8_bilinear_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem,pre_off, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_base, dst_mem, ydst_off, x->dst.y_stride);
}
else
{
vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off, d0->pre_stride, dst_mem, &ydst_off, x->dst.y_stride, 8, 4, 1);
}
}
else
{
vp8_build_inter_predictors_b_s_cl(x,d0, ydst_off);
vp8_build_inter_predictors_b_s_cl(x,d1, ydst_off);
}
}
}
for (i = 16; i < 24; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
{
/*vp8_build_inter_predictors2b(x, d0, 8);*/
unsigned char *ptr_base = *(d0->base_pre);
int ptr_offset = d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
int pre_off = ptr_base - x->pre.buffer_alloc + ptr_offset;
if ( (d0->bmi.mv.as_mv.row | d0->bmi.mv.as_mv.col) & 7)
{
if (d0->sixtap_filter || CL_TRUE)
vp8_sixtap_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride,
d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7,
dst_base, dst_mem, ydst_off, x->dst.uv_stride);
else
vp8_bilinear_predict8x4_cl(d0->cl_commands, ptr_base, pre_mem, pre_off, d0->pre_stride,
d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7,
dst_base, dst_mem, ydst_off, x->dst.uv_stride);
}
else
{
vp8_copy_mem_cl(x->block[0].cl_commands, pre_mem, &pre_off,
d0->pre_stride, dst_mem, &ydst_off, x->dst.uv_stride, 8, 4, 1);
}
}
else
{
vp8_build_inter_predictors_b_s_cl(x,d0, ydst_off);
vp8_build_inter_predictors_b_s_cl(x,d1, ydst_off);
}
} //end for
}
#if !ONE_CQ_PER_MB
VP8_CL_FINISH(x->block[0].cl_commands);
VP8_CL_FINISH(x->block[16].cl_commands);
VP8_CL_FINISH(x->block[20].cl_commands);
#endif
vp8_cl_mb_finish(x, DST_BUF);
}

View File

@@ -0,0 +1,25 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef __INC_RECONINTER_CL_H
#define __INC_RECONINTER_CL_H
#include "blockd_cl.h"
#include "subpixel_cl.h"
#include "filter_cl.h"
extern void vp8_build_inter_predictors_mb_cl(MACROBLOCKD *x);
extern void vp8_build_inter_predictors_mbuv_cl(MACROBLOCKD *x);
extern void vp8_build_inter_predictors_mb_s_cl(MACROBLOCKD *x);
//extern void vp8_build_inter_predictors_b_cl(BLOCKD *d, int pitch);
#endif

View File

@@ -0,0 +1,46 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef SUBPIXEL_CL_H
#define SUBPIXEL_CL_H
#include "../blockd.h"
/* Note:
*
* This platform is commonly built for runtime CPU detection. If you modify
* any of the function mappings present in this file, be sure to also update
* them in the function pointer initialization code
*/
#define prototype_subpixel_predict_cl(sym) \
void sym(cl_command_queue cq, unsigned char *src_base, cl_mem src_mem, int src_offset, \
int src_pitch, int xofst, int yofst, \
unsigned char *dst_base, cl_mem dst_mem, int dst_offset, int dst_pitch)
extern prototype_subpixel_predict_cl(vp8_sixtap_predict16x16_cl);
extern prototype_subpixel_predict_cl(vp8_sixtap_predict8x8_cl);
extern prototype_subpixel_predict_cl(vp8_sixtap_predict8x4_cl);
extern prototype_subpixel_predict_cl(vp8_sixtap_predict4x4_cl);
extern prototype_subpixel_predict_cl(vp8_bilinear_predict16x16_cl);
extern prototype_subpixel_predict_cl(vp8_bilinear_predict8x8_cl);
extern prototype_subpixel_predict_cl(vp8_bilinear_predict8x4_cl);
extern prototype_subpixel_predict_cl(vp8_bilinear_predict4x4_cl);
typedef prototype_subpixel_predict_cl((*vp8_subpix_cl_fn_t));
//typedef enum
//{
// SIXTAP = 0,
// BILINEAR = 1
//} SUBPIX_TYPE;
#endif

View File

@@ -0,0 +1,342 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include "vp8_opencl.h"
int cl_initialized = VP8_CL_NOT_INITIALIZED;
VP8_COMMON_CL cl_data;
//Initialization functions for various CL programs.
extern int cl_init_filter();
extern int cl_init_idct();
extern int cl_init_loop_filter();
//Common CL destructors
extern void cl_destroy_loop_filter();
extern void cl_destroy_filter();
extern void cl_destroy_idct();
//Destructors for encoder/decoder-specific bits
extern void cl_decode_destroy();
extern void cl_encode_destroy();
/**
*
* @param cq
* @param new_status
*/
void cl_destroy(cl_command_queue cq, int new_status) {
if (cl_initialized != CL_SUCCESS)
return;
//Wait on any pending operations to complete... frees up all of our pointers
if (cq != NULL)
clFinish(cq);
#if ENABLE_CL_SUBPIXEL
//Release the objects that we've allocated on the GPU
cl_destroy_filter();
#endif
#if ENABLE_CL_IDCT_DEQUANT
cl_destroy_idct();
#if CONFIG_VP8_DECODER
if (cl_data.cl_decode_initialized == CL_SUCCESS)
cl_decode_destroy();
#endif
#endif
#if ENABLE_CL_LOOPFILTER
cl_destroy_loop_filter();
#endif
#if CONFIG_VP8_ENCODER
//placeholder for if/when encoder CL gets implemented
#endif
if (cq){
clReleaseCommandQueue(cq);
}
if (cl_data.context){
clReleaseContext(cl_data.context);
cl_data.context = NULL;
}
cl_initialized = new_status;
return;
}
/**
*
* @param dev
* @return
*/
cl_device_type device_type(cl_device_id dev){
cl_device_type type;
int err;
err = clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(type),&type,NULL);
if (err != CL_SUCCESS)
return CL_INVALID_DEVICE;
return type;
}
/**
*
* @return
*/
int cl_common_init() {
int err,i,dev;
cl_platform_id platform_ids[MAX_NUM_PLATFORMS];
cl_uint num_found, num_devices;
cl_device_id devices[MAX_NUM_DEVICES];
//Don't allow multiple CL contexts..
if (cl_initialized != VP8_CL_NOT_INITIALIZED)
return cl_initialized;
// Connect to a compute device
err = clGetPlatformIDs(MAX_NUM_PLATFORMS, platform_ids, &num_found);
if (err != CL_SUCCESS) {
fprintf(stderr, "Couldn't query platform IDs\n");
return VP8_CL_TRIED_BUT_FAILED;
}
if (num_found == 0) {
fprintf(stderr, "No platforms found\n");
return VP8_CL_TRIED_BUT_FAILED;
}
//printf("Enumerating %d platform(s)\n", num_found);
//Enumerate the platforms found
for (i = 0; i < num_found; i++){
char buf[2048];
size_t len;
err = clGetPlatformInfo( platform_ids[i], CL_PLATFORM_VENDOR, sizeof(buf), buf, &len);
if (err != CL_SUCCESS){
fprintf(stderr, "Error retrieving platform vendor for platform %d",i);
continue;
}
//printf("Platform %d: %s\n",i,buf);
//If you need to force a platform (e.g. CPU-only testing), uncomment this
//if (strstr(buf,"NVIDIA"))
// continue;
//Try to find a valid compute device
//Favor the GPU, but fall back to any other available device if necessary
#ifdef __APPLE__
printf("Apple system. Running CL as CPU-only for now...\n");
err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_CPU, MAX_NUM_DEVICES, devices, &num_devices);
#else
err = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, MAX_NUM_DEVICES, devices, &num_devices);
#endif //__APPLE__
//printf("found %d devices\n", num_devices);
cl_data.device_id = NULL;
for( dev = 0; dev < num_devices; dev++ ){
char ext[2048];
//Get info for this device.
err = clGetDeviceInfo(devices[dev], CL_DEVICE_EXTENSIONS,
sizeof(ext),ext,NULL);
VP8_CL_CHECK_SUCCESS(NULL,err != CL_SUCCESS,
"Error retrieving device extension list",continue, 0);
//printf("Device %d supports: %s\n",dev,ext);
//The kernels in VP8 require byte-addressable stores, which is an
//extension. It's required in OpenCL 1.1, but not all devices
//support it.
if (strstr(ext,"cl_khr_byte_addressable_store")){
//We found a valid device, so use it. But if we find a GPU
//(maybe this is one), prefer that.
cl_data.device_id = devices[dev];
if ( device_type(devices[dev]) == CL_DEVICE_TYPE_GPU ){
//printf("Device %d is a GPU\n",dev);
break;
}
}
}
//If we've found a usable GPU, stop looking.
if (cl_data.device_id != NULL && device_type(cl_data.device_id) == CL_DEVICE_TYPE_GPU )
break;
}
if (cl_data.device_id == NULL){
printf("Error: Failed to find a valid OpenCL device. Using CPU paths\n");
return VP8_CL_TRIED_BUT_FAILED;
}
// Create the compute context
cl_data.context = clCreateContext(0, 1, &cl_data.device_id, NULL, NULL, &err);
if (!cl_data.context) {
printf("Error: Failed to create a compute context!\n");
return VP8_CL_TRIED_BUT_FAILED;
}
//Initialize programs to null value
//Enables detection of if they've been initialized as well.
cl_data.filter_program = NULL;
cl_data.idct_program = NULL;
cl_data.loop_filter_program = NULL;
#if ENABLE_CL_SUBPIXEL
err = cl_init_filter();
if (err != CL_SUCCESS)
return err;
#endif
#if ENABLE_CL_IDCT_DEQUANT
err = cl_init_idct();
if (err != CL_SUCCESS)
return err;
#endif
#if ENABLE_CL_LOOPFILTER
err = cl_init_loop_filter();
if (err != CL_SUCCESS)
return err;
#endif
return CL_SUCCESS;
}
char *cl_read_file(const char* file_name) {
long pos;
char *bytes;
size_t amt_read;
FILE *f;
f = fopen(file_name, "rb");
if (f == NULL) {
char *fullpath;
//printf("Couldn't find %s\n", file_name);
//Generate a file path for the CL sources using the library install dir
fullpath = malloc(strlen(vpx_codec_lib_dir()) + strlen(file_name) + 2);
if (fullpath == NULL) {
return NULL;
}
strcpy(fullpath, vpx_codec_lib_dir());
strcat(fullpath, "/"); //Will need to be changed for MSVS
strcat(fullpath, file_name);
//printf("Looking in %s\n", fullpath);
f = fopen(fullpath, "rb");
if (f == NULL) {
fprintf(stderr,"Couldn't find CL source at %s or %s\n", file_name, fullpath);
free(fullpath);
return NULL;
}
//printf("Found cl source at %s\n", fullpath);
free(fullpath);
} else {
//printf("Found cl source at %s\n", file_name);
}
fseek(f, 0, SEEK_END);
pos = ftell(f);
fseek(f, 0, SEEK_SET);
bytes = malloc(pos+1);
if (bytes == NULL) {
fclose(f);
return NULL;
}
amt_read = fread(bytes, pos, 1, f);
if (amt_read != 1) {
free(bytes);
fclose(f);
return NULL;
}
bytes[pos] = '\0'; //null terminate the source string
fclose(f);
return bytes;
}
void show_build_log(cl_program *prog_ref){
size_t len;
char *buffer;
int err = clGetProgramBuildInfo(*prog_ref, cl_data.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
if (err != CL_SUCCESS){
printf("Error: Could not get length of CL build log\n");
}
buffer = (char*) malloc(len);
if (buffer == NULL) {
printf("Error: Couldn't allocate compile output buffer memory\n");
}
err = clGetProgramBuildInfo(*prog_ref, cl_data.device_id, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
if (err != CL_SUCCESS) {
printf("Error: Could not get CL build log\n");
} else {
printf("Compile output: %s\n", buffer);
}
free(buffer);
}
int cl_load_program(cl_program *prog_ref, const char *file_name, const char *opts) {
int err;
char *kernel_src = cl_read_file(file_name);
*prog_ref = NULL;
if (kernel_src != NULL) {
*prog_ref = clCreateProgramWithSource(cl_data.context, 1, (const char**)&kernel_src, NULL, &err);
free(kernel_src);
} else {
cl_destroy(NULL, VP8_CL_TRIED_BUT_FAILED);
printf("Couldn't find OpenCL source files. \nUsing software path.\n");
return VP8_CL_TRIED_BUT_FAILED;
}
if (*prog_ref == NULL) {
printf("Error: Couldn't create program\n");
return VP8_CL_TRIED_BUT_FAILED;
}
if (err != CL_SUCCESS) {
printf("Error creating program: %d\n", err);
}
/* Build the program executable */
err = clBuildProgram(*prog_ref, 0, NULL, opts, NULL, NULL);
if (err != CL_SUCCESS) {
printf("Error: Failed to build program executable for %s!\n", file_name);
show_build_log(prog_ref);
return VP8_CL_TRIED_BUT_FAILED;
}
return CL_SUCCESS;
}

View File

@@ -0,0 +1,192 @@
/*
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VP8_OPENCL_H
#define VP8_OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#include "../../../vpx_config.h"
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#if HAVE_DLOPEN
#include "dynamic_cl.h"
#endif
#define ENABLE_CL_IDCT_DEQUANT 0
#define ENABLE_CL_SUBPIXEL 1
#define TWO_PASS_SIXTAP 0
#define MEM_COPY_KERNEL 1
#define ONE_CQ_PER_MB 1 //Value of 0 is racey... still experimental.
#define ENABLE_CL_LOOPFILTER 0
extern char *cl_read_file(const char* file_name);
extern int cl_common_init();
extern void cl_destroy(cl_command_queue cq, int new_status);
extern int cl_load_program(cl_program *prog_ref, const char *file_name, const char *opts);
#define MAX_NUM_PLATFORMS 4
#define MAX_NUM_DEVICES 10
#define VP8_CL_TRIED_BUT_FAILED 1
#define VP8_CL_NOT_INITIALIZED -1
extern int cl_initialized;
extern const char *vpx_codec_lib_dir(void);
#define VP8_CL_FINISH(cq) \
if (cl_initialized == CL_SUCCESS){ \
/* Wait for kernels to finish. */ \
clFinish(cq); \
}
#define VP8_CL_BARRIER(cq) \
if (cl_initialized == CL_SUCCESS){ \
/* Insert a barrier into the command queue. */ \
clEnqueueBarrier(cq); \
}
#define VP8_CL_CHECK_SUCCESS(cq,cond,msg,alt,retCode) \
if ( cond ){ \
fprintf(stderr, msg); \
cl_destroy(cq, VP8_CL_TRIED_BUT_FAILED); \
alt; \
return retCode; \
}
#define VP8_CL_CALC_LOCAL_SIZE(kernel, kernel_size) \
err = clGetKernelWorkGroupInfo( cl_data.kernel, \
cl_data.device_id, \
CL_KERNEL_WORK_GROUP_SIZE, \
sizeof(size_t), \
&cl_data.kernel_size, \
NULL);\
VP8_CL_CHECK_SUCCESS(NULL, err != CL_SUCCESS, \
"Error: Failed to calculate local size of kernel!\n", \
,\
VP8_CL_TRIED_BUT_FAILED \
); \
#define VP8_CL_CREATE_KERNEL(data,program,name,str_name) \
data.name = clCreateKernel(data.program, str_name , &err); \
VP8_CL_CHECK_SUCCESS(NULL, err != CL_SUCCESS || !data.name, \
"Error: Failed to create compute kernel "#str_name"!\n", \
,\
VP8_CL_TRIED_BUT_FAILED \
);
#define VP8_CL_READ_BUF(cq, bufRef, bufSize, dstPtr) \
err = clEnqueueReadBuffer(cq, bufRef, CL_FALSE, 0, bufSize , dstPtr, 0, NULL, NULL); \
VP8_CL_CHECK_SUCCESS( cq, err != CL_SUCCESS, \
"Error: Failed to read from GPU!\n",, err \
); \
#define VP8_CL_SET_BUF(cq, bufRef, bufSize, dataPtr, altPath, retCode) \
{ \
err = clEnqueueWriteBuffer(cq, bufRef, CL_FALSE, 0, \
bufSize, dataPtr, 0, NULL, NULL); \
\
VP8_CL_CHECK_SUCCESS(cq, err != CL_SUCCESS, \
"Error: Failed to write to buffer!\n", \
altPath, retCode\
); \
} \
#define VP8_CL_CREATE_BUF(cq, bufRef, bufType, bufSize, dataPtr, altPath, retCode) \
bufRef = clCreateBuffer(cl_data.context, CL_MEM_READ_WRITE, bufSize, NULL, NULL); \
if (dataPtr != NULL && bufRef != NULL){ \
VP8_CL_SET_BUF(cq, bufRef, bufSize, dataPtr, altPath, retCode)\
} \
VP8_CL_CHECK_SUCCESS(cq, !bufRef, \
"Error: Failed to allocate buffer. Using CPU path!\n", \
altPath, retCode\
); \
#define VP8_CL_RELEASE_KERNEL(kernel) \
if (kernel) \
clReleaseKernel(kernel); \
kernel = NULL;
typedef struct VP8_COMMON_CL {
cl_device_id device_id; // compute device id
cl_context context; // compute context
//cl_command_queue commands; // compute command queue
cl_program filter_program; // compute program for subpixel/bilinear filters
cl_kernel vp8_sixtap_predict_kernel;
size_t vp8_sixtap_predict_kernel_size;
cl_kernel vp8_sixtap_predict8x4_kernel;
size_t vp8_sixtap_predict8x4_kernel_size;
cl_kernel vp8_sixtap_predict8x8_kernel;
size_t vp8_sixtap_predict8x8_kernel_size;
cl_kernel vp8_sixtap_predict16x16_kernel;
size_t vp8_sixtap_predict16x16_kernel_size;
cl_kernel vp8_bilinear_predict4x4_kernel;
cl_kernel vp8_bilinear_predict8x4_kernel;
cl_kernel vp8_bilinear_predict8x8_kernel;
cl_kernel vp8_bilinear_predict16x16_kernel;
cl_kernel vp8_filter_block2d_first_pass_kernel;
size_t vp8_filter_block2d_first_pass_kernel_size;
cl_kernel vp8_filter_block2d_second_pass_kernel;
size_t vp8_filter_block2d_second_pass_kernel_size;
cl_kernel vp8_filter_block2d_bil_first_pass_kernel;
size_t vp8_filter_block2d_bil_first_pass_kernel_size;
cl_kernel vp8_filter_block2d_bil_second_pass_kernel;
size_t vp8_filter_block2d_bil_second_pass_kernel_size;
cl_kernel vp8_memcpy_kernel;
size_t vp8_memcpy_kernel_size;
cl_kernel vp8_memset_short_kernel;
cl_program idct_program;
cl_kernel vp8_short_inv_walsh4x4_1_kernel;
cl_kernel vp8_short_inv_walsh4x4_1st_pass_kernel;
cl_kernel vp8_short_inv_walsh4x4_2nd_pass_kernel;
cl_kernel vp8_dc_only_idct_add_kernel;
//Note that the following 2 kernels are encoder-only. Not used in decoder.
cl_kernel vp8_short_idct4x4llm_1_kernel;
cl_kernel vp8_short_idct4x4llm_kernel;
cl_program loop_filter_program;
cl_kernel vp8_loop_filter_horizontal_edge_kernel;
cl_kernel vp8_loop_filter_vertical_edge_kernel;
cl_kernel vp8_mbloop_filter_horizontal_edge_kernel;
cl_kernel vp8_mbloop_filter_vertical_edge_kernel;
cl_kernel vp8_loop_filter_simple_horizontal_edge_kernel;
cl_kernel vp8_loop_filter_simple_vertical_edge_kernel;
cl_program dequant_program;
cl_kernel vp8_dequant_dc_idct_add_kernel;
cl_kernel vp8_dequant_idct_add_kernel;
cl_kernel vp8_dequantize_b_kernel;
cl_int cl_decode_initialized;
cl_int cl_encode_initialized;
} VP8_COMMON_CL;
extern VP8_COMMON_CL cl_data;
#ifdef __cplusplus
}
#endif
#endif /* VP8_OPENCL_H */

View File

@@ -804,14 +804,11 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
for (j = 0; j < mb_cols; j++)
{
char zz[4];
int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED &&
mi[mb_index].mbmi.mode != SPLITMV &&
mi[mb_index].mbmi.mb_skip_coeff);
if (oci->frame_type == KEY_FRAME)
sprintf(zz, "a");
else
sprintf(zz, "%c", dc_diff + '0');
sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0');
vp8_blit_text(zz, y_ptr, post->y_stride);
mb_index ++;
@@ -837,6 +834,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer;
int width = post->y_width;
int height = post->y_height;
int mb_cols = width >> 4;
unsigned char *y_buffer = oci->post_proc_buffer.y_buffer;
int y_stride = oci->post_proc_buffer.y_stride;
MODE_INFO *mi = oci->mi;
@@ -860,7 +858,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
{
case 0 : /* mv_top_bottom */
{
union b_mode_info *bmi = &mi->bmi[0];
B_MODE_INFO *bmi = &mi->bmi[0];
MV *mv = &bmi->mv.as_mv;
x1 = x0 + 8 + (mv->col >> 3);
@@ -881,7 +879,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
}
case 1 : /* mv_left_right */
{
union b_mode_info *bmi = &mi->bmi[0];
B_MODE_INFO *bmi = &mi->bmi[0];
MV *mv = &bmi->mv.as_mv;
x1 = x0 + 4 + (mv->col >> 3);
@@ -902,7 +900,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
}
case 2 : /* mv_quarters */
{
union b_mode_info *bmi = &mi->bmi[0];
B_MODE_INFO *bmi = &mi->bmi[0];
MV *mv = &bmi->mv.as_mv;
x1 = x0 + 4 + (mv->col >> 3);
@@ -938,7 +936,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
}
default :
{
union b_mode_info *bmi = mi->bmi;
B_MODE_INFO *bmi = mi->bmi;
int bx0, by0;
for (by0 = y0; by0 < (y0+16); by0 += 4)
@@ -1011,7 +1009,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
{
int by, bx;
unsigned char *yl, *ul, *vl;
union b_mode_info *bmi = mi->bmi;
B_MODE_INFO *bmi = mi->bmi;
yl = y_ptr + x;
ul = u_ptr + (x>>1);
@@ -1024,9 +1022,9 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
if ((ppflags->display_b_modes_flag & (1<<mi->mbmi.mode))
|| (ppflags->display_mb_modes_flag & B_PRED))
{
Y = B_PREDICTION_MODE_colors[bmi->as_mode][0];
U = B_PREDICTION_MODE_colors[bmi->as_mode][1];
V = B_PREDICTION_MODE_colors[bmi->as_mode][2];
Y = B_PREDICTION_MODE_colors[bmi->mode][0];
U = B_PREDICTION_MODE_colors[bmi->mode][1];
V = B_PREDICTION_MODE_colors[bmi->mode][2];
POSTPROC_INVOKE(RTCD_VTABLE(oci), blend_b)
(yl+bx, ul+(bx>>1), vl+(bx>>1), Y, U, V, 0xc000, y_stride);

View File

@@ -53,8 +53,9 @@ loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
// Horizontal MB filtering
void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void)simpler_lpf;
mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
if (u_ptr)
@@ -62,8 +63,9 @@ void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
}
void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void)simpler_lpf;
(void)u_ptr;
(void)v_ptr;
(void)uv_stride;
@@ -72,8 +74,9 @@ void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
// Vertical MB Filtering
void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void)simpler_lpf;
mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
if (u_ptr)
@@ -81,8 +84,9 @@ void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
}
void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void)simpler_lpf;
(void)u_ptr;
(void)v_ptr;
(void)uv_stride;
@@ -91,8 +95,9 @@ void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c
// Horizontal B Filtering
void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void)simpler_lpf;
// These should all be done at once with one call, instead of 3
loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
@@ -103,8 +108,9 @@ void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha
}
void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void)simpler_lpf;
(void)u_ptr;
(void)v_ptr;
(void)uv_stride;
@@ -115,8 +121,9 @@ void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch
// Vertical B Filtering
void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void)simpler_lpf;
loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
if (u_ptr)
@@ -124,8 +131,9 @@ void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha
}
void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf)
{
(void)simpler_lpf;
(void)u_ptr;
(void)v_ptr;
(void)uv_stride;

View File

@@ -66,6 +66,7 @@ int vp8_dc2quant(int QIndex, int Delta)
return retval;
}
int vp8_dc_uv_quant(int QIndex, int Delta)
{
int retval;
@@ -116,6 +117,7 @@ int vp8_ac2quant(int QIndex, int Delta)
return retval;
}
int vp8_ac_uv_quant(int QIndex, int Delta)
{
int retval;

View File

@@ -110,19 +110,19 @@ void vp8_recon_mby_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
#if ARCH_ARM
BLOCKD *b = &x->block[0];
RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
/*b = &x->block[4];*/
b += 4;
RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
/*b = &x->block[8];*/
b += 4;
RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
/*b = &x->block[12];*/
b += 4;
RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
#else
int i;
@@ -130,7 +130,7 @@ void vp8_recon_mby_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
BLOCKD *b = &x->block[i];
RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
}
#endif
}
@@ -140,27 +140,27 @@ void vp8_recon_mb_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
#if ARCH_ARM
BLOCKD *b = &x->block[0];
RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b += 4;
RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b += 4;
RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b += 4;
RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b += 4;
/*b = &x->block[16];*/
RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b++;
b++;
RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b++;
b++;
RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
b++;
b++;
RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
#else
int i;
@@ -168,14 +168,14 @@ void vp8_recon_mb_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
BLOCKD *b = &x->block[i];
RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon4)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
}
for (i = 16; i < 24; i += 2)
{
BLOCKD *b = &x->block[i];
RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
}
#endif
}

View File

@@ -26,9 +26,6 @@
#define prototype_build_intra_predictors(sym) \
void sym(MACROBLOCKD *x)
#define prototype_intra4x4_predict(sym) \
void sym(BLOCKD *x, int b_mode, unsigned char *predictor)
struct vp8_recon_rtcd_vtable;
#if ARCH_X86 || ARCH_X86_64
@@ -91,30 +88,11 @@ extern prototype_build_intra_predictors\
extern prototype_build_intra_predictors\
(vp8_recon_build_intra_predictors_mby_s);
#ifndef vp8_recon_build_intra_predictors_mbuv
#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv
#endif
extern prototype_build_intra_predictors\
(vp8_recon_build_intra_predictors_mbuv);
#ifndef vp8_recon_build_intra_predictors_mbuv_s
#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s
#endif
extern prototype_build_intra_predictors\
(vp8_recon_build_intra_predictors_mbuv_s);
#ifndef vp8_recon_intra4x4_predict
#define vp8_recon_intra4x4_predict vp8_intra4x4_predict
#endif
extern prototype_intra4x4_predict\
(vp8_recon_intra4x4_predict);
typedef prototype_copy_block((*vp8_copy_block_fn_t));
typedef prototype_recon_block((*vp8_recon_fn_t));
typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t));
typedef struct vp8_recon_rtcd_vtable
{
vp8_copy_block_fn_t copy16x16;
@@ -127,9 +105,6 @@ typedef struct vp8_recon_rtcd_vtable
vp8_recon_mb_fn_t recon_mby;
vp8_build_intra_pred_fn_t build_intra_predictors_mby_s;
vp8_build_intra_pred_fn_t build_intra_predictors_mby;
vp8_build_intra_pred_fn_t build_intra_predictors_mbuv_s;
vp8_build_intra_pred_fn_t build_intra_predictors_mbuv;
vp8_intra4x4_pred_fn_t intra4x4_predict;
} vp8_recon_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT

View File

@@ -8,9 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "vpx_ports/config.h"
#include "vpx/vpx_integer.h"
#include "recon.h"
#include "subpixel.h"
#include "blockd.h"
@@ -19,10 +17,22 @@
#include "onyxc_int.h"
#endif
#if CONFIG_OPENCL
#include "opencl/vp8_opencl.h"
#include "opencl/filter_cl.h"
#include "opencl/reconinter_cl.h"
#endif
/* use this define on systems where unaligned int reads and writes are
* not allowed, i.e. ARM architectures
*/
/*#define MUST_BE_ALIGNED*/
static const int bbb[4] = {0, 2, 8, 10};
//Copy 16 x 16-bytes from src to dst.
void vp8_copy_mem16x16_c(
unsigned char *src,
int src_stride,
@@ -32,9 +42,12 @@ void vp8_copy_mem16x16_c(
int r;
//Set this up as a 2D kernel. Each loop iteration is X, each byte/int within
//is the Y address.
for (r = 0; r < 16; r++)
{
#if !(CONFIG_FAST_UNALIGNED)
#ifdef MUST_BE_ALIGNED
dst[0] = src[0];
dst[1] = src[1];
dst[2] = src[2];
@@ -53,10 +66,10 @@ void vp8_copy_mem16x16_c(
dst[15] = src[15];
#else
((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
((uint32_t *)dst)[2] = ((uint32_t *)src)[2] ;
((uint32_t *)dst)[3] = ((uint32_t *)src)[3] ;
((int *)dst)[0] = ((int *)src)[0] ;
((int *)dst)[1] = ((int *)src)[1] ;
((int *)dst)[2] = ((int *)src)[2] ;
((int *)dst)[3] = ((int *)src)[3] ;
#endif
src += src_stride;
@@ -66,6 +79,7 @@ void vp8_copy_mem16x16_c(
}
//Copy 8 x 8-bytes
void vp8_copy_mem8x8_c(
unsigned char *src,
int src_stride,
@@ -76,7 +90,7 @@ void vp8_copy_mem8x8_c(
for (r = 0; r < 8; r++)
{
#if !(CONFIG_FAST_UNALIGNED)
#ifdef MUST_BE_ALIGNED
dst[0] = src[0];
dst[1] = src[1];
dst[2] = src[2];
@@ -86,8 +100,8 @@ void vp8_copy_mem8x8_c(
dst[6] = src[6];
dst[7] = src[7];
#else
((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
((int *)dst)[0] = ((int *)src)[0] ;
((int *)dst)[1] = ((int *)src)[1] ;
#endif
src += src_stride;
dst += dst_stride;
@@ -106,7 +120,7 @@ void vp8_copy_mem8x4_c(
for (r = 0; r < 4; r++)
{
#if !(CONFIG_FAST_UNALIGNED)
#ifdef MUST_BE_ALIGNED
dst[0] = src[0];
dst[1] = src[1];
dst[2] = src[2];
@@ -116,8 +130,8 @@ void vp8_copy_mem8x4_c(
dst[6] = src[6];
dst[7] = src[7];
#else
((uint32_t *)dst)[0] = ((uint32_t *)src)[0] ;
((uint32_t *)dst)[1] = ((uint32_t *)src)[1] ;
((int *)dst)[0] = ((int *)src)[0] ;
((int *)dst)[1] = ((int *)src)[1] ;
#endif
src += src_stride;
dst += dst_stride;
@@ -131,34 +145,32 @@ void vp8_copy_mem8x4_c(
void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
{
int r;
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d->predictor;
ptr_base = *(d->base_pre);
//d->base_pre is the start of the previous frame's y_buffer, u_buffer, or v_buffer
unsigned char *ptr_base = *(d->base_pre);
int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
sppf(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
sppf(ptr_base+ptr_offset, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
}
else
{
ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
ptr = ptr_base;
for (r = 0; r < 4; r++)
{
#if !(CONFIG_FAST_UNALIGNED)
pred_ptr[0] = ptr[0];
pred_ptr[1] = ptr[1];
pred_ptr[2] = ptr[2];
pred_ptr[3] = ptr[3];
#ifdef MUST_BE_ALIGNED
pred_ptr[0] = ptr_base[ptr_offset];
pred_ptr[1] = ptr_base[ptr_offset+1];
pred_ptr[2] = ptr_base[ptr_offset+2];
pred_ptr[3] = ptr_base[ptr_offset+3];
#else
*(uint32_t *)pred_ptr = *(uint32_t *)ptr ;
*(int *)pred_ptr = *(int *)(ptr_base+ptr_offset) ;
#endif
pred_ptr += pitch;
ptr += d->pre_stride;
ptr_offset += d->pre_stride;
}
}
}
@@ -167,7 +179,7 @@ static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
{
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d->predictor;
unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;
ptr_base = *(d->base_pre);
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
@@ -186,7 +198,7 @@ static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
{
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *pred_ptr = d->predictor;
unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;
ptr_base = *(d->base_pre);
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
@@ -201,13 +213,24 @@ static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
}
}
/*encoder only*/
/* Encoder only */
void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
{
int i;
if (x->mode_info_context->mbmi.mode != SPLITMV)
#if CONFIG_OPENCL
if ( 0 && cl_initialized == CL_SUCCESS ){
vp8_build_inter_predictors_mbuv_cl(x);
VP8_CL_FINISH(x->cl_commands);
VP8_CL_FINISH(x->block[0].cl_commands);
VP8_CL_FINISH(x->block[16].cl_commands);
VP8_CL_FINISH(x->block[20].cl_commands);
return;
}
#endif
if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
x->mode_info_context->mbmi.mode != SPLITMV)
{
unsigned char *uptr, *vptr;
unsigned char *upred_ptr = &x->predictor[256];
@@ -252,7 +275,11 @@ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
}
/*encoder only*/
void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
void vp8_build_inter_predictors_mby(MACROBLOCKD *x)
{
if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
x->mode_info_context->mbmi.mode != SPLITMV)
{
unsigned char *ptr_base;
unsigned char *ptr;
@@ -273,33 +300,67 @@ void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
}
}
else
{
int i;
void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
unsigned char *dst_y,
unsigned char *dst_u,
unsigned char *dst_v,
int dst_ystride,
int dst_uvstride)
if (x->mode_info_context->mbmi.partitioning < 3)
{
for (i = 0; i < 4; i++)
{
BLOCKD *d = &x->block[bbb[i]];
build_inter_predictors4b(x, d, 16);
}
}
else
{
for (i = 0; i < 16; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
build_inter_predictors2b(x, d0, 16);
else
{
vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
}
}
}
}
}
void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
{
if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME &&
x->mode_info_context->mbmi.mode != SPLITMV)
{
int offset;
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *uptr, *vptr;
unsigned char *pred_ptr = x->predictor;
unsigned char *upred_ptr = &x->predictor[256];
unsigned char *vpred_ptr = &x->predictor[320];
int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
unsigned char *ptr_base = x->pre.y_buffer;
int pre_stride = x->block[0].pre_stride;
ptr_base = x->pre.y_buffer;
ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
if ((mv_row | mv_col) & 7)
{
x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_y, dst_ystride);
x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16);
}
else
{
RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y, dst_ystride);
RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16);
}
mv_row = x->block[16].bmi.mv.as_mv.row;
@@ -311,18 +372,16 @@ void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
if ((mv_row | mv_col) & 7)
{
x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, dst_u, dst_uvstride);
x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, dst_v, dst_uvstride);
x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
}
else
{
RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, dst_u, dst_uvstride);
RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, dst_v, dst_uvstride);
RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, upred_ptr, 8);
RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vpred_ptr, 8);
}
}
void vp8_build_inter4x4_predictors_mb(MACROBLOCKD *x)
else
{
int i;
@@ -365,19 +424,9 @@ void vp8_build_inter4x4_predictors_mb(MACROBLOCKD *x)
vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
}
}
}
void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
{
if (x->mode_info_context->mbmi.mode != SPLITMV)
{
vp8_build_inter16x16_predictors_mb(x, x->predictor, &x->predictor[256],
&x->predictor[320], 16, 8);
}
else
{
vp8_build_inter4x4_predictors_mb(x);
}
}
@@ -461,5 +510,202 @@ void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel)
}
/* The following functions are written for skip_recon_mb() to call. Since there is no recon in this
* situation, we can write the result directly to dst buffer instead of writing it to predictor
* buffer and then copying it to dst buffer.
*/
static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp8_subpix_fn_t sppf)
{
int r;
unsigned char *ptr_base;
unsigned char *ptr;
/*unsigned char *pred_ptr = d->predictor_base + d->predictor_offset;*/
int dst_stride = d->dst_stride;
int pre_stride = d->pre_stride;
int ptr_offset = d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
ptr_base = *(d->base_pre);
ptr = ptr_base + ptr_offset;
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, dst_stride);
}
else
{
for (r = 0; r < 4; r++)
{
#ifdef MUST_BE_ALIGNED
dst_ptr[0] = ptr[0];
dst_ptr[1] = ptr[1];
dst_ptr[2] = ptr[2];
dst_ptr[3] = ptr[3];
#else
*(int *)dst_ptr = *(int *)ptr ;
#endif
dst_ptr += dst_stride;
ptr += pre_stride;
}
}
}
void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x)
{
unsigned char *dst_ptr = x->dst.y_buffer;
#if CONFIG_OPENCL && ENABLE_CL_SUBPIXEL
if (cl_initialized == CL_SUCCESS){
vp8_build_inter_predictors_mb_s_cl(x);
return;
}
#endif
if (x->mode_info_context->mbmi.mode != SPLITMV)
{
int offset;
unsigned char *ptr_base;
unsigned char *ptr;
unsigned char *uptr, *vptr;
/*unsigned char *pred_ptr = x->predictor;
unsigned char *upred_ptr = &x->predictor[256];
unsigned char *vpred_ptr = &x->predictor[320];*/
unsigned char *udst_ptr = x->dst.u_buffer;
unsigned char *vdst_ptr = x->dst.v_buffer;
int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
int pre_stride = x->dst.y_stride; /*x->block[0].pre_stride;*/
ptr_base = x->pre.y_buffer;
ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
if ((mv_row | mv_col) & 7)
{
x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
}
else
{
RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
}
mv_row = x->block[16].bmi.mv.as_mv.row;
mv_col = x->block[16].bmi.mv.as_mv.col;
pre_stride >>= 1;
offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
uptr = x->pre.u_buffer + offset;
vptr = x->pre.v_buffer + offset;
if ((mv_row | mv_col) & 7)
{
x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride);
x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride);
}
else
{
RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, udst_ptr, x->dst.uv_stride);
RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vdst_ptr, x->dst.uv_stride);
}
}
else
{
/* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later,
* if sth is wrong, go back to what it is in build_inter_predictors_mb.
*
* ACW: note: Not sure who the above comment belongs to.
*/
int i;
if (x->mode_info_context->mbmi.partitioning < 3)
{
for (i = 0; i < 4; i++)
{
BLOCKD *d = &x->block[bbb[i]];
/*build_inter_predictors4b(x, d, 16);*/
{
unsigned char *ptr_base;
unsigned char *ptr;
ptr_base = *(d->base_pre);
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
{
x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
}
else
{
RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/
}
}
}
}
else
{
for (i = 0; i < 16; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
{
/*build_inter_predictors2b(x, d0, 16);*/
unsigned char *ptr_base;
unsigned char *ptr;
ptr_base = *(d0->base_pre);
ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
{
x->subpixel_predict8x4(ptr, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride);
}
else
{
RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d0->pre_stride, dst_ptr, x->dst.y_stride);
}
}
else
{
vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
}
}
}
for (i = 16; i < 24; i += 2)
{
BLOCKD *d0 = &x->block[i];
BLOCKD *d1 = &x->block[i+1];
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
{
/*build_inter_predictors2b(x, d0, 8);*/
unsigned char *ptr_base;
unsigned char *ptr;
ptr_base = *(d0->base_pre);
ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3);
if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7)
{
x->subpixel_predict8x4(ptr, d0->pre_stride,
d0->bmi.mv.as_mv.col & 7,
d0->bmi.mv.as_mv.row & 7,
dst_ptr, x->dst.uv_stride);
}
else
{
RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr,
d0->pre_stride, dst_ptr, x->dst.uv_stride);
}
}
else
{
vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict);
vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict);
}
}
}
}

View File

@@ -13,15 +13,9 @@
#define __INC_RECONINTER_H
extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x);
extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x,
unsigned char *dst_y,
unsigned char *dst_u,
unsigned char *dst_v,
int dst_ystride,
int dst_uvstride);
extern void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x);
extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
extern void vp8_build_inter_predictors_mby(MACROBLOCKD *x);
extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf);
extern void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x);

View File

@@ -24,7 +24,7 @@ void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
for (i = 16; i < 24; i += 2)
{
BLOCKD *b = &x->block[i];
RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
RECON_INVOKE(rtcd, recon2)(b->predictor_base + b->predictor_offset, &b->diff_base[b->diff_offset], *(b->base_dst) + b->dst, b->dst_stride);
}
}

View File

@@ -14,4 +14,9 @@
extern void init_intra_left_above_pixels(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x);
extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x);
extern void vp8_predict_intra4x4(BLOCKD *x, int b_mode, unsigned char *Predictor);
#endif

View File

@@ -14,7 +14,7 @@
#include "vpx_mem/vpx_mem.h"
#include "reconintra.h"
void vp8_intra4x4_predict(BLOCKD *x,
void vp8_predict_intra4x4(BLOCKD *x,
int b_mode,
unsigned char *predictor)
{
@@ -124,6 +124,18 @@ void vp8_intra4x4_predict(BLOCKD *x,
case B_LD_PRED:
{
unsigned char *ptr = Above;
#if 0
//More readable version of the unrolled loop
int stride = 16, r=0, c=0;
for (r=0; r < 4; r++){
for (c=0; c < 4; c++){
int off = r+c;
int off2 = off > 5 ? 5: off; //Clamp so [3,3] has max off2 of 7
predictor[r*stride+c] = (ptr[off] + ptr[off+1]*2 + ptr[off2+2] + 2)>>2;
}
}
#else
predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
predictor[0 * 16 + 1] =
predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
@@ -140,6 +152,7 @@ void vp8_intra4x4_predict(BLOCKD *x,
predictor[2 * 16 + 3] =
predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
#endif
}
break;
@@ -311,5 +324,3 @@ void vp8_intra_prediction_down_copy(MACROBLOCKD *x)
*dst_ptr1 = *src_ptr;
*dst_ptr2 = *src_ptr;
}

Some files were not shown because too many files have changed in this diff Show More