Merge pull request #387 from zhilwang/arm-asm

Arm asm
This commit is contained in:
volvet 2014-03-04 11:08:17 +08:00
commit f8b0cec68d
19 changed files with 3897 additions and 15 deletions

View File

@ -19,6 +19,8 @@
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473818BC61650017DF25 /* deblocking_common.cpp */; };
4CE4475218BC61650017DF25 /* logging.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4473C18BC61650017DF25 /* logging.cpp */; };
4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */; };
4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447BC18C085320017DF25 /* deblocking_neon.S */; };
4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */; };
/* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */
@ -69,6 +71,8 @@
4CE4474718BC61650017DF25 /* typedefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = typedefs.h; sourceTree = "<group>"; };
4CE4474918BC61650017DF25 /* WelsThreadLib.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = WelsThreadLib.cpp; sourceTree = "<group>"; };
4CE4474A18BC61650017DF25 /* WelsThreadLib.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = WelsThreadLib.h; sourceTree = "<group>"; };
4CE447BC18C085320017DF25 /* deblocking_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.asm; path = deblocking_neon.S; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = arm_arch_common_macro.S; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@ -144,6 +148,8 @@
4CE4472F18BC61650017DF25 /* common */ = {
isa = PBXGroup;
children = (
4CE447BE18C085900017DF25 /* arm_arch_common_macro.S */,
4CE447BC18C085320017DF25 /* deblocking_neon.S */,
4CE4473118BC61650017DF25 /* cpu.cpp */,
4CE4473218BC61650017DF25 /* cpu.h */,
4CE4473318BC61650017DF25 /* cpu_core.h */,
@ -247,9 +253,11 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
4CE447BF18C085900017DF25 /* arm_arch_common_macro.S in Sources */,
4CE4475018BC61650017DF25 /* deblocking_common.cpp in Sources */,
4CE4474C18BC61650017DF25 /* cpu.cpp in Sources */,
4CE4475218BC61650017DF25 /* logging.cpp in Sources */,
4CE447BD18C085320017DF25 /* deblocking_neon.S in Sources */,
4CE4475818BC61650017DF25 /* WelsThreadLib.cpp in Sources */,
4CE4474E18BC61650017DF25 /* crt_util_safe_x.cpp in Sources */,
);

View File

@ -36,6 +36,9 @@
4CE4469D18BC5EAB0017DF25 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4467A18BC5EAA0017DF25 /* utils.cpp */; };
4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */; };
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */; };
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A718BC6BE90017DF25 /* block_add_neon.S */; };
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */; };
4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = 4CE447AA18BC6BE90017DF25 /* mc_neon.S */; };
/* End PBXBuildFile section */
/* Begin PBXContainerItemProxy section */
@ -81,9 +84,9 @@
4CE4464E18BC5EAA0017DF25 /* decoder_context.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decoder_context.h; sourceTree = "<group>"; };
4CE4464F18BC5EAA0017DF25 /* decoder_core.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = decoder_core.h; sourceTree = "<group>"; };
4CE4465018BC5EAA0017DF25 /* error_code.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = error_code.h; sourceTree = "<group>"; };
4CE4465118BC5EAA0017DF25 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = expand_pic.h; sourceTree = "<group>"; };
4CE4465218BC5EAA0017DF25 /* fmo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fmo.h; sourceTree = "<group>"; };
4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; };
4CE4465118BC5EAA0017DF25 /* expand_pic.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = expand_pic.h; sourceTree = "<group>"; usesTabs = 1; };
4CE4465218BC5EAA0017DF25 /* fmo.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = fmo.h; sourceTree = "<group>"; usesTabs = 1; };
4CE4465318BC5EAA0017DF25 /* get_intra_predictor.h */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.c.h; path = get_intra_predictor.h; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
4CE4465418BC5EAA0017DF25 /* manage_dec_ref.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = manage_dec_ref.h; sourceTree = "<group>"; };
4CE4465518BC5EAA0017DF25 /* mb_cache.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mb_cache.h; sourceTree = "<group>"; };
4CE4465618BC5EAA0017DF25 /* mc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mc.h; sourceTree = "<group>"; };
@ -102,19 +105,19 @@
4CE4466318BC5EAA0017DF25 /* vlc_decoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = vlc_decoder.h; sourceTree = "<group>"; };
4CE4466418BC5EAA0017DF25 /* wels_common_basis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_common_basis.h; sourceTree = "<group>"; };
4CE4466518BC5EAA0017DF25 /* wels_const.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_const.h; sourceTree = "<group>"; };
4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; };
4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; };
4CE4466918BC5EAA0017DF25 /* deblocking.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deblocking.cpp; sourceTree = "<group>"; };
4CE4466718BC5EAA0017DF25 /* au_parser.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = au_parser.cpp; sourceTree = "<group>"; usesTabs = 0; };
4CE4466818BC5EAA0017DF25 /* bit_stream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bit_stream.cpp; sourceTree = "<group>"; usesTabs = 0; };
4CE4466918BC5EAA0017DF25 /* deblocking.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = deblocking.cpp; sourceTree = "<group>"; tabWidth = 2; };
4CE4466A18BC5EAA0017DF25 /* decode_mb_aux.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_mb_aux.cpp; sourceTree = "<group>"; };
4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; };
4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; };
4CE4466B18BC5EAA0017DF25 /* decode_slice.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decode_slice.cpp; sourceTree = "<group>"; usesTabs = 0; };
4CE4466C18BC5EAA0017DF25 /* decoder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder.cpp; sourceTree = "<group>"; tabWidth = 4; usesTabs = 0; };
4CE4466D18BC5EAA0017DF25 /* decoder_core.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_core.cpp; sourceTree = "<group>"; };
4CE4466E18BC5EAA0017DF25 /* decoder_data_tables.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = decoder_data_tables.cpp; sourceTree = "<group>"; };
4CE4466F18BC5EAA0017DF25 /* expand_pic.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = expand_pic.cpp; sourceTree = "<group>"; };
4CE4467018BC5EAA0017DF25 /* fmo.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fmo.cpp; sourceTree = "<group>"; };
4CE4467118BC5EAA0017DF25 /* get_intra_predictor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = get_intra_predictor.cpp; sourceTree = "<group>"; };
4CE4467218BC5EAA0017DF25 /* manage_dec_ref.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = manage_dec_ref.cpp; sourceTree = "<group>"; };
4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; };
4CE4467318BC5EAA0017DF25 /* mc.cpp */ = {isa = PBXFileReference; fileEncoding = 4; indentWidth = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mc.cpp; sourceTree = "<group>"; tabWidth = 1; usesTabs = 0; wrapsLines = 1; };
4CE4467418BC5EAA0017DF25 /* mem_align.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mem_align.cpp; sourceTree = "<group>"; };
4CE4467518BC5EAA0017DF25 /* memmgr_nal_unit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = memmgr_nal_unit.cpp; sourceTree = "<group>"; };
4CE4467618BC5EAA0017DF25 /* mv_pred.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mv_pred.cpp; sourceTree = "<group>"; };
@ -127,6 +130,9 @@
4CE4468318BC5EAB0017DF25 /* wels_dec_export.def */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = wels_dec_export.def; sourceTree = "<group>"; };
4CE4468418BC5EAB0017DF25 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsCodecTrace.cpp; sourceTree = "<group>"; };
4CE4468518BC5EAB0017DF25 /* welsDecoderExt.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = welsDecoderExt.cpp; sourceTree = "<group>"; };
4CE447A718BC6BE90017DF25 /* block_add_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = block_add_neon.S; sourceTree = "<group>"; };
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = intra_pred_neon.S; sourceTree = "<group>"; };
4CE447AA18BC6BE90017DF25 /* mc_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mc_neon.S; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@ -212,6 +218,7 @@
4CE4463F18BC5EAA0017DF25 /* core */ = {
isa = PBXGroup;
children = (
4CE447A518BC6BE90017DF25 /* arm */,
4CE4464418BC5EAA0017DF25 /* inc */,
4CE4466618BC5EAA0017DF25 /* src */,
);
@ -313,6 +320,16 @@
path = src;
sourceTree = "<group>";
};
4CE447A518BC6BE90017DF25 /* arm */ = {
isa = PBXGroup;
children = (
4CE447A718BC6BE90017DF25 /* block_add_neon.S */,
4CE447A918BC6BE90017DF25 /* intra_pred_neon.S */,
4CE447AA18BC6BE90017DF25 /* mc_neon.S */,
);
path = arm;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
@ -394,6 +411,7 @@
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
4CE447AF18BC6BE90017DF25 /* mc_neon.S in Sources */,
4CE4469B18BC5EAB0017DF25 /* pic_queue.cpp in Sources */,
4CE4469F18BC5EAB0017DF25 /* welsDecoderExt.cpp in Sources */,
4CE4469318BC5EAB0017DF25 /* fmo.cpp in Sources */,
@ -404,9 +422,11 @@
4CE4468A18BC5EAB0017DF25 /* au_parser.cpp in Sources */,
4CE4469218BC5EAB0017DF25 /* expand_pic.cpp in Sources */,
4CE4469918BC5EAB0017DF25 /* mv_pred.cpp in Sources */,
4CE447AC18BC6BE90017DF25 /* block_add_neon.S in Sources */,
4CE4469418BC5EAB0017DF25 /* get_intra_predictor.cpp in Sources */,
4CE4469018BC5EAB0017DF25 /* decoder_core.cpp in Sources */,
4CE4469E18BC5EAB0017DF25 /* welsCodecTrace.cpp in Sources */,
4CE447AE18BC6BE90017DF25 /* intra_pred_neon.S in Sources */,
4CE4469618BC5EAB0017DF25 /* mc.cpp in Sources */,
4CE4469C18BC5EAB0017DF25 /* rec_mb.cpp in Sources */,
4CE4468B18BC5EAB0017DF25 /* bit_stream.cpp in Sources */,

View File

@ -0,0 +1,55 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
/*
 * Function prologue/epilogue helpers shared by the ARM assembly sources.
 *
 * WELS_ASM_FUNC_BEGIN <name>
 *   Aligns to a 4-byte boundary (.align 2), selects ARM (not Thumb) encoding,
 *   exports the symbol and emits its label.
 *   The APPLE_IOS variant uses the positional-argument macro syntax ($0) and
 *   prefixes the symbol with an underscore; the other variant uses a named
 *   macro argument (\funcName) and exports the name unchanged.
 *
 * WELS_ASM_FUNC_END
 *   Returns to the caller by copying lr into pc. It restores nothing, so a
 *   function must undo any stack changes before invoking it.
 */
#ifdef APPLE_IOS
.macro WELS_ASM_FUNC_BEGIN
.align 2
.arm
.globl _$0
_$0:
.endm
#else
.macro WELS_ASM_FUNC_BEGIN funcName
.align 2
.arm
.global \funcName
\funcName:
.endm
#endif
.macro WELS_ASM_FUNC_END
mov pc, lr
.endm

View File

@ -38,7 +38,12 @@
*************************************************************************************
*/
#include <string.h>
#ifdef ANDROID_NDK
#include <cpu-features.h>
#endif
#ifdef APPLE_IOS
#include <sys/utsname.h>
#endif
#include "cpu.h"
#include "cpu_core.h"
@ -209,4 +214,50 @@ void WelsXmmRegEmptyOp(void * pSrc) {
#endif
#if defined(HAVE_NEON)//For supporting both android platform and iOS platform
#if defined(ANDROID_NDK)
/*
 * Queries the Android NDK cpu-features library and translates its ARM
 * feature bits into the codec's WELS_CPU_* flags.
 *
 * Returns a bitmask of WELS_CPU_ARMv7 / WELS_CPU_VFPv3 / WELS_CPU_NEON, or 0
 * when the reported CPU family is not ANDROID_CPU_FAMILY_ARM.
 */
uint32_t WelsCPUFeatureDetectAndroid()
{
  // Feature bits are only queried for the ARM family; anything else reports nothing.
  if (android_getCpuFamily() != ANDROID_CPU_FAMILY_ARM) {
    return 0;
  }

  const uint64_t uiNdkFlags = android_getCpuFeatures();
  uint32_t uiDetected = 0;

  uiDetected |= (uiNdkFlags & ANDROID_CPU_ARM_FEATURE_ARMv7) ? WELS_CPU_ARMv7 : 0;
  uiDetected |= (uiNdkFlags & ANDROID_CPU_ARM_FEATURE_VFPv3) ? WELS_CPU_VFPv3 : 0;
  uiDetected |= (uiNdkFlags & ANDROID_CPU_ARM_FEATURE_NEON)  ? WELS_CPU_NEON  : 0;

  return uiDetected;
}
#endif
#if defined(APPLE_IOS)
/*
 * Detects ARM CPU features on iOS from the hardware model string reported by
 * uname().
 *
 * There is no feature query here: every model except the four listed below
 * is assumed to provide ARMv7, VFPv3 and NEON. The list therefore needs to be
 * revisited whenever that assumption stops holding for a device.
 *
 * Returns a bitmask of WELS_CPU_ARMv7 / WELS_CPU_VFPv3 / WELS_CPU_NEON, or 0
 * for the excluded models or when the model string cannot be obtained.
 */
uint32_t WelsCPUFeatureDetectIOS()
{
  uint32_t uiCPU = 0;
  struct utsname sSystemInfo;

  // On failure the structure is not filled in, so comparing its machine field
  // would read uninitialised memory. Report no features instead.
  if (uname (&sSystemInfo) < 0) {
    return uiCPU;
  }

  if ((0 != strcmp(sSystemInfo.machine, "iPhone1,1")) && //iPhone 2G
      (0 != strcmp(sSystemInfo.machine, "iPhone1,2")) && //iPhone 3G
      (0 != strcmp(sSystemInfo.machine, "iPod1,1")) &&   //iPod 1G
      (0 != strcmp(sSystemInfo.machine, "iPod2,1")))     //iPod 2G
  {
    uiCPU |= WELS_CPU_ARMv7;
    uiCPU |= WELS_CPU_VFPv3;
    uiCPU |= WELS_CPU_NEON;
  }
  return uiCPU;
}
#endif
#endif

View File

@ -80,6 +80,16 @@ void WelsXmmRegLoad(void * src);
void WelsXmmRegEmptyOp(void * pSrc);
/*
 * ARM CPU feature detection, available only in NEON-enabled builds.
 * Each function returns a bitmask built from WELS_CPU_ARMv7, WELS_CPU_VFPv3
 * and WELS_CPU_NEON; 0 means none of them was detected.
 */
#if defined(HAVE_NEON)
#if defined(ANDROID_NDK)
/* Android: maps the NDK cpu-features bits to WELS_CPU_* flags. */
uint32_t WelsCPUFeatureDetectAndroid();
#endif
#if defined(APPLE_IOS)
/* iOS: decides from the device model string returned by uname(). */
uint32_t WelsCPUFeatureDetectIOS();
#endif
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus

View File

@ -73,6 +73,11 @@
#define WELS_CPU_CACHELINE_64 0x40000000 /* CacheLine Size 64 */
#define WELS_CPU_CACHELINE_128 0x80000000 /* CacheLine Size 128 */
/* ARM CPU feature flags, reported by the Android and iOS feature detectors */
#define WELS_CPU_ARMv7 0x000001 /* ARMv7 */
#define WELS_CPU_VFPv3 0x000002 /* VFPv3 */
#define WELS_CPU_NEON 0x000004 /* NEON */
/*
* Interfaces for CPU core feature detection as below
*/

View File

@ -33,7 +33,23 @@ void DeblockChromaEq4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride,
void DeblockChromaLt4H_ssse3 (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
int8_t* pTC);
#endif
/*
 * ARM NEON deblocking filters. The symbols are implemented in assembly in
 * deblocking_neon.S.
 *
 * The Lt4 variants take an extra per-edge clipping table (pTc / pTC); the Eq4
 * variants do not. Chroma variants receive the Cb and Cr planes in one call.
 * For luma, the V variants address neighbouring rows (pPixY +/- k * iStride)
 * and the H variants address neighbouring bytes within each of 16 rows.
 */
#if defined(HAVE_NEON)
void DeblockLumaLt4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
void DeblockLumaEq4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockLumaLt4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc);
void DeblockLumaEq4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockChromaLt4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
void DeblockChromaEq4V_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
void DeblockChromaLt4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTC);
void DeblockChromaEq4H_neon(uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus
#endif //WELS_DEBLOCKING_COMMON_H__

812
codec/common/deblocking_neon.S Executable file
View File

@ -0,0 +1,812 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
/*
 * Helper macros, Apple assembler flavour: arguments are positional ($0, $1, ...).
 * Unless stated otherwise the arguments are NEON registers chosen by the caller.
 */

/*
 * ORs the two 64-bit registers $0 and $1 into $2, moves the result to r3/r2
 * and compares the OR of both words with zero. Despite the name it does not
 * branch; it only sets the condition flags. Clobbers r2 and r3.
 */
.macro JMP_IF_128BITS_IS_ZERO
vorr.s16 $2, $0, $1
vmov r3, r2, $2
orr r3, r3, r2
cmp r3, #0
.endm
/*
 * Builds the per-byte filter mask in $6:
 *   $6 = (|$1 - $2| < $4) & (|$0 - $1| < $5) & (|$3 - $2| < $5)
 * $4 is used as scratch and does not hold its input value afterwards.
 */
.macro MASK_MATRIX
vabd.u8 $6, $1, $2
vcgt.u8 $6, $4, $6
vabd.u8 $4, $0, $1
vclt.u8 $4, $4, $5
vand.u8 $6, $6, $4
vabd.u8 $4, $3, $2
vclt.u8 $4, $4, $5
vand.u8 $6, $6, $4
.endm
/*
 * $9 = per-byte mask of (|$0 - $2| < $4).
 * $8 = $1 + delta, where delta = (($0 + (($2 + $3 + 1) >> 1)) >> 1) - $1,
 *      clamped to [$5, $6] and zeroed where either $9 or $7 is clear.
 * On exit $9 holds abs() of the mask, i.e. 1 in every selected byte.
 */
.macro DIFF_LUMA_LT4_P1_Q1
vabd.u8 $9, $0, $2
vclt.u8 $9, $9, $4
vrhadd.u8 $8, $2, $3
vhadd.u8 $8, $0, $8
vsub.s8 $8, $8, $1
vmax.s8 $8, $8, $5
vmin.s8 $8, $8, $6
vand.s8 $8, $8, $9
vand.s8 $8, $8, $7
vadd.u8 $8, $1, $8
vabs.s8 $9, $9
.endm
/*
 * $4 = narrowed, rounded ((($0 - $3) + 4 * ($2 - $1)) >> 3), computed in
 * 16-bit lanes. $5 and $6 are 128-bit scratch registers.
 */
.macro DIFF_LUMA_LT4_P0_Q0
vsubl.u8 $5, $0, $3
vsubl.u8 $6, $2, $1
vshl.s16 $6, $6, #2
vadd.s16 $5, $5, $6
vrshrn.s16 $4, $5, #3
.endm
/*
 * Computes three rounded averages from the 64-bit inputs $0..$5:
 *   $7 = (2*$0 + 3*$1 + $2 + $3 + $4 + 4) >> 3
 *   $0 = ($1 + $2 + $3 + $4 + 2) >> 2            (overwrites the input $0)
 *   $6 = where mask $6 is set: ($1 + 2*$2 + 2*$3 + 2*$4 + $5 + 4) >> 3
 *        elsewhere:            (2*$2 + $3 + $5 + 2) >> 2
 * Clobbers q4 and q5 unconditionally.
 */
.macro DIFF_LUMA_EQ4_P2P1P0
vaddl.u8 q4, $1, $2
vaddl.u8 q5, $3, $4
vadd.u16 q5, q4, q5
vaddl.u8 q4, $0, $1
vshl.u16 q4, q4, #1
vadd.u16 q4, q5, q4
vrshrn.u16 $0, q5, #2
vrshrn.u16 $7, q4, #3
vshl.u16 q5, q5, #1
vsubl.u8 q4, $5, $1
vadd.u16 q5, q4,q5
vaddl.u8 q4, $2, $5
vaddw.u8 q4, q4, $2
vaddw.u8 q4, q4, $3
vrshrn.u16 d10,q5, #3
vrshrn.u16 d8, q4, #2
vbsl.u8 $6, d10, d8
.endm
/* $3 = bitwise select: bits of $0 where mask $2 is set, bits of $1 elsewhere. */
.macro DIFF_LUMA_EQ4_MASK
vmov $3, $2
vbsl.u8 $3, $0, $1
.endm
/*
 * $7 = (2*$0 + $1 + $3 + 2) >> 2 and $8 = (2*$3 + $2 + $0 + 2) >> 2.
 * $4, $5 and $6 are 128-bit scratch registers.
 */
.macro DIFF_CHROMA_EQ4_P0Q0
vaddl.u8 $4, $0, $3
vaddw.u8 $5, $4, $1
vaddw.u8 $6, $4, $2
vaddw.u8 $5, $5, $0
vaddw.u8 $6, $6, $3
vrshrn.u16 $7, $5, #2
vrshrn.u16 $8, $6, #2
.endm
/*
 * Loads four consecutive bytes from [r0] into lane $8 of $0..$3 and four from
 * [r1] into lane $8 of $4..$7, then advances r0 and r1 by r2.
 */
.macro LORD_CHROMA_DATA_4
vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
.endm
/* Store counterpart of LORD_CHROMA_DATA_4 (same registers, lanes and pointers). */
.macro STORE_CHROMA_DATA_4
vst4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r0], r2
vst4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r1], r2
.endm
/*
 * Loads three consecutive bytes from [r2] into lane $6 of $0..$2 and three
 * from [r0] into lane $6 of $3..$5, then advances r2 and r0 by r1.
 */
.macro LORD_LUMA_DATA_3
vld3.u8 {$0[$6],$1[$6],$2[$6]}, [r2], r1
vld3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
.endm
/*
 * Stores lane $4 of $0..$3 as four consecutive bytes at [r0] and lane $5 of
 * the same registers at [r2], then advances r0 and r2 by r1.
 */
.macro STORE_LUMA_DATA_4
vst4.u8 {$0[$4],$1[$4],$2[$4],$3[$4]}, [r0], r1
vst4.u8 {$0[$5],$1[$5],$2[$5],$3[$5]}, [r2], r1
.endm
/*
 * Loads four consecutive bytes from [r3] into lane $8 of $0..$3 and four from
 * [r0] into lane $8 of $4..$7, then advances r3 and r0 by r1.
 */
.macro LORD_LUMA_DATA_4
vld4.u8 {$0[$8],$1[$8],$2[$8],$3[$8]}, [r3], r1
vld4.u8 {$4[$8],$5[$8],$6[$8],$7[$8]}, [r0], r1
.endm
/*
 * Stores lane $6 of $0..$2 as three consecutive bytes at [r3] and lane $6 of
 * $3..$5 at [r0], then advances r3 and r0 by r1.
 */
.macro STORE_LUMA_DATA_3
vst3.u8 {$0[$6],$1[$6],$2[$6]}, [r3], r1
vst3.u8 {$3[$6],$4[$6],$5[$6]}, [r0], r1
.endm
/*
 * Splits the signed bytes in $0 into two non-negative parts:
 *   $1 = $0 where $0 >= 0, else 0
 *   $0 = -$0 where $0 < 0, else 0
 */
.macro EXTRACT_DELTA_INTO_TWO_PART
vcge.s8 $1, $0, #0
vand $1, $0, $1
vsub.s8 $0, $1, $0
.endm
#else
/*
 * Helper macros, GNU assembler flavour: arguments are named (\arg0, \arg1, ...).
 * Unless stated otherwise the arguments are NEON registers chosen by the caller.
 */

/*
 * ORs the two 64-bit registers arg0 and arg1 into arg2, moves the result to
 * r3/r2 and compares the OR of both words with zero. Despite the name it does
 * not branch; it only sets the condition flags. Clobbers r2 and r3.
 */
.macro JMP_IF_128BITS_IS_ZERO arg0, arg1, arg2
vorr.s16 \arg2, \arg0, \arg1
vmov r3, r2, \arg2
orr r3, r3, r2
cmp r3, #0
.endm
/*
 * Builds the per-byte filter mask in arg6:
 *   arg6 = (|arg1 - arg2| < arg4) & (|arg0 - arg1| < arg5) & (|arg3 - arg2| < arg5)
 * arg4 is used as scratch and does not hold its input value afterwards.
 */
.macro MASK_MATRIX arg0, arg1, arg2, arg3, arg4, arg5, arg6
vabd.u8 \arg6, \arg1, \arg2
vcgt.u8 \arg6, \arg4, \arg6
vabd.u8 \arg4, \arg0, \arg1
vclt.u8 \arg4, \arg4, \arg5
vand.u8 \arg6, \arg6, \arg4
vabd.u8 \arg4, \arg3, \arg2
vclt.u8 \arg4, \arg4, \arg5
vand.u8 \arg6, \arg6, \arg4
.endm
/*
 * arg9 = per-byte mask of (|arg0 - arg2| < arg4).
 * arg8 = arg1 + delta, where
 *        delta = ((arg0 + ((arg2 + arg3 + 1) >> 1)) >> 1) - arg1,
 *        clamped to [arg5, arg6] and zeroed where either arg9 or arg7 is clear.
 * On exit arg9 holds abs() of the mask, i.e. 1 in every selected byte.
 */
.macro DIFF_LUMA_LT4_P1_Q1 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
vabd.u8 \arg9, \arg0, \arg2
vclt.u8 \arg9, \arg9, \arg4
vrhadd.u8 \arg8, \arg2, \arg3
vhadd.u8 \arg8, \arg0, \arg8
vsub.s8 \arg8, \arg8, \arg1
vmax.s8 \arg8, \arg8, \arg5
vmin.s8 \arg8, \arg8, \arg6
vand.s8 \arg8, \arg8, \arg9
vand.s8 \arg8, \arg8, \arg7
vadd.u8 \arg8, \arg1, \arg8
vabs.s8 \arg9, \arg9
.endm
/*
 * arg4 = narrowed, rounded (((arg0 - arg3) + 4 * (arg2 - arg1)) >> 3),
 * computed in 16-bit lanes. arg5 and arg6 are 128-bit scratch registers.
 */
.macro DIFF_LUMA_LT4_P0_Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6
vsubl.u8 \arg5, \arg0, \arg3
vsubl.u8 \arg6, \arg2, \arg1
vshl.s16 \arg6, \arg6, #2
vadd.s16 \arg5, \arg5, \arg6
vrshrn.s16 \arg4, \arg5, #3
.endm
/*
 * Computes three rounded averages from the 64-bit inputs arg0..arg5:
 *   arg7 = (2*arg0 + 3*arg1 + arg2 + arg3 + arg4 + 4) >> 3
 *   arg0 = (arg1 + arg2 + arg3 + arg4 + 2) >> 2     (overwrites the input arg0)
 *   arg6 = where mask arg6 is set: (arg1 + 2*arg2 + 2*arg3 + 2*arg4 + arg5 + 4) >> 3
 *          elsewhere:              (2*arg2 + arg3 + arg5 + 2) >> 2
 * Clobbers q4 and q5 unconditionally.
 */
.macro DIFF_LUMA_EQ4_P2P1P0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
vaddl.u8 q4, \arg1, \arg2
vaddl.u8 q5, \arg3, \arg4
vadd.u16 q5, q4, q5
vaddl.u8 q4, \arg0, \arg1
vshl.u16 q4, q4, #1
vadd.u16 q4, q5, q4
vrshrn.u16 \arg0, q5, #2
vrshrn.u16 \arg7, q4, #3
vshl.u16 q5, q5, #1
vsubl.u8 q4, \arg5, \arg1
vadd.u16 q5, q4,q5
vaddl.u8 q4, \arg2, \arg5
vaddw.u8 q4, q4, \arg2
vaddw.u8 q4, q4, \arg3
vrshrn.u16 d10,q5, #3
vrshrn.u16 d8, q4, #2
vbsl.u8 \arg6, d10, d8
.endm
/* arg3 = bitwise select: bits of arg0 where mask arg2 is set, bits of arg1 elsewhere. */
.macro DIFF_LUMA_EQ4_MASK arg0, arg1, arg2, arg3
vmov \arg3, \arg2
vbsl.u8 \arg3, \arg0, \arg1
.endm
/*
 * arg7 = (2*arg0 + arg1 + arg3 + 2) >> 2 and arg8 = (2*arg3 + arg2 + arg0 + 2) >> 2.
 * arg4, arg5 and arg6 are 128-bit scratch registers.
 */
.macro DIFF_CHROMA_EQ4_P0Q0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
vaddl.u8 \arg4, \arg0, \arg3
vaddw.u8 \arg5, \arg4, \arg1
vaddw.u8 \arg6, \arg4, \arg2
vaddw.u8 \arg5, \arg5, \arg0
vaddw.u8 \arg6, \arg6, \arg3
vrshrn.u16 \arg7, \arg5, #2
vrshrn.u16 \arg8, \arg6, #2
.endm
/*
 * Loads four consecutive bytes from [r0] into lane arg8 of arg0..arg3 and four
 * from [r1] into lane arg8 of arg4..arg7, then advances r0 and r1 by r2.
 */
.macro LORD_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
.endm
/* Store counterpart of LORD_CHROMA_DATA_4 (same registers, lanes and pointers). */
.macro STORE_CHROMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
vst4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r0], r2
vst4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r1], r2
.endm
/*
 * Loads three consecutive bytes from [r2] into lane arg6 of arg0..arg2 and
 * three from [r0] into lane arg6 of arg3..arg5, then advances r2 and r0 by r1.
 */
.macro LORD_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
vld3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r2], r1
vld3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
.endm
/*
 * Stores lane arg4 of arg0..arg3 as four consecutive bytes at [r0] and lane
 * arg5 of the same registers at [r2], then advances r0 and r2 by r1.
 */
.macro STORE_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5
vst4.u8 {\arg0[\arg4],\arg1[\arg4],\arg2[\arg4],\arg3[\arg4]}, [r0], r1
vst4.u8 {\arg0[\arg5],\arg1[\arg5],\arg2[\arg5],\arg3[\arg5]}, [r2], r1
.endm
/*
 * Loads four consecutive bytes from [r3] into lane arg8 of arg0..arg3 and four
 * from [r0] into lane arg8 of arg4..arg7, then advances r3 and r0 by r1.
 */
.macro LORD_LUMA_DATA_4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
vld4.u8 {\arg0[\arg8],\arg1[\arg8],\arg2[\arg8],\arg3[\arg8]}, [r3], r1
vld4.u8 {\arg4[\arg8],\arg5[\arg8],\arg6[\arg8],\arg7[\arg8]}, [r0], r1
.endm
/*
 * Stores lane arg6 of arg0..arg2 as three consecutive bytes at [r3] and lane
 * arg6 of arg3..arg5 at [r0], then advances r3 and r0 by r1.
 */
.macro STORE_LUMA_DATA_3 arg0, arg1, arg2, arg3, arg4, arg5, arg6
vst3.u8 {\arg0[\arg6],\arg1[\arg6],\arg2[\arg6]}, [r3], r1
vst3.u8 {\arg3[\arg6],\arg4[\arg6],\arg5[\arg6]}, [r0], r1
.endm
/*
 * Splits the signed bytes in arg0 into two non-negative parts:
 *   arg1 = arg0 where arg0 >= 0, else 0
 *   arg0 = -arg0 where arg0 < 0, else 0
 */
.macro EXTRACT_DELTA_INTO_TWO_PART arg0, arg1
vcge.s8 \arg1, \arg0, #0
vand \arg1, \arg0, \arg1
vsub.s8 \arg0, \arg1, \arg0
.endm
#endif
// void DeblockLumaLt4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc)
//   r0 = pPixY, r1 = iStride, r2 = iAlpha, r3 = iBeta, first stack word = pTc.
// Reads the three 16-byte rows above pPixY and the three rows starting at
// pPixY, and rewrites the four rows pPixY - 2*iStride .. pPixY + iStride.
// q4-q7 are used as working registers, so they are saved on entry and restored
// on exit as the procedure call standard requires for d8-d15.
WELS_ASM_FUNC_BEGIN DeblockLumaLt4V_neon
    vpush {q4-q7}
    vdup.u8 q11, r2
    vdup.u8 q9, r3
    // r2 = pPixY - 3 * iStride
    add r2, r1, r1, lsl #1
    sub r2, r0, r2
    vld1.u8 {q0}, [r2], r1
    vld1.u8 {q3}, [r0], r1
    vld1.u8 {q1}, [r2], r1
    vld1.u8 {q4}, [r0], r1
    vld1.u8 {q2}, [r2]
    vld1.u8 {q5}, [r0]
    // r2 = pPixY - 2 * iStride, the first row that is written back
    sub r2, r2, r1
    // pTc sits above the 64 bytes pushed by vpush
    ldr r3, [sp, #64]
    // NOTE(review): this loads 8 bytes from pTc although only bytes 0..3 are
    // used below; confirm the buffer is at least 8 bytes long.
    vld1.s8 {d31}, [r3]
    // q14 = each of the four tc bytes replicated over four consecutive lanes
    vdup.s8 d28, d31[0]
    vdup.s8 d30, d31[1]
    vdup.s8 d29, d31[2]
    vdup.s8 d31, d31[3]
    vtrn.32 d28, d30
    vtrn.32 d29, d31
    // q10 = (tc >= 0) & filter mask
    vcge.s8 q10, q14, #0
    MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
    vand.u8 q10, q10, q15
    // q15 = -tc
    veor q15, q15
    vsub.i8 q15,q15,q14
    DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
    vst1.u8 {q6}, [r2], r1
    DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
    // widen the clip range by one for each side condition that holds
    vabs.s8 q12, q12
    vabs.s8 q13, q13
    vadd.u8 q14,q14,q12
    vadd.u8 q14,q14,q13
    veor q15, q15
    vsub.i8 q15,q15,q14
    DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
    DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
    vmax.s8 q8, q8, q15
    vmin.s8 q8, q8, q14
    vand.s8 q8, q8, q10
    EXTRACT_DELTA_INTO_TWO_PART q8, q9
    vqadd.u8 q2, q2, q9
    vqsub.u8 q2, q2, q8
    vst1.u8 {q2}, [r2], r1
    vqsub.u8 q3, q3, q9
    vqadd.u8 q3, q3, q8
    vst1.u8 {q3}, [r2] , r1
    vst1.u8 {q7}, [r2]
    vpop {q4-q7}
WELS_ASM_FUNC_END
// void DeblockLumaEq4V_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta)
//   r0 = pPixY, r1 = iStride, r2 = iAlpha, r3 = iBeta.
// Reads the four 16-byte rows above pPixY and the four rows starting at pPixY,
// and rewrites the six rows pPixY - 3*iStride .. pPixY + 2*iStride.
// q4-q7 are used as working registers (DIFF_LUMA_EQ4_P2P1P0 also clobbers q4
// and q5), so they are saved on entry and restored on exit as the procedure
// call standard requires for d8-d15.
WELS_ASM_FUNC_BEGIN DeblockLumaEq4V_neon
    vpush {q4-q7}
    vdup.u8 q5, r2
    vdup.u8 q4, r3
    // r3 = pPixY - 4 * iStride
    sub r3, r0, r1, lsl #2
    vld1.u8 {q8}, [r3], r1
    vld1.u8 {q12}, [r0], r1
    vld1.u8 {q9}, [r3], r1
    vld1.u8 {q13}, [r0], r1
    vld1.u8 {q10}, [r3], r1
    vld1.u8 {q14}, [r0], r1
    vld1.u8 {q11}, [r3]
    vld1.u8 {q15}, [r0]
    // r3 = pPixY - 3 * iStride, the first row that is written back
    sub r3, r3, r1 , lsl #1
    MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
    // q5 = (iAlpha >> 2) + 2, the threshold for the stronger filter
    mov r2, r2, lsr #2
    add r2, r2, #2
    vdup.u8 q5, r2
    vabd.u8 q0, q11, q12
    vclt.u8 q7, q0, q5
    vabd.u8 q1, q9, q11
    vclt.u8 q1, q1, q4
    vand.s8 q1, q1, q7
    vabd.u8 q2, q14,q12
    vclt.u8 q2, q2, q4
    vand.s8 q2, q2, q7
    vand.u8 q7, q7, q6
    vmov q3, q1
    DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
    DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
    vand.u8 q3, q7, q3
    DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
    vst1.u8 {q4}, [r3], r1
    DIFF_LUMA_EQ4_MASK q8,q10, q3, q4
    vst1.u8 {q4}, [r3], r1
    DIFF_LUMA_EQ4_MASK q1,q11, q6, q4
    vst1.u8 {q4}, [r3], r1
    vmov q0, q2
    DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d6
    DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d7
    vand.u8 q0, q7, q0
    DIFF_LUMA_EQ4_MASK q2, q12, q6, q4
    vst1.u8 {q4}, [r3], r1
    DIFF_LUMA_EQ4_MASK q15, q13, q0, q4
    vst1.u8 {q4}, [r3], r1
    DIFF_LUMA_EQ4_MASK q3, q14, q0, q4
    vst1.u8 {q4}, [r3], r1
    vpop {q4-q7}
WELS_ASM_FUNC_END
// void DeblockLumaLt4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc)
//   r0 = pPixY, r1 = iStride, r2 = iAlpha, r3 = iBeta, first stack word = pTc.
// For each of 16 rows, reads the bytes pPixY[-3..2] and rewrites pPixY[-2..1].
// q4-q7 are used as working registers, so they are saved on entry and restored
// on exit as the procedure call standard requires for d8-d15.
WELS_ASM_FUNC_BEGIN DeblockLumaLt4H_neon
    vpush {q4-q7}
    vdup.u8 q11, r2
    vdup.u8 q9, r3
    // r2 walks the three bytes left of the edge, r0 the three bytes right of it
    sub r2, r0, #3
    LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 0
    LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 1
    LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 2
    LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 3
    LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 4
    LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 5
    LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 6
    LORD_LUMA_DATA_3 d0, d1, d2, d6, d7, d8, 7
    LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 0
    LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 1
    LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 2
    LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 3
    LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 4
    LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 5
    LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 6
    LORD_LUMA_DATA_3 d3, d4, d5, d9, d10, d11, 7
    // regroup the d registers so that each of q0..q5 holds one pixel column
    // for all 16 rows
    vswp d1, d2
    vswp d3, d4
    vswp d1, d4
    vswp d7, d8
    vswp d9, d10
    vswp d7, d10
    // rewind r0 by the 16 rows consumed above
    sub r0, r0, r1, lsl #4
    // pTc sits above the 64 bytes pushed by vpush
    ldr r3, [sp, #64]
    // NOTE(review): this loads 8 bytes from pTc although only bytes 0..3 are
    // used below; confirm the buffer is at least 8 bytes long.
    vld1.s8 {d31}, [r3]
    // q14 = each of the four tc bytes replicated over four consecutive lanes
    vdup.s8 d28, d31[0]
    vdup.s8 d30, d31[1]
    vdup.s8 d29, d31[2]
    vdup.s8 d31, d31[3]
    vtrn.32 d28, d30
    vtrn.32 d29, d31
    // q10 = (tc >= 0) & filter mask
    vcge.s8 q10, q14, #0
    MASK_MATRIX q1, q2, q3, q4, q11, q9, q15
    vand.u8 q10, q10, q15
    // q15 = -tc
    veor q15, q15
    vsub.i8 q15,q15,q14
    DIFF_LUMA_LT4_P1_Q1 q0, q1, q2, q3, q9, q15, q14, q10, q6, q12
    DIFF_LUMA_LT4_P1_Q1 q5, q4, q3, q2, q9, q15, q14, q10, q7, q13
    // widen the clip range by one for each side condition that holds
    vabs.s8 q12, q12
    vabs.s8 q13, q13
    vadd.u8 q14,q14,q12
    vadd.u8 q14,q14,q13
    veor q15, q15
    vsub.i8 q15,q15,q14
    DIFF_LUMA_LT4_P0_Q0 d2, d4, d6, d8, d16, q12, q13
    DIFF_LUMA_LT4_P0_Q0 d3, d5, d7, d9, d17, q12, q13
    vmax.s8 q8, q8, q15
    vmin.s8 q8, q8, q14
    vand.s8 q8, q8, q10
    EXTRACT_DELTA_INTO_TWO_PART q8, q9
    vqadd.u8 q2, q2, q9
    vqsub.u8 q2, q2, q8
    vqsub.u8 q3, q3, q9
    vqadd.u8 q3, q3, q8
    // r0 = first output byte of row 0, r2 = same for row 1; each store macro
    // writes two rows, so the step becomes 2 * iStride
    sub r0, #2
    add r2, r0, r1
    lsl r1, #1
    vmov q1, q6
    vmov q4, q7
    vswp q2, q3
    vswp d3, d6
    vswp d5, d8
    STORE_LUMA_DATA_4 d2, d3, d4, d5, 0, 1
    STORE_LUMA_DATA_4 d2, d3, d4, d5, 2, 3
    STORE_LUMA_DATA_4 d2, d3, d4, d5, 4, 5
    STORE_LUMA_DATA_4 d2, d3, d4, d5, 6, 7
    STORE_LUMA_DATA_4 d6, d7, d8, d9, 0, 1
    STORE_LUMA_DATA_4 d6, d7, d8, d9, 2, 3
    STORE_LUMA_DATA_4 d6, d7, d8, d9, 4, 5
    STORE_LUMA_DATA_4 d6, d7, d8, d9, 6, 7
    vpop {q4-q7}
WELS_ASM_FUNC_END
// void DeblockLumaEq4H_neon(uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta)
//   r0 = pPixY, r1 = iStride, r2 = iAlpha, r3 = iBeta.
// For each of 16 rows, reads the bytes pPixY[-4..3] and rewrites pPixY[-3..2].
// q4-q7 are used as working registers (DIFF_LUMA_EQ4_P2P1P0 also clobbers q4
// and q5), so they are saved on entry and restored on exit as the procedure
// call standard requires for d8-d15.
WELS_ASM_FUNC_BEGIN DeblockLumaEq4H_neon
    vpush {q4-q7}
    vdup.u8 q5, r2
    vdup.u8 q4, r3
    sub r3, r0, #4 // pix -= 4
    LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,0
    LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,1
    LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,2
    LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,3
    LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,4
    LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,5
    LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,6
    LORD_LUMA_DATA_4 d16,d17,d18,d19,d24,d25,d26,d27,7
    LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,0
    LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,1
    LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,2
    LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,3
    LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,4
    LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,5
    LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,6
    LORD_LUMA_DATA_4 d20,d21,d22,d23,d28,d29,d30,d31,7
    // regroup the d registers so that each of q8..q15 holds one pixel column
    // for all 16 rows
    vswp q9, q10
    vswp d17,d18
    vswp d21,d22
    vswp q13,q14
    vswp d25,d26
    vswp d29,d30
    // rewind r0 by the 16 rows consumed above
    sub r0, r0, r1 , lsl #4
    MASK_MATRIX q10, q11, q12, q13, q5, q4, q6
    // q5 = (iAlpha >> 2) + 2, the threshold for the stronger filter
    mov r2, r2, lsr #2
    add r2, r2, #2
    vdup.u8 q5, r2
    vabd.u8 q0, q11, q12
    vclt.u8 q7, q0, q5
    vabd.u8 q1, q9, q11
    vclt.u8 q1, q1, q4
    vand.s8 q1, q1, q7
    vabd.u8 q2, q14,q12
    vclt.u8 q2, q2, q4
    vand.s8 q2, q2, q7
    vand.u8 q7, q7, q6
    vmov q3, q1
    DIFF_LUMA_EQ4_P2P1P0 d16, d18, d20, d22, d24, d26, d2, d0
    DIFF_LUMA_EQ4_P2P1P0 d17, d19, d21, d23, d25, d27, d3, d1
    vand.u8 q3, q7, q3
    DIFF_LUMA_EQ4_MASK q0, q9, q3, q4
    vmov q9, q4
    vbsl.u8 q3, q8, q10
    DIFF_LUMA_EQ4_MASK q1,q11, q6, q8
    vand.u8 q7, q7, q2
    DIFF_LUMA_EQ4_P2P1P0 d30, d28, d26, d24, d22, d20, d4, d0
    DIFF_LUMA_EQ4_P2P1P0 d31, d29, d27, d25, d23, d21, d5, d1
    vbsl.u8 q6, q2, q12
    DIFF_LUMA_EQ4_MASK q15, q13, q7, q4
    vbsl.u8 q7, q0, q14
    // gather the six result columns into q2..q7 and regroup them into the
    // register order the lane stores below expect
    vmov q5, q6
    vmov q2, q9
    vmov q6, q4
    vmov q4, q8
    vswp d8, d6
    vswp d5, d7
    vswp d5, d8
    vswp d14, d12
    vswp d11, d13
    vswp d11, d14
    // r3 = three bytes left of the edge; r0 = three bytes from the edge onwards
    sub r3, r0, #3
    STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,0
    STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,1
    STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,2
    STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,3
    STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,4
    STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,5
    STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,6
    STORE_LUMA_DATA_3 d4,d5,d6,d10,d11,d12,7
    STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,0
    STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,1
    STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,2
    STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,3
    STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,4
    STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,5
    STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,6
    STORE_LUMA_DATA_3 d7,d8,d9,d13,d14,d15,7
    vpop {q4-q7}
WELS_ASM_FUNC_END
// DeblockChromaLt4V_neon: clamp-limited chroma deblocking across a horizontal
// edge, 8 pixels of two planes per call.
// Inputs as used by the code below:
//   r0, r1   pointers into the two chroma planes at the first row below the edge
//   r2       row stride in bytes
//   r3       first threshold, broadcast into q11
//   [sp+0]   second threshold, broadcast into q9
//   [sp+4]   pointer to the signed 8-bit clamp (tc) values
// NOTE(review): MASK_MATRIX, DIFF_LUMA_LT4_P0_Q0 and EXTRACT_DELTA_INTO_TWO_PART
// are defined outside this chunk; the notes on them are inferred from usage.
WELS_ASM_FUNC_BEGIN DeblockChromaLt4V_neon
// q4-q7 (d8-d15) are callee-saved in the AAPCS and are used as scratch here.
vpush {q4-q7}
vdup.u8 q11, r3
// Stack arguments sit 64 bytes higher because of the vpush above.
ldr r3, [sp, #64]
// Step both plane pointers back two rows.
sub r0, r0, r2 , lsl #1
sub r1, r1, r2, lsl #1
vdup.u8 q9, r3
ldr r3, [sp, #68]
// Four rows per plane: q0..q3 hold rows -2, -1, 0, +1
// (low half loaded through r0, high half through r1).
vld1.u8 {d0}, [r0], r2
vld1.u8 {d1}, [r1], r2
vld1.u8 {d2}, [r0], r2
vld1.u8 {d3}, [r1], r2
vld1.u8 {d4}, [r0], r2
vld1.u8 {d5}, [r1], r2
vld1.u8 {d6}, [r0]
vld1.u8 {d7}, [r1]
// Rewind to row -1, the first row that is written back.
sub r0, r0, r2, lsl #1
sub r1, r1, r2, lsl #1
// Expand the clamp values: widen to 16 bit, copy each low byte into the high
// byte of its lane so every value covers two adjacent pixels, mirror into d13.
vld1.s8 {d15}, [r3]
vmovl.u8 q6, d15
vshl.u64 d13,d12,#8
vorr d12,d13
vmov d13, d12
// q7 = -q6, the lower clamp bound.
veor q7, q7
vsub.i8 q7,q7,q6
// q5 receives the per-pixel filter mask built from the two thresholds.
MASK_MATRIX q0, q1, q2, q3, q11, q9, q5
// Raw delta for each plane into d8 / d9 (together q4).
DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d8, q12, q13
DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d9, q12, q13
// Clamp the delta to [-tc, tc], apply the mask, drop lanes whose tc is negative.
vmax.s8 q4, q4, q7
vmin.s8 q4, q4, q6
vand.s8 q4, q4, q5
vcge.s8 q6, q6, #0
vand.s8 q4, q4, q6
// Presumably splits the signed delta into two unsigned parts (q5, q4) so the
// saturating u8 add/sub pairs below can apply it.
EXTRACT_DELTA_INTO_TWO_PART q4, q5
// Row -1 gets +delta.
vqadd.u8 q1, q1, q5
vqsub.u8 q1, q1, q4
vst1.u8 {d2}, [r0], r2
vst1.u8 {d3}, [r1], r2
// Row 0 gets -delta.
vqsub.u8 q2, q2, q5
vqadd.u8 q2, q2, q4
vst1.u8 {d4}, [r0]
vst1.u8 {d5}, [r1]
vpop {q4-q7}
WELS_ASM_FUNC_END
// DeblockChromaEq4V_neon: strong (unclamped) chroma deblocking across a
// horizontal edge, 8 pixels of Cb and Cr per call.
// Inputs as used by the code below:
//   r0, r1   pointers into the Cb and Cr planes at the first row below the edge
//   r2       row stride in bytes
//   r3       first threshold, broadcast into q11
//   [sp+0]   second threshold, broadcast into q9
// NOTE(review): MASK_MATRIX and DIFF_CHROMA_EQ4_P0Q0 are defined outside this
// chunk; the notes on them are inferred from usage.
WELS_ASM_FUNC_BEGIN DeblockChromaEq4V_neon
// q4-q7 (d8-d15) are callee-saved in the AAPCS and are used as scratch here.
vpush {q4-q7}
vdup.u8 q11, r3
// Stack argument sits 64 bytes higher because of the vpush above.
ldr r3, [sp, #64]
// Step both plane pointers back two rows.
sub r0, r0, r2 , lsl #1
sub r1, r1, r2, lsl #1
vdup.u8 q9, r3
vld1.u8 {d0}, [r0], r2 // q0::p1
vld1.u8 {d1}, [r1], r2
vld1.u8 {d2}, [r0], r2 // q1::p0
vld1.u8 {d3}, [r1], r2
vld1.u8 {d4}, [r0], r2 // q2::q0
vld1.u8 {d5}, [r1], r2
vld1.u8 {d6}, [r0] // q3::q1
vld1.u8 {d7}, [r1]
sub r0, r0, r2, lsl #1 // pix = [-1*src_stride]
sub r1, r1, r2, lsl #1
// q10 receives the per-pixel filter mask; q11 keeps a second copy because
// each vbsl below overwrites its mask operand with the selected result.
MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
vmov q11, q10
DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q4, q5, q6, d14, d0 // Cb: filtered p0 in d14, filtered q0 in d0
DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q12, q13, q14, d15, d1 // Cr: filtered p0 in d15, filtered q0 in d1
// Take the filtered p0 where the mask is set, the original p0 elsewhere.
vbsl.u8 q10, q7, q1
vst1.u8 {d20}, [r0], r2
vst1.u8 {d21}, [r1], r2
// Same selection for q0.
vbsl.u8 q11, q0, q2
vst1.u8 {d22}, [r0]
vst1.u8 {d23}, [r1]
vpop {q4-q7}
WELS_ASM_FUNC_END
// DeblockChromaLt4H_neon: clamp-limited chroma deblocking across a vertical
// edge, 8 rows of two planes per call.
// Inputs as used by the code below:
//   r0, r1   pointers into the two chroma planes at the first column right of the edge
//   r2       row stride in bytes (used here to rewind 8 rows before the stores)
//   r3       first threshold, broadcast into q11
//   [sp+0]   second threshold, broadcast into q9
//   [sp+4]   pointer to the signed 8-bit clamp (tc) values
// NOTE(review): LORD_CHROMA_DATA_4, STORE_CHROMA_DATA_4, MASK_MATRIX,
// DIFF_LUMA_LT4_P0_Q0 and EXTRACT_DELTA_INTO_TWO_PART are defined outside this
// chunk; the notes on them are inferred from usage.
WELS_ASM_FUNC_BEGIN DeblockChromaLt4H_neon
// q4-q7 (d8-d15) are callee-saved in the AAPCS and are used as scratch here.
vpush {q4-q7}
vdup.u8 q11, r3
// Stack arguments sit 64 bytes higher because of the vpush above.
ldr r3, [sp, #64]
// Start two pixels left of the incoming position.
sub r0, r0, #2
vdup.u8 q9, r3
ldr r3, [sp, #68]
sub r1, r1, #2
vld1.s8 {d15}, [r3]
// One call per row (last argument is the lane index); the pointers
// presumably advance one row per call, see the 8-row rewind below.
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
// Regroup so q0..q3 each hold one pixel column for both planes
// (the three swaps are undone in reverse order before the stores).
vswp q1, q2
vswp d1, d2
vswp d6, d5
// Expand the clamp values: widen to 16 bit, copy each low byte into the high
// byte of its lane so every value covers two adjacent rows, mirror into d13.
vmovl.u8 q6, d15
vshl.u64 d13,d12,#8
vorr d12,d13
vmov d13, d12
// q7 = -q6, the lower clamp bound.
veor q7, q7
vsub.i8 q7,q7,q6
// q5 receives the per-pixel filter mask built from the two thresholds.
MASK_MATRIX q0, q1, q2, q3, q11, q9, q5
// Raw delta for each plane into d8 / d9 (together q4).
DIFF_LUMA_LT4_P0_Q0 d0, d2, d4, d6, d8, q12, q13
DIFF_LUMA_LT4_P0_Q0 d1, d3, d5, d7, d9, q12, q13
// Clamp the delta to [-tc, tc], apply the mask, drop lanes whose tc is negative.
vmax.s8 q4, q4, q7
vmin.s8 q4, q4, q6
vand.s8 q4, q4, q5
vcge.s8 q6, q6, #0
vand.s8 q4, q4, q6
// Presumably splits the signed delta into two unsigned parts (q5, q4) so the
// saturating u8 add/sub pairs below can apply it.
EXTRACT_DELTA_INTO_TWO_PART q4, q5
vqadd.u8 q1, q1, q5
vqsub.u8 q1, q1, q4
vqsub.u8 q2, q2, q5
vqadd.u8 q2, q2, q4
// Rewind both pointers by 8 rows for the write-back.
sub r0, r0, r2, lsl #3
sub r1, r1, r2, lsl #3
// Undo the regrouping done after the loads.
vswp d1, d2
vswp d6, d5
vswp q1, q2
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
vpop {q4-q7}
WELS_ASM_FUNC_END
// DeblockChromaEq4H_neon: strong (unclamped) chroma deblocking across a
// vertical edge, 8 rows of Cb and Cr per call.
// Inputs as used by the code below:
//   r0, r1   pointers into the Cb and Cr planes at the first column right of the edge
//   r2       row stride in bytes (used here to rewind 8 rows before the stores)
//   r3       first threshold, broadcast into q11
//   [sp+0]   second threshold, broadcast into q9
// NOTE(review): LORD_CHROMA_DATA_4, STORE_CHROMA_DATA_4, MASK_MATRIX and
// DIFF_CHROMA_EQ4_P0Q0 are defined outside this chunk; the notes on them are
// inferred from usage.
WELS_ASM_FUNC_BEGIN DeblockChromaEq4H_neon
// q4 and q5 (d8-d11) are callee-saved in the AAPCS and receive results below.
vpush {q4-q5}
vdup.u8 q11, r3
// Stack argument sits 32 bytes higher because of the vpush above.
ldr r3, [sp, #32]
// Start two pixels left of the incoming position.
sub r0, r0, #2
sub r1, r1, #2
// One call per row (last argument is the lane index).
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
LORD_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
// Regroup so q0..q3 each hold one pixel column for both planes
// (the three swaps are undone in reverse order before the stores).
vswp q1, q2
vswp d1, d2
vswp d6, d5
vdup.u8 q9, r3
// q10 receives the per-pixel filter mask; q11 keeps a second copy because
// each vbsl below overwrites its mask operand with the selected result.
MASK_MATRIX q0, q1, q2, q3, q11, q9, q10
vmov q11, q10
// Filtered p0 lands in d8 / d9 (q4), filtered q0 in d10 / d11 (q5).
DIFF_CHROMA_EQ4_P0Q0 d0, d2, d4, d6, q8, q9, q12, d8, d10
DIFF_CHROMA_EQ4_P0Q0 d1, d3, d5, d7, q13, q14, q15, d9, d11
// Take the filtered value where the mask is set, the original elsewhere.
vbsl.u8 q10, q4, q1
vbsl.u8 q11, q5, q2
sub r0, r0, r2, lsl #3 // pix: 0th row [-2]
sub r1, r1, r2, lsl #3
vmov q1, q10
vmov q2, q11
// Undo the regrouping done after the loads.
vswp d1, d2
vswp d6, d5
vswp q1, q2
// Cb:d0d1d2d3, Cr:d4d5d6d7
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 0
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 1
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 2
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 3
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 4
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 5
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 6
STORE_CHROMA_DATA_4 d0, d1, d2, d3, d4, d5, d6, d7, 7
vpop {q4-q5}
WELS_ASM_FUNC_END
// enc_avc_non_zero_count_neon: rewrites the 24 bytes at r0 in place so that
// every non-zero byte becomes 1 and every zero byte stays 0.
WELS_ASM_FUNC_BEGIN enc_avc_non_zero_count_neon
vld1.64 {d0-d2}, [r0]
// Bytes 0-15: 0xFF where zero, invert, then abs(-1) = 1.
vceq.s8 q0, q0, #0
vmvn q0, q0
vabs.s8 q0, q0
// Bytes 16-23: same three steps on the remaining D register.
vceq.s8 d2, d2, #0
vmvn d2, d2
vabs.s8 d2, d2
vst1.64 {d0-d2}, [r0]
WELS_ASM_FUNC_END
#endif

View File

@ -39,6 +39,51 @@
extern "C" {
#endif//__cplusplus
#if defined(HAVE_NEON)
// NEON motion-compensation routines. The implementations are not part of
// this header; parameter meanings below are read from the names only.
// NOTE(review): strides are presumably in bytes - confirm against the assembly.

// Block copies for widths 4, 8 and 16.
void McCopyWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McCopyWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McCopyWidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
// Chroma interpolation; pWeights supplies the interpolation weights.
void McChromaWidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
void McChromaWidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t* pWeights, int32_t iHeight);
// Average of two source blocks written to pDst. Only the destination stride is
// passed, so the layout of pSrcA / pSrcB is fixed by the implementation.
void PixelAvgWidthEq16_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
void PixelAvgWidthEq8_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
void PixelAvgWidthEq4_neon(uint8_t* pDst, int32_t iDstStride, uint8_t* pSrcA, uint8_t* pSrcB, int32_t iHeight);
// McHorVerXY: presumably the (X, Y) quarter-sample position, following the
// (2, 0) / (0, 2) / (2, 2) comments further down - TODO confirm for 01/03/10/30.
void McHorVer01WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer01WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer01WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer03WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer10WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer30WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
// Horizontal filter producing the half sample at quarter-sample position (2, 0).
void McHorVer20WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer20WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer20WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
// Vertical filter producing the half sample at quarter-sample position (0, 2).
void McHorVer02WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer02WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer02WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
// Horizontal plus vertical filter producing the half sample at position (2, 2).
void McHorVer22WidthEq16_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer22WidthEq8_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
void McHorVer22WidthEq4_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iHeight);
#endif
#if defined(X86_ASM)
//***************************************************************************//
// MMXEXT definition //

View File

@ -0,0 +1,203 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
// ROW_TRANSFORM_1_STEP, Apple assembler form (positional arguments 0 to 9).
// First butterfly stage of the row pass: arguments 0-3 are D registers with
// 16-bit inputs, 4-7 are Q registers receiving the widened 32-bit results,
// 8 and 9 are D-register scratch.
.macro ROW_TRANSFORM_1_STEP
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
vaddl.s16 $4, $0, $2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 $5, $0, $2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 $8, $1, #1 // scratch = src[1]>>1
vshr.s16 $9, $3, #1 // scratch = src[3]>>1
vsubl.s16 $6, $8, $3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 $7, $1, $9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
// TRANSFORM_4BYTES, Apple assembler form (positional arguments 0 to 7).
// Second butterfly stage, shared by the row and column passes: arguments 4-7
// are the e inputs, 0-3 receive the f outputs. All lanes are 32-bit here
// (vadd.s32 / vsub.s32), whatever type the line comments mention.
.macro TRANSFORM_4BYTES // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 $0, $4, $7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 $1, $5, $6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 $2, $5, $6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 $3, $4, $7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
// COL_TRANSFORM_1_STEP, Apple assembler form (positional arguments 0 to 7).
// First butterfly stage of the column pass on 32-bit lanes: arguments 0-3 are
// the f inputs, 4-7 receive the e outputs. Outputs 6 and 7 double as scratch
// for the halved inputs, so they must not alias inputs 1 or 3.
.macro COL_TRANSFORM_1_STEP
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 $4, $0, $2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 $5, $0, $2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 $6, $1, #1 // f[1][j]>>1, kept in the e[2] output
vshr.s32 $7, $3, #1 // f[3][j]>>1, kept in the e[3] output
vsub.s32 $6, $6, $3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 $7, $1, $7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#else
// ROW_TRANSFORM_1_STEP, GNU assembler form (named arguments arg0 to arg9).
// First butterfly stage of the row pass: arg0-arg3 are D registers with
// 16-bit inputs, arg4-arg7 are Q registers receiving the widened 32-bit
// results, arg8 and arg9 are D-register scratch.
.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// { // input: src_d[0]~[3], output: e_q[0]~[3]; working: arg8 arg9
vaddl.s16 \arg4, \arg0, \arg2 //int32 e[i][0] = src[0] + src[2];
vsubl.s16 \arg5, \arg0, \arg2 //int32 e[i][1] = src[0] - src[2];
vshr.s16 \arg8, \arg1, #1 // scratch = src[1]>>1
vshr.s16 \arg9, \arg3, #1 // scratch = src[3]>>1
vsubl.s16 \arg6, \arg8, \arg3 //int32 e[i][2] = (src[1]>>1)-src[3];
vaddl.s16 \arg7, \arg1, \arg9 //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm
// TRANSFORM_4BYTES, GNU assembler form (named arguments arg0 to arg7).
// Second butterfly stage, shared by the row and column passes: arg4-arg7 are
// the e inputs, arg0-arg3 receive the f outputs. All lanes are 32-bit here
// (vadd.s32 / vsub.s32), whatever type the line comments mention.
.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
// { // output: f_q[0]~[3], input: e_q[0]~[3];
vadd.s32 \arg0, \arg4, \arg7 //int16 f[i][0] = e[i][0] + e[i][3];
vadd.s32 \arg1, \arg5, \arg6 //int16 f[i][1] = e[i][1] + e[i][2];
vsub.s32 \arg2, \arg5, \arg6 //int16 f[i][2] = e[i][1] - e[i][2];
vsub.s32 \arg3, \arg4, \arg7 //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm
// COL_TRANSFORM_1_STEP, GNU assembler form (named arguments arg0 to arg7).
// First butterfly stage of the column pass on 32-bit lanes: arg0-arg3 are the
// f inputs, arg4-arg7 receive the e outputs. arg6 and arg7 double as scratch
// for the halved inputs, so they must not alias arg1 or arg3.
.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// { // input: src_q[0]~[3], output: e_q[0]~[3];
vadd.s32 \arg4, \arg0, \arg2 //int32 e[0][j] = f[0][j] + f[2][j];
vsub.s32 \arg5, \arg0, \arg2 //int32 e[1][j] = f[0][j] - f[2][j];
vshr.s32 \arg6, \arg1, #1 // f[1][j]>>1, kept in the e[2] output
vshr.s32 \arg7, \arg3, #1 // f[3][j]>>1, kept in the e[3] output
vsub.s32 \arg6, \arg6, \arg3 //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
vadd.s32 \arg7, \arg1, \arg7 //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm
#endif
// SetNonZeroCount_neon
// r0 int16_t* block (not read by this routine),
// r1 int8_t* non_zero_count
// Rewrites the 24 bytes at r1 in place: non-zero bytes become 1, zero bytes stay 0.
WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon
vld1.64 {d0-d2}, [r1]
// Bytes 0-15: 0xFF where zero, invert, then abs(-1) = 1.
vceq.s8 q0, q0, #0
vmvn q0, q0
vabs.s8 q0, q0
// Bytes 16-23: same three steps on the remaining D register.
vceq.s8 d2, d2, #0
vmvn d2, d2
vabs.s8 d2, d2
vst1.64 {d0-d2}, [r1]
WELS_ASM_FUNC_END
// WelsResBlockZero16x16_neon
// r0 int16_t * block,
// r1 int32_t stride (in int16_t elements; doubled below to get bytes)
// Clears 16 rows of 16 int16_t each, i.e. per row:
//   memset(ptr_dest, 0, 16*sizeof(int16_t)); ptr_dest += stride;
// Also usable for a contiguous buffer of 256*sizeof(int16_t).
WELS_ASM_FUNC_BEGIN WelsResBlockZero16x16_neon
push {r2}
// q0:q1 = 32 zero bytes, one full row
vmov.i16 q0, #0
vmov.i16 q1, #0
lsl r1, r1, #1 // r1 = 2*r1
mov r2, #8 // 8 passes of two rows each
block_zero_16x16_luma_loop:
vst1.i16 {q0, q1}, [r0], r1
vst1.i16 {q0, q1}, [r0], r1
subs r2, r2, #1
bne block_zero_16x16_luma_loop
pop {r2}
WELS_ASM_FUNC_END
// WelsResBlockZero8x8_neon
// r0 = int16_t block pointer, r1 = stride in int16_t elements (doubled below
// to get bytes).
// Clears 8 rows of 8 int16_t each, i.e. per row:
//   memset(ptr_dest, 0, 8*sizeof(int16_t)); ptr_dest += stride;
// Also usable for a contiguous buffer of 64*sizeof(int16_t).
WELS_ASM_FUNC_BEGIN WelsResBlockZero8x8_neon
push {r2}
// q0 = 16 zero bytes, one full row
vmov.i16 q0, #0
lsl r1, r1, #1
mov r2, #4 // 4 passes of two rows each
block_zero_8x8_chma_loop:
vst1.i16 {q0}, [r0], r1
vst1.i16 {q0}, [r0], r1
subs r2, r2, #1
bne block_zero_8x8_chma_loop
pop {r2}
WELS_ASM_FUNC_END
// IdctResAddPred_neon
// r0 uint8_t *pred, r1 const int32_t stride, r2 int16_t *rs
// Runs the 4x4 inverse transform on the 16 coefficients at rs, rounds the
// result with a shift of 6, adds it to the 4x4 block at pred and writes the
// sum back to pred with unsigned saturation.
// Only caller-saved NEON registers (q0-q3, q8-q11) are used, so the
// callee-saved bank d8-d15 is left untouched as the AAPCS requires.
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon
// De-interleaving load: d0..d3 each receive every fourth coefficient.
vld4.s16 {d0, d1, d2, d3}, [r2] // cost 3 cycles!
ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5
TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
// transform element 32bits
vtrn.s32 q0, q1 //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
vtrn.s32 q2, q3 //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
vswp d1, d4 //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
vswp d3, d6 //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11
TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11
// Keep the start of the block in r2 for the stores; r0 walks the loads.
mov r2, r0
// Prediction rows 0,1 into d20 and rows 2,3 into d22 (4 bytes per row).
vld1.32 {d20[0]},[r0],r1
vld1.32 {d20[1]},[r0],r1
vld1.32 {d22[0]},[r0],r1
vld1.32 {d22[1]},[r0]
// Rounding narrow: (x + 32) >> 6, 32-bit to 16-bit; q8 = rows 0,1, q9 = rows 2,3.
vrshrn.s32 d16, q0, #6
vrshrn.s32 d17, q1, #6
vrshrn.s32 d18, q2, #6
vrshrn.s32 d19, q3, #6
// Widen the prediction to 16 bit and add the residual.
vmovl.u8 q0,d20
vmovl.u8 q1,d22
vadd.s16 q0,q8
vadd.s16 q1,q9
// Saturate into [0, 255] while narrowing back to bytes.
vqmovun.s16 d20,q0
vqmovun.s16 d22,q1
vst1.32 {d20[0]},[r2],r1
vst1.32 {d20[1]},[r2],r1
vst1.32 {d22[0]},[r2],r1
vst1.32 {d22[1]},[r2]
WELS_ASM_FUNC_END
#endif

View File

@ -0,0 +1,649 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifdef HAVE_NEON
//Global macro
.text
#include "arm_arch_common_macro.S"
#ifdef APPLE_IOS
// GET_8BYTE_DATA, Apple assembler form (positional arguments 0 to 2).
// Gathers 8 single bytes into lanes 0-7 of D register 0, reading through
// pointer register 1 and post-incrementing it by register 2 after every byte
// (so the pointer ends 8 steps further on).
.macro GET_8BYTE_DATA
vld1.8 {$0[0]}, [$1], $2
vld1.8 {$0[1]}, [$1], $2
vld1.8 {$0[2]}, [$1], $2
vld1.8 {$0[3]}, [$1], $2
vld1.8 {$0[4]}, [$1], $2
vld1.8 {$0[5]}, [$1], $2
vld1.8 {$0[6]}, [$1], $2
vld1.8 {$0[7]}, [$1], $2
.endmacro
#else
// GET_8BYTE_DATA, GNU assembler form.
// Gathers 8 single bytes into lanes 0-7 of D register arg0, reading through
// pointer register arg1 and post-incrementing it by register arg2 after every
// byte (so the pointer ends 8 steps further on).
.macro GET_8BYTE_DATA arg0, arg1, arg2
vld1.8 {\arg0[0]}, [\arg1], \arg2
vld1.8 {\arg0[1]}, [\arg1], \arg2
vld1.8 {\arg0[2]}, [\arg1], \arg2
vld1.8 {\arg0[3]}, [\arg1], \arg2
vld1.8 {\arg0[4]}, [\arg1], \arg2
vld1.8 {\arg0[5]}, [\arg1], \arg2
vld1.8 {\arg0[6]}, [\arg1], \arg2
vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif
// WelsDecoderI16x16LumaPredV_neon: 16x16 vertical prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Copies the 16 bytes directly above the block into each of its 16 rows.
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
//Get the top line data into q0 (d0, d1)
sub r2, r0, r1
vldm r2, {d0, d1}
// r2 becomes the write pointer, r3 counts 4 passes of 4 rows
mov r2, r0
mov r3, #4
//Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
vst1.8 {d0,d1}, [r2], r1
subs r3, #1
bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END
// WelsDecoderI16x16LumaPredH_neon: 16x16 horizontal prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Fills each of the 16 rows with the byte immediately to its left.
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
// r2 walks the left neighbour column, r3 counts 4 passes of 4 rows
sub r2, r0, #1
mov r3, #4
loop_0_get_i16x16_luma_pred_h:
//Get one byte data from left side (broadcast to all 16 lanes)
vld1.8 {d0[],d1[]}, [r2], r1
vld1.8 {d2[],d3[]}, [r2], r1
vld1.8 {d4[],d5[]}, [r2], r1
vld1.8 {d6[],d7[]}, [r2], r1
//Set the line of MB using the left side byte data
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d2,d3}, [r0], r1
vst1.8 {d4,d5}, [r0], r1
vst1.8 {d6,d7}, [r0], r1
subs r3, #1
bne loop_0_get_i16x16_luma_pred_h
WELS_ASM_FUNC_END
// WelsDecoderI16x16LumaPredDc_neon: 16x16 DC prediction using both neighbours.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Fills the block with (sum of 16 left + 16 top bytes + 16) >> 5.
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
//Get the left vertical line data (16 bytes into d0, d1)
sub r2, r0, #1
GET_8BYTE_DATA d0, r2, r1
GET_8BYTE_DATA d1, r2, r1
//Get the top horizontal line data (16 bytes into d2, d3)
sub r2, r0, r1
vldm r2, {d2, d3}
//Calculate the sum of top horizontal line data and vertical line data
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vadd.u16 q0, q0, q1
vadd.u16 d0, d0, d1
vpaddl.u16 d0, d0
vpaddl.u32 d0, d0
//Calculate the mean value: rounding shift by 5; the total is at most
//32*255, so it sits entirely in the lowest 16-bit lane
vrshr.u16 d0, d0, #5
vdup.8 q0, d0[0]
//Set the mean value to the all of member of MB (4 passes of 4 rows)
mov r2, #4
loop_0_get_i16x16_luma_pred_dc_both:
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
vst1.8 {d0,d1}, [r0], r1
subs r2, #1
bne loop_0_get_i16x16_luma_pred_dc_both
WELS_ASM_FUNC_END
//The table for SIMD instruction {(8,7,6,5,4,3,2,1) * 5}
CONST0_GET_I16X16_LUMA_PRED_PLANE: .long 0x191e2328, 0x050a0f14
//The table for SIMD instruction {-7,-6,-5,-4,-3,-2,-1,0}
CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
// WelsDecoderI16x16LumaPredPlane_neon: 16x16 plane prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Computes a = 16*(top[15] + left[15]) and the two gradients b, c from the
// top row and left column, then writes clip((a + b*(x-7) + c*(y-7) + 16) >> 5).
// Only caller-saved NEON registers (q0-q3, q8) are used, so the callee-saved
// bank d8-d15 is left untouched as the AAPCS requires.
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
//Load the table {(8,7,6,5,4,3,2,1) * 5}
adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
vldr d0, [r2]
//Pack the top[-1] ~ top[6] to d1
sub r2, r0, r1
sub r3, r2, #1
vld1.8 d1, [r3]
//Pack the top[8] ~ top[15] to d2
add r3, #9
vld1.8 d2, [r3]
//Save the top[15] to d6 for next step
vdup.u8 d6, d2[7]
//Get and pack left[-1] ~ left[6] to d4
sub r3, r2, #1
GET_8BYTE_DATA d4, r3, r1
//Get and pack left[8] ~ left[15] to d3 (r3 stopped at left[7], skip it)
add r3, r1
GET_8BYTE_DATA d3, r3, r1
//Save the left[15] to d7 for next step
vdup.u8 d7, d3[7]
//revert the sequence of d2,d3 so index 15 comes first
vrev64.8 q1, q1
vsubl.u8 q2, d3, d4 //q2={left[15]-left[-1],left[14]-left[0], ... ,left[8]-left[6]}
vsubl.u8 q1, d2, d1 //q1={top[15]-top[-1],top[14]-top[0], ... ,top[8]-top[6]}
vmovl.u8 q0, d0
vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}
//Calculate the sum of items of q1, q2 (d0 = horizontal sum, d1 = vertical sum)
vpadd.s16 d0, d2, d3
vpadd.s16 d1, d4, d5
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
//Get the value of b and c (rounding shift by 6) and extend to q1, q2.
vrshr.s64 q0, #6
vdup.s16 q1, d0[0]
vdup.s16 q2, d1[0]
//Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
vld1.32 {d0}, [r2]
//Get the value of a and save to q3
vaddl.u8 q3, d6, d7
vshl.u16 q3, #4
//calculate a + b*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
vmovl.s8 q0, d0
vmla.s16 q3, q0, q1
vmla.s16 q3, q2, d0[0]
//Calculate a + b*{1,2,3,4,5,6,7,8} + c*{-7} as the left half plus 8*b
vshl.s16 q8, q1, #3
vadd.s16 q8, q3
//right shift 5 bits and rounding
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5
//Set the line of MB
vst1.u32 {d0,d1}, [r0], r1
//Do the same processing for setting other lines: add c once per row
mov r2, #15
loop_0_get_i16x16_luma_pred_plane:
vadd.s16 q3, q2
vadd.s16 q8, q2
vqrshrun.s16 d0, q3, #5
vqrshrun.s16 d1, q8, #5
vst1.u32 {d0,d1}, [r0], r1
subs r2, #1
bne loop_0_get_i16x16_luma_pred_plane
WELS_ASM_FUNC_END
// WelsDecoderI4x4LumaPredV_neon: 4x4 vertical prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Copies the 4 bytes directly above the block into each of its 4 rows,
// using plain 32-bit core-register loads and stores.
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
//Load the top row (4 bytes)
sub r2, r0, r1
ldr r2, [r2]
//Set the luma MB using top line
str r2, [r0], r1
str r2, [r0], r1
str r2, [r0], r1
str r2, [r0]
WELS_ASM_FUNC_END
// WelsDecoderI4x4LumaPredH_neon: 4x4 horizontal prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Fills each of the 4 rows with the byte immediately to its left.
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
//Load the left column (4 bytes), each broadcast across one D register
sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2]
//Set the luma MB using the left side byte (4 bytes per row)
vst1.32 {d0[0]}, [r0], r1
vst1.32 {d1[0]}, [r0], r1
vst1.32 {d2[0]}, [r0], r1
vst1.32 {d3[0]}, [r0]
WELS_ASM_FUNC_END
// WelsDecoderI4x4LumaPredDDL_neon: 4x4 diagonal-down-left prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Reads the 8 bytes t0..t7 above the block and builds the 3-tap values
// ddl[i] = (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2, with t7 repeated past the end;
// row r is written from ddl[r..r+3].
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
//Load the top row data(8 bytes)
sub r2, r0, r1
vld1.32 {d0}, [r2]
//For "t7 + (t7<<1)"
vdup.8 d1, d0[7]
//calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
vext.8 d1, d0, d1, #1
vaddl.u8 q1, d1, d0
//calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
//(lane 0 mixes in the wrapped-around last lane and is never stored)
vext.8 q2, q1, q1, #14
vadd.u16 q0, q1, q2
//right shift 2 bits and rounding; d0 = {x, ddl0, ddl1, ..., ddl6}
vqrshrn.u16 d0, q0, #2
//Save "ddl0, ddl1, ddl2, ddl3"
vext.8 d1, d0, d0, #1
vst1.32 d1[0], [r0], r1
//Save "ddl1, ddl2, ddl3, ddl4"
vext.8 d1, d0, d0, #2
vst1.32 d1[0], [r0], r1
//Save "ddl2, ddl3, ddl4, ddl5"
vext.8 d1, d0, d0, #3
vst1.32 d1[0], [r0], r1
//Save "ddl3, ddl4, ddl5, ddl6"
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
// WelsDecoderI4x4LumaPredDDR_neon: 4x4 diagonal-down-right prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Packs the neighbours as {L3,L2,L1,L0,LT,T0,T1,T2,T3}, applies the 3-tap
// (a + 2b + c + 2) >> 2 filter along that line, and writes each row as a
// 4-byte window that slides one position towards L3 per row.
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
//Load the top row (4 bytes) into the upper half of d0
sub r2, r0, r1
vld1.32 {d0[1]}, [r2]
//Load the top-left corner and the left column (5 bytes), stored in reverse
sub r2, #1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2], r1
vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing
vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
//d2:{L3,L2,L1,L0,LT,T0,T1,T2}
//q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
vaddl.u8 q2, d2, d0
//q3 = q2 shifted up by one lane; lane 0 comes from the previous content of
//q3 and only reaches byte 0 of the result, which is never stored
vext.8 q3, q3, q2, #14
//q1: lane i = q2[i] + q2[i-1], the 3-tap sums before rounding
vadd.u16 q1, q2, q3
//right shift 2 bits and rounding
vqrshrn.u16 d0, q1, #2
//Adjust the data sequence for setting luma MB of pred:
//row 0 is bytes 4-7; each following row rotates d0 by one byte first
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0], r1
vext.8 d0, d0, d0, #7
vst1.32 d0[1], [r0]
WELS_ASM_FUNC_END
// WelsDecoderI4x4LumaPredVL_neon: 4x4 vertical-left prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Reads the 8 bytes t0..t7 above the block. Rows 0 and 2 use the 2-tap
// averages (t[i] + t[i+1] + 1) >> 1, rows 1 and 3 the 3-tap values
// (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2; rows 2 and 3 start one sample later.
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
//Load the top row (8 bytes)
sub r2, r0, r1
vld1.32 {d0}, [r2]
vext.8 d1, d0, d0, #1
vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}
vext.8 q2, q1, q1, #2
vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}
//calculate the "vl0,vl1,vl2,vl3,vl4"
vqrshrn.u16 d0, q1, #1
//calculate the "vl5,vl6,vl7,vl8,vl9"
vqrshrn.u16 d1, q2, #2
//Adjust the data sequence for setting the luma MB
vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0], r1
// rows 2 and 3 reuse the same values shifted by one byte
vext.8 d0, d0, d0, #1
vext.8 d1, d1, d1, #1
vst1.32 d0[0], [r0], r1
vst1.32 d1[0], [r0]
WELS_ASM_FUNC_END
// WelsDecoderI4x4LumaPredVR_neon: 4x4 vertical-right prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Packs the neighbours as {L2,L1,L0,LT,T0,T1,T2,T3}; d0 ends up holding the
// 2-tap averages and d1 the 3-tap filtered values along that line.
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
//Load the top row (4 bytes) into the upper half of d0
sub r2, r0, r1
vld1.32 {d0[1]}, [r2]
//Load the top-left corner and three left-column bytes, stored in reverse
sub r2, #1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2]
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}
vext.u8 q2, q1, q1, #14
vadd.u16 q2, q2, q1 //q2:{X,X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}
//Calculate the vr0 ~ vr9
vqrshrn.u16 d1, q2, #2
vqrshrn.u16 d0, q1, #1
//Adjust the data sequence for setting the luma MB
//row 0: the four 2-tap averages starting at (LT,T0)
vst1.32 d0[1], [r0], r1
//row 1: the four 3-tap values starting at (L0,LT,T0)
vst1.32 d1[1], [r0], r1
// r0 now points at row 2, r2 at row 3
add r2, r0, r1
//row 2: filtered (L1,L0,LT), then the first three bytes of row 0
vst1.8 d1[3], [r0]!
vst1.16 d0[2], [r0]!
vst1.8 d0[6], [r0]!
//row 3: filtered (L2,L1,L0), then the first three bytes of row 1
vst1.8 d1[2], [r2]!
vst1.16 d1[2], [r2]!
vst1.8 d1[6], [r2]
WELS_ASM_FUNC_END
// WelsDecoderI4x4LumaPredHU_neon: 4x4 horizontal-up prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Uses only the left column L0..L3: 2-tap averages and 3-tap filtered values
// are interleaved, and the last row is L3 repeated.
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
//Load the left column data; r3 = address of L3 (three rows below L0)
sub r2, r0, #1
mov r3, #3
mul r3, r1
add r3, r2
vld1.8 {d0[]}, [r3]
vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}
vext.8 d1, d0, d0, #1
vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}
vext.u8 d2, d5, d4, #2
vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}
//Calculate the hu0 ~ hu5
//d2 is overwritten here, so only the upper half of d1 (taken from d3) is
//meaningful after the second narrow
vqrshrn.u16 d2, q2, #1
vqrshrn.u16 d1, q1, #2
//Adjust the data sequence for setting the luma MB
//after the zip, d1 = {avg01, f012, avg12, f123, avg23, f233, L3, L3}
vzip.8 d2, d1
vst1.32 d1[0], [r0], r1
vext.8 d2, d1, d1, #2
vst1.32 d2[0], [r0], r1
vst1.32 d1[1], [r0], r1
// last row: L3 in all four positions
vst1.32 d0[0], [r0]
WELS_ASM_FUNC_END
// WelsDecoderI4x4LumaPredHD_neon: 4x4 horizontal-down prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Packs the neighbours as {L3,L2,L1,L0,LT,T0,T1,T2}; d0 ends up holding the
// 2-tap averages and d1 the 3-tap filtered values, which are then paired up
// into 16-bit units and written two units per row.
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
//Load the data: LT,T0,T1,T2 first, then the left column in reverse
sub r2, r0, r1
sub r2, #1
vld1.32 {d0[1]}, [r2], r1
vld1.8 {d0[3]}, [r2], r1
vld1.8 {d0[2]}, [r2], r1
vld1.8 {d0[1]}, [r2], r1
vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}
vext.8 d1, d0, d0, #7
vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}
vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}
//Calculate the hd0~hd9
vqrshrn.u16 d1, q3, #2
vqrshrn.u16 d0, q2, #1
//Adjust the data sequence for setting the luma MB
// d3 keeps the untransposed 3-tap values for the right half of row 0
vmov d3, d1
// interleave so each 16-bit lane is an (average, filtered) pair
vtrn.8 d0, d1
vext.u8 d2, d1, d1, #6
//row 0
vst2.16 {d2[3], d3[3]}, [r0], r1
//row 1
vst2.16 {d0[2], d1[2]}, [r0], r1
vmov d3, d0
//row 2
vst2.16 {d2[2], d3[2]}, [r0], r1
//row 3
vst2.16 {d0[1], d1[1]}, [r0]
WELS_ASM_FUNC_END
// WelsDecoderIChromaPredV_neon: 8x8 chroma vertical prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Copies the 8 bytes directly above the block into each of its 8 rows.
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
//Get the top row (8 byte)
sub r2, r0, r1
vldr d0, [r2]
//Set the chroma MB using top row data
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0], r1
vst1.8 {d0}, [r0]
WELS_ASM_FUNC_END
// WelsDecoderIChromaPredH_neon: 8x8 chroma horizontal prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Fills each of the 8 rows with the byte immediately to its left.
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
//Get the left column (8 byte), each byte broadcast across one D register
sub r2, r0, #1
vld1.8 {d0[]}, [r2], r1
vld1.8 {d1[]}, [r2], r1
vld1.8 {d2[]}, [r2], r1
vld1.8 {d3[]}, [r2], r1
vld1.8 {d4[]}, [r2], r1
vld1.8 {d5[]}, [r2], r1
vld1.8 {d6[]}, [r2], r1
vld1.8 {d7[]}, [r2]
//Set the chroma MB using left column data
vst1.8 {d0}, [r0], r1
vst1.8 {d1}, [r0], r1
vst1.8 {d2}, [r0], r1
vst1.8 {d3}, [r0], r1
vst1.8 {d4}, [r0], r1
vst1.8 {d5}, [r0], r1
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r0]
WELS_ASM_FUNC_END
// WelsDecoderIChromaPredDC_neon: 8x8 chroma DC prediction, one DC value per
// 4x4 quadrant.
// r0 = top-left pixel of the block, r1 = stride in bytes.
//   top-left     = (left[0..3] + top[0..3] + 4) >> 3
//   top-right    = (top[4..7] + 2) >> 2
//   bottom-left  = (left[4..7] + 2) >> 2
//   bottom-right = (left[4..7] + top[4..7] + 4) >> 3
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredDC_neon
//Load the left column data (8 bytes)
sub r2, r0, #1
GET_8BYTE_DATA d0, r2, r1
//Load the top row data (8 bytes)
sub r2, r0, r1
vldr d1, [r2]
//Calculate the sum of left column and top row:
//d0 = {left[0..3], left[4..7]} sums, d1 = {top[0..3], top[4..7]} sums
vpaddl.u8 q0, q0
vpaddl.u16 q0, q0
vadd.u32 d2, d0, d1 //combined left+top sums for both halves, saved to d2
vrshr.u32 q0, q0, #2 //means of the four single-sided groups
vrshr.u32 d2, d2, #3 //means of the two combined groups
//duplicate the DC values to a vector line
vdup.8 d4, d2[0]
vdup.8 d5, d1[4]
vdup.8 d6, d0[4]
vdup.8 d7, d2[4]
//Set the chroma MB: each store writes 4 bytes of the left value followed by
//4 bytes of the right value; rows 0-3 use d4/d5, rows 4-7 use d6/d7
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d4[0],d5[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0], r1
vst2.32 {d6[0],d7[0]}, [r0]
WELS_ASM_FUNC_END
//Table {{1,2,3,4,1,2,3,4}*17}, one byte per entry
CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211
//Table {-3,-2,-1,0,1,2,3,4}, one signed 16-bit value per entry
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003
// WelsDecoderIChromaPredPlane_neon: 8x8 chroma plane prediction.
// r0 = top-left pixel of the block, r1 = stride in bytes.
// Computes a = 16*(top[7] + left[7]) and the two gradients b, c from the top
// row and left column, then writes clip((a + b*(x-3) + c*(y-3) + 16) >> 5).
WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
//Load the top row data: LT,T0,T1,T2 into d1 low half, T4..T7 into d0 low half
sub r2, r0, #1
sub r2, r1
vld1.32 {d1[0]}, [r2]
add r2, #5
vld1.32 {d0[0]}, [r2]
//Load the left column data, starting again from the top-left corner
sub r2, #5
vld1.8 {d1[4]}, [r2], r1
vld1.8 {d1[5]}, [r2], r1
vld1.8 {d1[6]}, [r2], r1
vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
// skip L3, which is not used
add r2, r1
vld1.8 {d0[4]}, [r2], r1
vld1.8 {d0[5]}, [r2], r1
vld1.8 {d0[6]}, [r2], r1
vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}
//Save T7 to d3 for next step
vdup.u8 d3, d0[3]
//Save L7 to d4 for next step
vdup.u8 d4, d0[7]
//Calculate the value of a and save to q2
vaddl.u8 q2, d3, d4
vshl.u16 q2, #4
//Load the table {{1,2,3,4,1,2,3,4}*17}
adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
vld1.32 {d2}, [r2]
//Calculate b and c, and save to q0 (d0 = horizontal, d1 = vertical)
// reverse each 4-byte half of d1 so that it pairs T4 with T2 ... T7 with LT
vrev32.8 d1, d1
vsubl.u8 q0, d0, d1
vmovl.u8 q1, d2
vmul.s16 q0, q1
vpaddl.s16 q0, q0
vpaddl.s32 q0, q0
vrshr.s64 q0, #5
//Load the table {-3,-2,-1,0,1,2,3,4} to q3
adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
vld1.32 {d6, d7}, [r2]
//Duplicate b and c across q0 and q1 for SIMD instruction
vdup.s16 q1, d1[0]
vdup.s16 q0, d0[0]
//Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
vmla.s16 q2, q0, q3
vmla.s16 q2, q1, d6[0]
vqrshrun.s16 d0, q2, #5
//Set a line of chroma MB
vst1.u32 {d0}, [r0], r1
//Do the same processing for each line: add c once per row
mov r2, #7
loop_0_get_i_chroma_pred_plane:
vadd.s16 q2, q1
vqrshrun.s16 d0, q2, #5
vst1.u32 {d0}, [r0], r1
subs r2, #1
bne loop_0_get_i_chroma_pred_plane
WELS_ASM_FUNC_END
#endif

1602
codec/decoder/core/arm/mc_neon.S Executable file

File diff suppressed because it is too large Load Diff

View File

@ -50,6 +50,10 @@ extern "C" {
void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
#endif//X86_ASM
#if defined(HAVE_NEON)
// NEON version: inverse-transforms the 16 coefficients at rs, rounds with a
// shift of 6, adds the result to the 4x4 block at pred (rows stride bytes
// apart) and stores the saturated sum back into pred.
void IdctResAddPred_neon(uint8_t *pred, const int32_t stride, int16_t *rs);
#endif
#if defined(__cplusplus)
}
#endif//__cplusplus

View File

@ -68,6 +68,12 @@ void WelsChromaDcIdct (int16_t* pBlock);
extern "C" {
#endif//__cplusplus
#if defined(HAVE_NEON)
// Zero 16 rows of 16 int16_t; iStride is the row pitch in int16_t elements.
void WelsResBlockZero16x16_neon(int16_t* pBlock, int32_t iStride);
// Zero 8 rows of 8 int16_t; iStride is the row pitch in int16_t elements.
void WelsResBlockZero8x8_neon(int16_t* pBlock, int32_t iStride);
// Rewrites the first 24 bytes of pNonZeroCount in place (non-zero -> 1,
// zero -> 0). The NEON implementation does not read pBlock.
void SetNonZeroCount_neon(int16_t* pBlock, int8_t* pNonZeroCount);
#endif
#ifdef X86_ASM
void WelsResBlockZero16x16_sse2 (int16_t* pBlock, int32_t iStride);
void WelsResBlockZero8x8_sse2 (int16_t* pBlock, int32_t iStride);

View File

@ -107,6 +107,27 @@ void WelsDecoderI4x4LumaPredDDL_mmx (uint8_t* pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredVL_mmx (uint8_t* pPred, const int32_t kiStride);
#endif//X86_ASM
#if defined(HAVE_NEON)
// NEON intra predictors. In every routine pPred points at the top-left sample
// of the block to fill and kiStride is the row pitch in bytes; the neighbouring
// samples are read from the row above and/or the column to the left of pPred
// in the same buffer.

// 16x16 luma: vertical, horizontal, DC (top and left both used) and plane.
void WelsDecoderI16x16LumaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredDc_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI16x16LumaPredPlane_neon(uint8_t *pPred, const int32_t kiStride);
// 4x4 luma: vertical, horizontal and the six directional modes.
// DDL and VL read 8 samples from the row above (4 beyond the block width).
void WelsDecoderI4x4LumaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredDDL_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredDDR_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredVL_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredVR_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredHU_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderI4x4LumaPredHD_neon(uint8_t *pPred, const int32_t kiStride);
// 8x8 chroma: vertical, horizontal, per-quadrant DC and plane.
void WelsDecoderIChromaPredV_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredH_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredDC_neon(uint8_t *pPred, const int32_t kiStride);
void WelsDecoderIChromaPredPlane_neon(uint8_t *pPred, const int32_t kiStride);
#endif//HAVE_NEON
#if defined(__cplusplus)
}
#endif//__cplusplus

View File

@ -39,6 +39,7 @@
*/
#include "deblocking.h"
#include "deblocking_common.h"
#include "cpu_core.h"
namespace WelsDec {
@ -718,6 +719,20 @@ void DeblockingInit (SDeblockingFunc* pFunc, int32_t iCpu) {
}
#endif
#if defined(HAVE_NEON)
if ( iCpu & WELS_CPU_NEON )
{
pFunc->pfLumaDeblockingLT4Ver = DeblockLumaLt4V_neon;
pFunc->pfLumaDeblockingEQ4Ver = DeblockLumaEq4V_neon;
pFunc->pfLumaDeblockingLT4Hor = DeblockLumaLt4H_neon;
pFunc->pfLumaDeblockingEQ4Hor = DeblockLumaEq4H_neon;
pFunc->pfChromaDeblockingLT4Ver = DeblockChromaLt4V_neon;
pFunc->pfChromaDeblockingEQ4Ver = DeblockChromaEq4V_neon;
pFunc->pfChromaDeblockingLT4Hor = DeblockChromaLt4H_neon;
pFunc->pfChromaDeblockingEQ4Hor = DeblockChromaEq4H_neon;
}
#endif
}
} // namespace WelsDec

View File

@ -1148,6 +1148,14 @@ void WelsBlockFuncInit (SBlockFunc* pFunc, int32_t iCpu) {
pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_sse2;
}
#endif
#ifdef HAVE_NEON
if ( iCpu & WELS_CPU_NEON ) {
pFunc->pWelsBlockZero16x16Func = WelsResBlockZero16x16_neon;
pFunc->pWelsBlockZero8x8Func = WelsResBlockZero8x8_neon;
pFunc->pWelsSetNonZeroCountFunc = SetNonZeroCount_neon;
}
#endif
}
void WelsBlockZero16x16_c (int16_t* pBlock, int32_t iStride) {
WelsBlockInit (pBlock, 16, 16, iStride, 0);

View File

@ -146,7 +146,14 @@ void WelsDecoderDefaults (PWelsDecoderContext pCtx) {
#if defined(X86_ASM)
pCtx->uiCpuFlag = WelsCPUFeatureDetect (&iCpuCores);
#endif//X86_ASM
#elif defined(HAVE_NEON)
#if defined(ANDROID_NDK)
pCtx->uiCpuFlag = WelsCPUFeatureDetectAndroid();
#endif
#if defined(APPLE_IOS)
pCtx->uiCpuFlag = WelsCPUFeatureDetectIOS();
#endif
#endif
pCtx->iImgWidthInPixel = 0;
pCtx->iImgHeightInPixel = 0; // alloc picture data when picture size is available
@ -656,6 +663,32 @@ void AssignFuncPointerForRec (PWelsDecoderContext pCtx) {
InitDctClipTable();
pCtx->pIdctResAddPredFunc = IdctResAddPred_c;
#if defined(HAVE_NEON)
if ( pCtx->uiCpuFlag & WELS_CPU_NEON ) {
pCtx->pIdctResAddPredFunc = IdctResAddPred_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_H] = WelsDecoderI16x16LumaPredH_neon;
pCtx->pGetI16x16LumaPredFunc[I16_PRED_V] = WelsDecoderI16x16LumaPredV_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_V ] = WelsDecoderI4x4LumaPredV_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_H ] = WelsDecoderI4x4LumaPredH_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDL ] = WelsDecoderI4x4LumaPredDDL_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_DDR ] = WelsDecoderI4x4LumaPredDDR_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_VL ] = WelsDecoderI4x4LumaPredVL_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_VR ] = WelsDecoderI4x4LumaPredVR_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_HU ] = WelsDecoderI4x4LumaPredHU_neon;
pCtx->pGetI4x4LumaPredFunc[I4_PRED_HD ] = WelsDecoderI4x4LumaPredHD_neon;
pCtx->pGetIChromaPredFunc[C_PRED_H] = WelsDecoderIChromaPredH_neon;
pCtx->pGetIChromaPredFunc[C_PRED_V] = WelsDecoderIChromaPredV_neon;
pCtx->pGetIChromaPredFunc[C_PRED_P ] = WelsDecoderIChromaPredPlane_neon;
pCtx->pGetIChromaPredFunc[C_PRED_DC] = WelsDecoderIChromaPredDC_neon;
}
#endif//HAVE_NEON
#if defined(X86_ASM)
if (pCtx->uiCpuFlag & WELS_CPU_MMXEXT) {
pCtx->pIdctResAddPredFunc = IdctResAddPred_mmx;

View File

@ -636,7 +636,7 @@ void McLuma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_
void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight) {
static const PMcChromaWidthExtFunc kpMcChromaWidthFuncs[2] = {
McChromaWidthEq4_mmx,
McChromaWidthEq4_mmx,
McChromaWidthEq8_sse2
};
const int32_t kiD8x = iMvX & 0x07;
@ -651,17 +651,336 @@ void McChroma_sse2 (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int3
McChromaWithFragMv_c (pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
}
#endif //X86_ASM
//***************************************************************************//
// NEON implementation //
//***************************************************************************//
#if defined(HAVE_NEON)
// Plain block copy (no sub-pel filtering); slot [0][0] of the McLuma_neon table.
// Forwards to the width-specialised NEON copy routine. Any width other than
// 16 or 8 is handled by the 4-wide routine.
void McCopy_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                 int32_t iWidth, int32_t iHeight)
{
  switch (iWidth) {
  case 16:
    McCopyWidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  }
}
// Slot [2][0] of the McLuma_neon table (iMvX & 3 == 2, iMvY & 3 == 0).
// Forwards to the McHorVer20 NEON routine matching iWidth (16, 8 or 4);
// any other width is a no-op.
void McHorVer20_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                     int32_t iWidth, int32_t iHeight)
{
  switch (iWidth) {
  case 16:
    McHorVer20WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer20WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer20WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    break;
  }
}
// Slot [0][2] of the McLuma_neon table (iMvX & 3 == 0, iMvY & 3 == 2).
// Forwards to the McHorVer02 NEON routine matching iWidth (16, 8 or 4);
// any other width is a no-op.
void McHorVer02_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                     int32_t iWidth, int32_t iHeight)
{
  switch (iWidth) {
  case 16:
    McHorVer02WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer02WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer02WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    break;
  }
}
// Slot [2][2] of the McLuma_neon table (iMvX & 3 == 2, iMvY & 3 == 2).
// Forwards to the McHorVer22 NEON routine matching iWidth (16, 8 or 4);
// any other width is a no-op.
void McHorVer22_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                     int32_t iWidth, int32_t iHeight)
{
  switch (iWidth) {
  case 16:
    McHorVer22WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer22WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer22WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    break;
  }
}
// Slot [0][1] of the McLuma_neon table (iMvX & 3 == 0, iMvY & 3 == 1).
// Forwards to the McHorVer01 NEON routine matching iWidth (16, 8 or 4);
// any other width is a no-op.
void McHorVer01_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                     int32_t iWidth, int32_t iHeight)
{
  switch (iWidth) {
  case 16:
    McHorVer01WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer01WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer01WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    break;
  }
}
// Slot [0][3] of the McLuma_neon table (iMvX & 3 == 0, iMvY & 3 == 3).
// Forwards to the McHorVer03 NEON routine matching iWidth (16, 8 or 4);
// any other width is a no-op.
void McHorVer03_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                     int32_t iWidth, int32_t iHeight)
{
  switch (iWidth) {
  case 16:
    McHorVer03WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer03WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer03WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    break;
  }
}
// Slot [1][0] of the McLuma_neon table (iMvX & 3 == 1, iMvY & 3 == 0).
// Forwards to the McHorVer10 NEON routine matching iWidth (16, 8 or 4);
// any other width is a no-op.
void McHorVer10_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                     int32_t iWidth, int32_t iHeight)
{
  switch (iWidth) {
  case 16:
    McHorVer10WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer10WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer10WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    break;
  }
}
void McHorVer11_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McHorVer12_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
}
void McHorVer13_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McHorVer21_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
}
void McHorVer23_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pCtrTmp, iHeight);
}
}
// Slot [3][0] of the McLuma_neon table (iMvX & 3 == 3, iMvY & 3 == 0).
// Forwards to the McHorVer30 NEON routine matching iWidth (16, 8 or 4);
// any other width is a no-op.
void McHorVer30_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
                     int32_t iWidth, int32_t iHeight)
{
  switch (iWidth) {
  case 16:
    McHorVer30WidthEq16_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 8:
    McHorVer30WidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  case 4:
    McHorVer30WidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
    break;
  default:
    break;
  }
}
void McHorVer31_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16) {
McHorVer20WidthEq16_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8){
McHorVer20WidthEq8_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McHorVer32_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pCtrTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq16_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq8_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
McHorVer22WidthEq4_neon(pSrc, iSrcStride, pCtrTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pVerTmp, pCtrTmp, iHeight);
}
}
void McHorVer33_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int32_t iWidth, int32_t iHeight)
{
ENFORCE_STACK_ALIGN_1D( uint8_t, pHorTmp, 256, 16 );
ENFORCE_STACK_ALIGN_1D( uint8_t, pVerTmp, 256, 16 );
if (iWidth == 16)
{
McHorVer20WidthEq16_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq16_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq16_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 8)
{
McHorVer20WidthEq8_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq8_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq8_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
else if (iWidth == 4)
{
McHorVer20WidthEq4_neon(pSrc+iSrcStride, iSrcStride, pHorTmp, 16, iHeight);
McHorVer02WidthEq4_neon(pSrc+1, iSrcStride, pVerTmp, 16, iHeight);
PixelAvgWidthEq4_neon(pDst, iDstStride, pHorTmp, pVerTmp, iHeight);
}
}
void McLuma_neon(const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride,
int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
{
static PWelsMcWidthHeightFunc pWelsMcFunc[4][4] = //[x][y]
{
{McCopy_neon, McHorVer01_neon, McHorVer02_neon, McHorVer03_neon},
{McHorVer10_neon, McHorVer11_neon, McHorVer12_neon, McHorVer13_neon},
{McHorVer20_neon, McHorVer21_neon, McHorVer22_neon, McHorVer23_neon},
{McHorVer30_neon, McHorVer31_neon, McHorVer32_neon, McHorVer33_neon},
};
// pSrc += (iMvY >> 2) * iSrcStride + (iMvX >> 2);
pWelsMcFunc[iMvX&0x03][iMvY&0x03](pSrc, iSrcStride, pDst, iDstStride, iWidth, iHeight);
}
// NEON chroma motion-compensation entry point.
// A zero motion vector is a plain copy; any other vector goes through the
// chroma routines, which receive the g_kuiABCD entry selected by the low three
// bits of each component. Widths 8 and 4 use NEON routines; every other width
// (expected to be 2) falls back to the C implementations.
void McChroma_neon(const uint8_t *pSrc, int32_t iSrcStride, uint8_t *pDst, int32_t iDstStride,
                   int16_t iMvX, int16_t iMvY, int32_t iWidth, int32_t iHeight)
{
  // Zero vector: straight copy, then done.
  if (0 == iMvX && 0 == iMvY) {
    switch (iWidth) {
    case 8:
      McCopyWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
      break;
    case 4:
      McCopyWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, iHeight);
      break;
    default: // here iWidth == 2
      McCopyWidthEq2_c(pSrc,iSrcStride,pDst,iDstStride,iHeight);
      break;
    }
    return;
  }

  const int32_t kiFracX = iMvX & 0x07;
  const int32_t kiFracY = iMvY & 0x07;

  switch (iWidth) {
  case 8:
    McChromaWidthEq8_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiFracY][kiFracX]), iHeight);
    break;
  case 4:
    McChromaWidthEq4_neon(pSrc, iSrcStride, pDst, iDstStride, (int32_t*)(g_kuiABCD[kiFracY][kiFracX]), iHeight);
    break;
  default: // here iWidth == 2
    McChromaWithFragMv_c(pSrc, iSrcStride, pDst, iDstStride, iMvX, iMvY, iWidth, iHeight);
    break;
  }
}
#endif
// Fills the motion-compensation function table.
//   pMcFunc : table to populate (pMcLumaFunc and pMcChromaFunc are written)
//   iCpu    : CPU feature bit mask tested against WELS_CPU_NEON / WELS_CPU_SSE2
// The portable C routines are installed first; each architecture section then
// overrides them when it is compiled in and its feature bit is set in iCpu.
void InitMcFunc (SMcFunc* pMcFunc, int32_t iCpu) {
  pMcFunc->pMcLumaFunc   = McLuma_c;
  pMcFunc->pMcChromaFunc = McChroma_c;

#ifdef HAVE_NEON
  if (iCpu & WELS_CPU_NEON) {
    pMcFunc->pMcLumaFunc   = McLuma_neon;
    pMcFunc->pMcChromaFunc = McChroma_neon;
  }
#endif

#if defined (X86_ASM)
  if (iCpu & WELS_CPU_SSE2) {
    // Each pointer is assigned once; the repeated identical stores were redundant.
    pMcFunc->pMcLumaFunc   = McLuma_sse2;
    pMcFunc->pMcChromaFunc = McChroma_sse2;
  }
#endif //(X86_ASM)
}