From 039a54780478b233626007993b49d43b51709ff1 Mon Sep 17 00:00:00 2001 From: Licai Guo Date: Sat, 19 Apr 2014 00:33:23 -0700 Subject: [PATCH 1/2] give accurate align information for mc copy functions this can improve the performance for target like javascript --- codec/common/inc/ls_defines.h | 37 +++++++++++++++++++++++++++++++++++ codec/decoder/core/src/mc.cpp | 12 ++++++------ 2 files changed, 43 insertions(+), 6 deletions(-) diff --git a/codec/common/inc/ls_defines.h b/codec/common/inc/ls_defines.h index 480d745b..dacd0eba 100644 --- a/codec/common/inc/ls_defines.h +++ b/codec/common/inc/ls_defines.h @@ -51,11 +51,36 @@ struct tagUnaligned_16 { #define LD16(a) (((struct tagUnaligned_16 *) (a))->l) #define LD32(a) (((struct tagUnaligned_32 *) (a))->l) #define LD64(a) (((struct tagUnaligned_64 *) (a))->l) + +#define STRUCTA(size, align) struct tagUnaligned_##size##_##align {\ + uint##size##_t l; \ +} __attribute__ ((aligned(align))) +STRUCTA(16,2); +STRUCTA(32,2); +STRUCTA(32,4); +STRUCTA(64,2); +STRUCTA(64,4); +STRUCTA(64,8); //#define _USE_STRUCT_INT_CVT // #ifdef _USE_STRUCT_INT_CVT #define ST16(a, b) (((struct tagUnaligned_16 *) (a))->l) = (b) #define ST32(a, b) (((struct tagUnaligned_32 *) (a))->l) = (b) #define ST64(a, b) (((struct tagUnaligned_64 *) (a))->l) = (b) + +#define LDA(a, size, align) (((struct tagUnaligned_##size##_##align *) (a))->l) +#define STA(a, b, size, align) (((struct tagUnaligned_##size##_##align *) (a))->l) = (b) +#define LD16A2(a) LDA(a, 16, 2) +#define LD32A2(a) LDA(a, 32, 2) +#define LD32A4(a) LDA(a, 32, 4) +#define LD64A2(a) LDA(a, 64, 2) +#define LD64A4(a) LDA(a, 64, 4) +#define LD64A8(a) LDA(a, 64, 8) +#define ST16A2(a, b) STA(a, b, 16, 2) +#define ST32A2(a, b) STA(a, b, 32, 2) +#define ST32A4(a, b) STA(a, b, 32, 4) +#define ST64A2(a, b) STA(a, b, 64, 2) +#define ST64A4(a, b) STA(a, b, 64, 4) +#define ST64A8(a, b) STA(a, b, 64, 8) // #else // inline void __ST16(void *dst, uint16_t v) { memcpy(dst, &v, 2); } // inline void __ST32(void *dst, uint32_t v) { memcpy(dst, &v, 4); } @@ -75,6 +100,18 @@ struct tagUnaligned_16 { #define ST16(a, b) *((uint16_t*)(a)) = (b) #define ST32(a, b) *((uint32_t*)(a)) = (b) #define ST64(a, b) *((uint64_t*)(a)) = (b) +#define LD16A2 LD16 +#define LD32A2 LD32 +#define LD32A4 LD32 +#define LD64A2 LD64 +#define LD64A4 LD64 +#define LD64A8 LD64 +#define ST16A2 ST16 +#define ST32A2 ST32 +#define ST32A4 ST32 +#define ST64A2 ST64 +#define ST64A4 ST64 +#define ST64A8 ST64 #endif /* !__GNUC__ */ diff --git a/codec/decoder/core/src/mc.cpp b/codec/decoder/core/src/mc.cpp index 80ff5b4d..defb2ebd 100644 --- a/codec/decoder/core/src/mc.cpp +++ b/codec/decoder/core/src/mc.cpp @@ -94,7 +94,7 @@ static inline void McCopyWidthEq2_c (const uint8_t* pSrc, int32_t iSrcStride, ui int32_t iHeight) { int32_t i; for (i = 0; i < iHeight; i++) { // iWidth == 2 only for chroma - ST16 (pDst, LD16 (pSrc)); + ST16A2 (pDst, LD16 (pSrc)); pDst += iDstStride; pSrc += iSrcStride; } @@ -104,7 +104,7 @@ static inline void McCopyWidthEq4_c (const uint8_t* pSrc, int32_t iSrcStride, ui int32_t iHeight) { int32_t i; for (i = 0; i < iHeight; i++) { - ST32 (pDst, LD32 (pSrc)); + ST32A4 (pDst, LD32 (pSrc)); pDst += iDstStride; pSrc += iSrcStride; } @@ -114,7 +114,7 @@ static inline void McCopyWidthEq8_c (const uint8_t* pSrc, int32_t iSrcStride, ui int32_t iHeight) { int32_t i; for (i = 0; i < iHeight; i++) { - ST64 (pDst, LD64 (pSrc)); + ST64A8 (pDst, LD64 (pSrc)); pDst += iDstStride; pSrc += iSrcStride; } @@ -124,8 +124,8 @@ static inline void McCopyWidthEq16_c (const uint8_t* pSrc, int32_t iSrcStride, u int32_t iHeight) { int32_t i; for (i = 0; i < iHeight; i++) { - ST64 (pDst , LD64 (pSrc)); - ST64 (pDst + 8, LD64 (pSrc + 8)); + ST64A8 (pDst , LD64 (pSrc)); + ST64A8 (pDst + 8, LD64 (pSrc + 8)); pDst += iDstStride; pSrc += iSrcStride; } @@ -202,7 +202,7 @@ static inline void McHorVer02_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_ static inline void McHorVer22_c (const uint8_t* pSrc, int32_t iSrcStride, uint8_t* pDst, int32_t iDstStride, int32_t iWidth, int32_t iHeight) { - int16_t iTmp[16 + 5] = {0}; //16 + int16_t iTmp[16 + 5]; //16 int32_t i, j, k; for (i = 0; i < iHeight; i++) { From 5ba3ead01527e5f932c82252540e495029468833 Mon Sep 17 00:00:00 2001 From: Licai Guo Date: Sat, 19 Apr 2014 00:56:00 -0700 Subject: [PATCH 2/2] specify accurate align information for intrapred c functions --- .../decoder/core/src/get_intra_predictor.cpp | 190 +++++++++--------- 1 file changed, 95 insertions(+), 95 deletions(-) diff --git a/codec/decoder/core/src/get_intra_predictor.cpp b/codec/decoder/core/src/get_intra_predictor.cpp index 76b17815..5dec7c6f 100644 --- a/codec/decoder/core/src/get_intra_predictor.cpp +++ b/codec/decoder/core/src/get_intra_predictor.cpp @@ -52,12 +52,12 @@ namespace WelsDec { #define I16x16_COUNT 16 void WelsI4x4LumaPredV_c (uint8_t* pPred, const int32_t kiStride) { - const uint32_t kuiVal = LD32 (pPred - kiStride); + const uint32_t kuiVal = LD32A4 (pPred - kiStride); - ST32 (pPred , kuiVal); - ST32 (pPred + kiStride , kuiVal); - ST32 (pPred + (kiStride << 1) , kuiVal); - ST32 (pPred + (kiStride << 1) + kiStride , kuiVal); + ST32A4 (pPred , kuiVal); + ST32A4 (pPred + kiStride , kuiVal); + ST32A4 (pPred + (kiStride << 1) , kuiVal); + ST32A4 (pPred + (kiStride << 1) + kiStride , kuiVal); } void WelsI4x4LumaPredH_c (uint8_t* pPred, const int32_t kiStride) { @@ -68,10 +68,10 @@ void WelsI4x4LumaPredH_c (uint8_t* pPred, const int32_t kiStride) { const uint32_t kuiL2 = 0x01010101U * pPred[-1 + kiStride2]; const uint32_t kuiL3 = 0x01010101U * pPred[-1 + kiStride3]; - ST32 (pPred , kuiL0); - ST32 (pPred + kiStride , kuiL1); - ST32 (pPred + kiStride2, kuiL2); - ST32 (pPred + kiStride3, kuiL3); + ST32A4 (pPred , kuiL0); + ST32A4 (pPred + kiStride , kuiL1); + ST32A4 (pPred + kiStride2, kuiL2); + ST32A4 (pPred + kiStride3, kuiL3); } void WelsI4x4LumaPredDc_c (uint8_t* pPred, const int32_t kiStride) { @@ -81,10 +81,10 @@ void WelsI4x4LumaPredDc_c (uint8_t* pPred, const int32_t kiStride) { pPred[-kiStride] + pPred[-kiStride + 1] + pPred[-kiStride + 2] + pPred[-kiStride + 3] + 4) >> 3; const uint32_t kuiMean32 = 0x01010101U * kuiMean; - ST32 (pPred , kuiMean32); - ST32 (pPred + kiStride , kuiMean32); - ST32 (pPred + kiStride2, kuiMean32); - ST32 (pPred + kiStride3, kuiMean32); + ST32A4 (pPred , kuiMean32); + ST32A4 (pPred + kiStride , kuiMean32); + ST32A4 (pPred + kiStride2, kuiMean32); + ST32A4 (pPred + kiStride3, kuiMean32); } void WelsI4x4LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) { @@ -93,10 +93,10 @@ void WelsI4x4LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiMean = (pPred[-1] + pPred[-1 + kiStride] + pPred[-1 + kiStride2] + pPred[-1 + kiStride3] + 2) >> 2; const uint32_t kuiMean32 = 0x01010101U * kuiMean; - ST32 (pPred , kuiMean32); - ST32 (pPred + kiStride , kuiMean32); - ST32 (pPred + kiStride2, kuiMean32); - ST32 (pPred + kiStride3, kuiMean32); + ST32A4 (pPred , kuiMean32); + ST32A4 (pPred + kiStride , kuiMean32); + ST32A4 (pPred + kiStride2, kuiMean32); + ST32A4 (pPred + kiStride3, kuiMean32); } void WelsI4x4LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) { @@ -106,19 +106,19 @@ void WelsI4x4LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) { 2; const uint32_t kuiMean32 = 0x01010101U * kuiMean; - ST32 (pPred , kuiMean32); - ST32 (pPred + kiStride , kuiMean32); - ST32 (pPred + kiStride2, kuiMean32); - ST32 (pPred + kiStride3, kuiMean32); + ST32A4 (pPred , kuiMean32); + ST32A4 (pPred + kiStride , kuiMean32); + ST32A4 (pPred + kiStride2, kuiMean32); + ST32A4 (pPred + kiStride3, kuiMean32); } void WelsI4x4LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) { const uint32_t kuiDC32 = 0x80808080U; - ST32 (pPred , kuiDC32); - ST32 (pPred + kiStride , kuiDC32); - ST32 (pPred + (kiStride << 1) , kuiDC32); - ST32 (pPred + (kiStride << 1) + kiStride, kuiDC32); + ST32A4 (pPred , kuiDC32); + ST32A4 (pPred + kiStride , kuiDC32); + ST32A4 (pPred + (kiStride << 1) , kuiDC32); + ST32A4 (pPred + (kiStride << 1) + kiStride, kuiDC32); } /*down pLeft*/ @@ -144,10 +144,10 @@ void WelsI4x4LumaPredDDL_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiDDL6 = (2 + kuiT6 + kuiT7 + (kuiT7 << 1)) >> 2; // kDDL6 const uint8_t kuiList[8] = { kuiDDL0, kuiDDL1, kuiDDL2, kuiDDL3, kuiDDL4, kuiDDL5, kuiDDL6, 0 }; - ST32 (pPred , LD32 (kuiList)); - ST32 (pPred + kiStride , LD32 (kuiList + 1)); - ST32 (pPred + kiStride2, LD32 (kuiList + 2)); - ST32 (pPred + kiStride3, LD32 (kuiList + 3)); + ST32A4 (pPred , LD32 (kuiList)); + ST32A4 (pPred + kiStride , LD32 (kuiList + 1)); + ST32A4 (pPred + kiStride2, LD32 (kuiList + 2)); + ST32A4 (pPred + kiStride3, LD32 (kuiList + 3)); } /*down pLeft*/ @@ -170,10 +170,10 @@ void WelsI4x4LumaPredDDLTop_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiDLT3 = kuiT33 >> 1; // kDLT3 const uint8_t kuiList[8] = { kuiDLT0, kuiDLT1, kuiDLT2, kuiDLT3, kuiDLT3, kuiDLT3, kuiDLT3 , kuiDLT3 }; - ST32 (pPred, LD32 (kuiList)); - ST32 (pPred + kiStride, LD32 (kuiList + 1)); - ST32 (pPred + kiStride2, LD32 (kuiList + 2)); - ST32 (pPred + kiStride3, LD32 (kuiList + 3)); + ST32A4 (pPred, LD32 (kuiList)); + ST32A4 (pPred + kiStride, LD32 (kuiList + 1)); + ST32A4 (pPred + kiStride2, LD32 (kuiList + 2)); + ST32A4 (pPred + kiStride3, LD32 (kuiList + 3)); } @@ -210,10 +210,10 @@ void WelsI4x4LumaPredDDR_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiDDR6 = (kuiL12 + kuiL23) >> 2; // kuiDDR6 const uint8_t kuiList[8] = { kuiDDR6, kuiDDR5, kuiDDR4, kuiDDR0, kuiDDR1, kuiDDR2, kuiDDR3, 0 }; - ST32 (pPred , LD32 (kuiList + 3)); - ST32 (pPred + kiStride , LD32 (kuiList + 2)); - ST32 (pPred + kiStride2, LD32 (kuiList + 1)); - ST32 (pPred + kiStride3, LD32 (kuiList)); + ST32A4 (pPred , LD32 (kuiList + 3)); + ST32A4 (pPred + kiStride , LD32 (kuiList + 2)); + ST32A4 (pPred + kiStride2, LD32 (kuiList + 1)); + ST32A4 (pPred + kiStride3, LD32 (kuiList)); } @@ -248,10 +248,10 @@ void WelsI4x4LumaPredVL_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiVL9 = (kuiT45 + kuiT56) >> 2; // kuiVL9 const uint8_t kuiList[10] = { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL8, kuiVL9 }; - ST32 (pPred, LD32 (kuiList)); - ST32 (pPred + kiStride, LD32 (kuiList + 5)); - ST32 (pPred + kiStride2, LD32 (kuiList + 1)); - ST32 (pPred + kiStride3, LD32 (kuiList + 6)); + ST32A4 (pPred, LD32 (kuiList)); + ST32A4 (pPred + kiStride, LD32 (kuiList + 5)); + ST32A4 (pPred + kiStride2, LD32 (kuiList + 1)); + ST32A4 (pPred + kiStride3, LD32 (kuiList + 6)); } /*vertical pLeft*/ @@ -278,10 +278,10 @@ void WelsI4x4LumaPredVLTop_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiVL7 = kuiVL3; const uint8_t kuiList[10] = { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL7 }; - ST32 (pPred , LD32 (kuiList)); - ST32 (pPred + kiStride , LD32 (kuiList + 5)); - ST32 (pPred + kiStride2, LD32 (kuiList + 1)); - ST32 (pPred + kiStride3, LD32 (kuiList + 6)); + ST32A4 (pPred , LD32 (kuiList)); + ST32A4 (pPred + kiStride , LD32 (kuiList + 5)); + ST32A4 (pPred + kiStride2, LD32 (kuiList + 1)); + ST32A4 (pPred + kiStride3, LD32 (kuiList + 6)); } @@ -310,10 +310,10 @@ void WelsI4x4LumaPredVR_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiVR9 = (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2; // kuiVR9 const uint8_t kuiList[10] = { kuiVR8, kuiVR0, kuiVR1, kuiVR2, kuiVR3, kuiVR9, kuiVR4, kuiVR5, kuiVR6, kuiVR7 }; - ST32 (pPred , LD32 (kuiList + 1)); - ST32 (pPred + kiStride , LD32 (kuiList + 6)); - ST32 (pPred + kiStride2, LD32 (kuiList)); - ST32 (pPred + kiStride3, LD32 (kuiList + 5)); + ST32A4 (pPred , LD32 (kuiList + 1)); + ST32A4 (pPred + kiStride , LD32 (kuiList + 6)); + ST32A4 (pPred + kiStride2, LD32 (kuiList)); + ST32A4 (pPred + kiStride3, LD32 (kuiList + 5)); } /*horizontal up*/ @@ -336,10 +336,10 @@ void WelsI4x4LumaPredHU_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiHU5 = (1 + kuiL23 + (kuiL3 << 1)) >> 2; const uint8_t kuiList[10] = { kuiHU0, kuiHU1, kuiHU2, kuiHU3, kuiHU4, kuiHU5, kuiL3, kuiL3, kuiL3, kuiL3 }; - ST32 (pPred , LD32 (kuiList)); - ST32 (pPred + kiStride , LD32 (kuiList + 2)); - ST32 (pPred + kiStride2, LD32 (kuiList + 4)); - ST32 (pPred + kiStride3, LD32 (kuiList + 6)); + ST32A4 (pPred , LD32 (kuiList)); + ST32A4 (pPred + kiStride , LD32 (kuiList + 2)); + ST32A4 (pPred + kiStride2, LD32 (kuiList + 4)); + ST32A4 (pPred + kiStride3, LD32 (kuiList + 6)); } /*horizontal down*/ @@ -374,25 +374,25 @@ void WelsI4x4LumaPredHD_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiHD9 = (kuiL12 + kuiL23) >> 2; const uint8_t kuiList[10] = { kuiHD8, kuiHD9, kuiHD6, kuiHD7, kuiHD4, kuiHD5, kuiHD0, kuiHD1, kuiHD2, kuiHD3 }; - ST32 (pPred , LD32 (kuiList + 6)); - ST32 (pPred + kiStride , LD32 (kuiList + 4)); - ST32 (pPred + kiStride2, LD32 (kuiList + 2)); - ST32 (pPred + kiStride3, LD32 (kuiList)); + ST32A4 (pPred , LD32 (kuiList + 6)); + ST32A4 (pPred + kiStride , LD32 (kuiList + 4)); + ST32A4 (pPred + kiStride2, LD32 (kuiList + 2)); + ST32A4 (pPred + kiStride3, LD32 (kuiList)); } void WelsIChromaPredV_c (uint8_t* pPred, const int32_t kiStride) { - const uint64_t kuiVal64 = LD64 (&pPred[-kiStride]); + const uint64_t kuiVal64 = LD64A8 (&pPred[-kiStride]); const int32_t kiStride2 = kiStride << 1; const int32_t kiStride4 = kiStride2 << 1; - ST64 (pPred , kuiVal64); - ST64 (pPred + kiStride , kuiVal64); - ST64 (pPred + kiStride2 , kuiVal64); - ST64 (pPred + kiStride2 + kiStride , kuiVal64); - ST64 (pPred + kiStride4 , kuiVal64); - ST64 (pPred + kiStride4 + kiStride , kuiVal64); - ST64 (pPred + kiStride4 + kiStride2 , kuiVal64); - ST64 (pPred + (kiStride << 3) - kiStride , kuiVal64); + ST64A8 (pPred , kuiVal64); + ST64A8 (pPred + kiStride , kuiVal64); + ST64A8 (pPred + kiStride2 , kuiVal64); + ST64A8 (pPred + kiStride2 + kiStride , kuiVal64); + ST64A8 (pPred + kiStride4 , kuiVal64); + ST64A8 (pPred + kiStride4 + kiStride , kuiVal64); + ST64A8 (pPred + kiStride4 + kiStride2 , kuiVal64); + ST64A8 (pPred + (kiStride << 3) - kiStride , kuiVal64); } void WelsIChromaPredH_c (uint8_t* pPred, const int32_t kiStride) { @@ -403,7 +403,7 @@ void WelsIChromaPredH_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiVal8 = pPred[iTmp - 1]; const uint64_t kuiVal64 = 0x0101010101010101ULL * kuiVal8; - ST64 (pPred + iTmp, kuiVal64); + ST64A8 (pPred + iTmp, kuiVal64); iTmp -= kiStride; } while (i-- > 0); @@ -457,14 +457,14 @@ void WelsIChromaPredDc_c (uint8_t* pPred, const int32_t kiStride) { const uint64_t kuiUP64 = LD64 (kuiMUP); const uint64_t kuiDN64 = LD64 (kuiMDown); - ST64 (pPred , kuiUP64); - ST64 (pPred + kiL1 + 1, kuiUP64); - ST64 (pPred + kiL2 + 1, kuiUP64); - ST64 (pPred + kiL3 + 1, kuiUP64); - ST64 (pPred + kiL4 + 1, kuiDN64); - ST64 (pPred + kiL5 + 1, kuiDN64); - ST64 (pPred + kiL6 + 1, kuiDN64); - ST64 (pPred + kiL7 + 1, kuiDN64); + ST64A8 (pPred , kuiUP64); + ST64A8 (pPred + kiL1 + 1, kuiUP64); + ST64A8 (pPred + kiL2 + 1, kuiUP64); + ST64A8 (pPred + kiL3 + 1, kuiUP64); + ST64A8 (pPred + kiL4 + 1, kuiDN64); + ST64A8 (pPred + kiL5 + 1, kuiDN64); + ST64A8 (pPred + kiL6 + 1, kuiDN64); + ST64A8 (pPred + kiL7 + 1, kuiDN64); } void WelsIChromaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) { @@ -481,14 +481,14 @@ void WelsIChromaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) { const uint64_t kuiUP64 = 0x0101010101010101ULL * kuiMUP; const uint64_t kuiDN64 = 0x0101010101010101ULL * kuiMDown; - ST64 (pPred , kuiUP64); - ST64 (pPred + kiL1 + 1, kuiUP64); - ST64 (pPred + kiL2 + 1, kuiUP64); - ST64 (pPred + kiL3 + 1, kuiUP64); - ST64 (pPred + kiL4 + 1, kuiDN64); - ST64 (pPred + kiL5 + 1, kuiDN64); - ST64 (pPred + kiL6 + 1, kuiDN64); - ST64 (pPred + kiL7 + 1, kuiDN64); + ST64A8 (pPred , kuiUP64); + ST64A8 (pPred + kiL1 + 1, kuiUP64); + ST64A8 (pPred + kiL2 + 1, kuiUP64); + ST64A8 (pPred + kiL3 + 1, kuiUP64); + ST64A8 (pPred + kiL4 + 1, kuiDN64); + ST64A8 (pPred + kiL5 + 1, kuiDN64); + ST64A8 (pPred + kiL6 + 1, kuiDN64); + ST64A8 (pPred + kiL7 + 1, kuiDN64); } void WelsIChromaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) { @@ -502,7 +502,7 @@ void WelsIChromaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) { uint8_t i = 7; do { - ST64 (pPred + iTmp, LD64 (kuiM)); + ST64A8 (pPred + iTmp, LD64 (kuiM)); iTmp -= kiStride; } while (i-- > 0); @@ -514,7 +514,7 @@ void WelsIChromaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) { uint8_t i = 7; do { - ST64 (pPred + iTmp, kuiDC64); + ST64A8 (pPred + iTmp, kuiDC64); iTmp -= kiStride; } while (i-- > 0); @@ -522,13 +522,13 @@ void WelsIChromaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) { void WelsI16x16LumaPredV_c (uint8_t* pPred, const int32_t kiStride) { int32_t iTmp = (kiStride << 4) - kiStride; - const uint64_t kuiTop1 = LD64 (pPred - kiStride); - const uint64_t kuiTop2 = LD64 (pPred - kiStride + 8); + const uint64_t kuiTop1 = LD64A8 (pPred - kiStride); + const uint64_t kuiTop2 = LD64A8 (pPred - kiStride + 8); uint8_t i = 15; do { - ST64 (pPred + iTmp , kuiTop1); - ST64 (pPred + iTmp + 8, kuiTop2); + ST64A8 (pPred + iTmp , kuiTop1); + ST64A8 (pPred + iTmp + 8, kuiTop2); iTmp -= kiStride; } while (i-- > 0); @@ -542,8 +542,8 @@ void WelsI16x16LumaPredH_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiVal8 = pPred[iTmp - 1]; const uint64_t kuiVal64 = 0x0101010101010101ULL * kuiVal8; - ST64 (pPred + iTmp , kuiVal64); - ST64 (pPred + iTmp + 8, kuiVal64); + ST64A8 (pPred + iTmp , kuiVal64); + ST64A8 (pPred + iTmp + 8, kuiVal64); iTmp -= kiStride; } while (i-- > 0); @@ -633,8 +633,8 @@ void WelsI16x16LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) { iTmp = (kiStride << 4) - kiStride; i = 15; do { - ST64 (pPred + iTmp , uiMean64); - ST64 (pPred + iTmp + 8, uiMean64); + ST64A8 (pPred + iTmp , uiMean64); + ST64A8 (pPred + iTmp + 8, uiMean64); iTmp -= kiStride; } while (i-- > 0); @@ -646,8 +646,8 @@ void WelsI16x16LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) { uint8_t i = 15; do { - ST64 (pPred + iTmp, kuiDC64); - ST64 (pPred + iTmp + 8, kuiDC64); + ST64A8 (pPred + iTmp, kuiDC64); + ST64A8 (pPred + iTmp + 8, kuiDC64); iTmp -= kiStride; } while (i-- > 0);