diff --git a/codec/common/src/copy_mb.cpp b/codec/common/src/copy_mb.cpp index 219edb50..96b2099d 100644 --- a/codec/common/src/copy_mb.cpp +++ b/codec/common/src/copy_mb.cpp @@ -51,8 +51,8 @@ void WelsCopy4x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStr const int32_t kiDstStride2 = iStrideD << 1; const int32_t kiDstStride3 = iStrideD + kiDstStride2; - ST32 (pDst, LD32 (pSrc)); - ST32 (pDst + iStrideD, LD32 (pSrc + iStrideS)); + ST32 (pDst, LD32 (pSrc)); + ST32 (pDst + iStrideD, LD32 (pSrc + iStrideS)); ST32 (pDst + kiDstStride2, LD32 (pSrc + kiSrcStride2)); ST32 (pDst + kiDstStride3, LD32 (pSrc + kiSrcStride3)); } @@ -67,10 +67,10 @@ void WelsCopy4x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStr void WelsCopy8x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) { int32_t i; for (i = 0; i < 4; i++) { - ST32 (pDst, LD32 (pSrc)); - ST32 (pDst + 4 , LD32 (pSrc + 4)); - ST32 (pDst + iStrideD, LD32 (pSrc + iStrideS)); - ST32 (pDst + iStrideD + 4 , LD32 (pSrc + iStrideS + 4)); + ST32 (pDst, LD32 (pSrc)); + ST32 (pDst + 4 , LD32 (pSrc + 4)); + ST32 (pDst + iStrideD, LD32 (pSrc + iStrideS)); + ST32 (pDst + iStrideD + 4 , LD32 (pSrc + iStrideS + 4)); pDst += iStrideD << 1; pSrc += iStrideS << 1; } @@ -78,10 +78,10 @@ void WelsCopy8x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStr void WelsCopy8x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) { int32_t i; for (i = 0; i < 8; ++i) { - ST32 (pDst, LD32 (pSrc)); - ST32 (pDst + 4 , LD32 (pSrc + 4)); - ST32 (pDst + iStrideD, LD32 (pSrc + iStrideS)); - ST32 (pDst + iStrideD + 4 , LD32 (pSrc + iStrideS + 4)); + ST32 (pDst, LD32 (pSrc)); + ST32 (pDst + 4 , LD32 (pSrc + 4)); + ST32 (pDst + iStrideD, LD32 (pSrc + iStrideS)); + ST32 (pDst + iStrideD + 4 , LD32 (pSrc + iStrideS + 4)); pDst += iStrideD << 1; pSrc += iStrideS << 1; } @@ -89,10 +89,10 @@ void WelsCopy8x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iSt void WelsCopy16x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) { int32_t i; for (i = 0; i < 8; i++) { - ST32 (pDst, LD32 (pSrc)); - ST32 (pDst + 4 , LD32 (pSrc + 4)); - ST32 (pDst + 8 , LD32 (pSrc + 8)); - ST32 (pDst + 12 , LD32 (pSrc + 12)); + ST32 (pDst, LD32 (pSrc)); + ST32 (pDst + 4 , LD32 (pSrc + 4)); + ST32 (pDst + 8 , LD32 (pSrc + 8)); + ST32 (pDst + 12 , LD32 (pSrc + 12)); pDst += iStrideD ; pSrc += iStrideS; } @@ -100,10 +100,10 @@ void WelsCopy16x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iSt void WelsCopy16x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) { int32_t i; for (i = 0; i < 16; i++) { - ST32 (pDst, LD32 (pSrc)); - ST32 (pDst + 4 , LD32 (pSrc + 4)); - ST32 (pDst + 8 , LD32 (pSrc + 8)); - ST32 (pDst + 12 , LD32 (pSrc + 12)); + ST32 (pDst, LD32 (pSrc)); + ST32 (pDst + 4 , LD32 (pSrc + 4)); + ST32 (pDst + 8 , LD32 (pSrc + 8)); + ST32 (pDst + 12 , LD32 (pSrc + 12)); pDst += iStrideD ; pSrc += iStrideS; } diff --git a/codec/decoder/core/src/get_intra_predictor.cpp b/codec/decoder/core/src/get_intra_predictor.cpp index 1e307de3..1c73c508 100644 --- a/codec/decoder/core/src/get_intra_predictor.cpp +++ b/codec/decoder/core/src/get_intra_predictor.cpp @@ -54,10 +54,10 @@ namespace WelsDec { void WelsI4x4LumaPredV_c (uint8_t* pPred, const int32_t kiStride) { const uint32_t kuiVal = LD32A4 (pPred - kiStride); - ST32A4 (pPred , kuiVal); - ST32A4 (pPred + kiStride , kuiVal); - ST32A4 (pPred + (kiStride << 1) , kuiVal); - ST32A4 (pPred + (kiStride << 1) + kiStride , kuiVal); + ST32A4 (pPred , kuiVal); + ST32A4 (pPred + kiStride , kuiVal); + ST32A4 (pPred + (kiStride << 1) , kuiVal); + ST32A4 (pPred + (kiStride << 1) + kiStride , kuiVal); } void WelsI4x4LumaPredH_c (uint8_t* pPred, const int32_t kiStride) { @@ -68,7 +68,7 @@ void WelsI4x4LumaPredH_c (uint8_t* pPred, const int32_t kiStride) { const uint32_t kuiL2 = 0x01010101U * pPred[-1 + kiStride2]; const uint32_t kuiL3 = 0x01010101U * pPred[-1 + kiStride3]; - ST32A4 (pPred , kuiL0); + ST32A4 (pPred , kuiL0); ST32A4 (pPred + kiStride , kuiL1); ST32A4 (pPred + kiStride2, kuiL2); ST32A4 (pPred + kiStride3, kuiL3); @@ -81,7 +81,7 @@ void WelsI4x4LumaPredDc_c (uint8_t* pPred, const int32_t kiStride) { pPred[-kiStride] + pPred[-kiStride + 1] + pPred[-kiStride + 2] + pPred[-kiStride + 3] + 4) >> 3; const uint32_t kuiMean32 = 0x01010101U * kuiMean; - ST32A4 (pPred , kuiMean32); + ST32A4 (pPred , kuiMean32); ST32A4 (pPred + kiStride , kuiMean32); ST32A4 (pPred + kiStride2, kuiMean32); ST32A4 (pPred + kiStride3, kuiMean32); @@ -93,7 +93,7 @@ void WelsI4x4LumaPredDcLeft_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiMean = (pPred[-1] + pPred[-1 + kiStride] + pPred[-1 + kiStride2] + pPred[-1 + kiStride3] + 2) >> 2; const uint32_t kuiMean32 = 0x01010101U * kuiMean; - ST32A4 (pPred , kuiMean32); + ST32A4 (pPred , kuiMean32); ST32A4 (pPred + kiStride , kuiMean32); ST32A4 (pPred + kiStride2, kuiMean32); ST32A4 (pPred + kiStride3, kuiMean32); @@ -106,7 +106,7 @@ void WelsI4x4LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) { 2; const uint32_t kuiMean32 = 0x01010101U * kuiMean; - ST32A4 (pPred , kuiMean32); + ST32A4 (pPred , kuiMean32); ST32A4 (pPred + kiStride , kuiMean32); ST32A4 (pPred + kiStride2, kuiMean32); ST32A4 (pPred + kiStride3, kuiMean32); @@ -115,9 +115,9 @@ void WelsI4x4LumaPredDcTop_c (uint8_t* pPred, const int32_t kiStride) { void WelsI4x4LumaPredDcNA_c (uint8_t* pPred, const int32_t kiStride) { const uint32_t kuiDC32 = 0x80808080U; - ST32A4 (pPred , kuiDC32); - ST32A4 (pPred + kiStride , kuiDC32); - ST32A4 (pPred + (kiStride << 1) , kuiDC32); + ST32A4 (pPred , kuiDC32); + ST32A4 (pPred + kiStride , kuiDC32); + ST32A4 (pPred + (kiStride << 1) , kuiDC32); ST32A4 (pPred + (kiStride << 1) + kiStride, kuiDC32); } @@ -144,7 +144,7 @@ void WelsI4x4LumaPredDDL_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiDDL6 = (2 + kuiT6 + kuiT7 + (kuiT7 << 1)) >> 2; // kDDL6 const uint8_t kuiList[8] = { kuiDDL0, kuiDDL1, kuiDDL2, kuiDDL3, kuiDDL4, kuiDDL5, kuiDDL6, 0 }; - ST32A4 (pPred , LD32 (kuiList)); + ST32A4 (pPred , LD32 (kuiList)); ST32A4 (pPred + kiStride , LD32 (kuiList + 1)); ST32A4 (pPred + kiStride2, LD32 (kuiList + 2)); ST32A4 (pPred + kiStride3, LD32 (kuiList + 3)); @@ -170,7 +170,7 @@ void WelsI4x4LumaPredDDLTop_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiDLT3 = kuiT33 >> 1; // kDLT3 const uint8_t kuiList[8] = { kuiDLT0, kuiDLT1, kuiDLT2, kuiDLT3, kuiDLT3, kuiDLT3, kuiDLT3 , kuiDLT3 }; - ST32A4 (pPred, LD32 (kuiList)); + ST32A4 (pPred, LD32 (kuiList)); ST32A4 (pPred + kiStride, LD32 (kuiList + 1)); ST32A4 (pPred + kiStride2, LD32 (kuiList + 2)); ST32A4 (pPred + kiStride3, LD32 (kuiList + 3)); @@ -210,7 +210,7 @@ void WelsI4x4LumaPredDDR_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiDDR6 = (kuiL12 + kuiL23) >> 2; // kuiDDR6 const uint8_t kuiList[8] = { kuiDDR6, kuiDDR5, kuiDDR4, kuiDDR0, kuiDDR1, kuiDDR2, kuiDDR3, 0 }; - ST32A4 (pPred , LD32 (kuiList + 3)); + ST32A4 (pPred , LD32 (kuiList + 3)); ST32A4 (pPred + kiStride , LD32 (kuiList + 2)); ST32A4 (pPred + kiStride2, LD32 (kuiList + 1)); ST32A4 (pPred + kiStride3, LD32 (kuiList)); @@ -248,7 +248,7 @@ void WelsI4x4LumaPredVL_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiVL9 = (kuiT45 + kuiT56) >> 2; // kuiVL9 const uint8_t kuiList[10] = { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL8, kuiVL9 }; - ST32A4 (pPred, LD32 (kuiList)); + ST32A4 (pPred, LD32 (kuiList)); ST32A4 (pPred + kiStride, LD32 (kuiList + 5)); ST32A4 (pPred + kiStride2, LD32 (kuiList + 1)); ST32A4 (pPred + kiStride3, LD32 (kuiList + 6)); @@ -278,7 +278,7 @@ void WelsI4x4LumaPredVLTop_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiVL7 = kuiVL3; const uint8_t kuiList[10] = { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL7 }; - ST32A4 (pPred , LD32 (kuiList)); + ST32A4 (pPred , LD32 (kuiList)); ST32A4 (pPred + kiStride , LD32 (kuiList + 5)); ST32A4 (pPred + kiStride2, LD32 (kuiList + 1)); ST32A4 (pPred + kiStride3, LD32 (kuiList + 6)); @@ -310,7 +310,7 @@ void WelsI4x4LumaPredVR_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiVR9 = (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2; // kuiVR9 const uint8_t kuiList[10] = { kuiVR8, kuiVR0, kuiVR1, kuiVR2, kuiVR3, kuiVR9, kuiVR4, kuiVR5, kuiVR6, kuiVR7 }; - ST32A4 (pPred , LD32 (kuiList + 1)); + ST32A4 (pPred , LD32 (kuiList + 1)); ST32A4 (pPred + kiStride , LD32 (kuiList + 6)); ST32A4 (pPred + kiStride2, LD32 (kuiList)); ST32A4 (pPred + kiStride3, LD32 (kuiList + 5)); @@ -336,7 +336,7 @@ void WelsI4x4LumaPredHU_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiHU5 = (1 + kuiL23 + (kuiL3 << 1)) >> 2; const uint8_t kuiList[10] = { kuiHU0, kuiHU1, kuiHU2, kuiHU3, kuiHU4, kuiHU5, kuiL3, kuiL3, kuiL3, kuiL3 }; - ST32A4 (pPred , LD32 (kuiList)); + ST32A4 (pPred , LD32 (kuiList)); ST32A4 (pPred + kiStride , LD32 (kuiList + 2)); ST32A4 (pPred + kiStride2, LD32 (kuiList + 4)); ST32A4 (pPred + kiStride3, LD32 (kuiList + 6)); @@ -374,7 +374,7 @@ void WelsI4x4LumaPredHD_c (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiHD9 = (kuiL12 + kuiL23) >> 2; const uint8_t kuiList[10] = { kuiHD8, kuiHD9, kuiHD6, kuiHD7, kuiHD4, kuiHD5, kuiHD0, kuiHD1, kuiHD2, kuiHD3 }; - ST32A4 (pPred , LD32 (kuiList + 6)); + ST32A4 (pPred , LD32 (kuiList + 6)); ST32A4 (pPred + kiStride , LD32 (kuiList + 4)); ST32A4 (pPred + kiStride2, LD32 (kuiList + 2)); ST32A4 (pPred + kiStride3, LD32 (kuiList)); diff --git a/codec/encoder/core/src/decode_mb_aux.cpp b/codec/encoder/core/src/decode_mb_aux.cpp index abcb2642..a03f2a6c 100644 --- a/codec/encoder/core/src/decode_mb_aux.cpp +++ b/codec/encoder/core/src/decode_mb_aux.cpp @@ -189,8 +189,8 @@ void WelsIDctT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iP const int32_t kiVerDelR = (iTemp[4 + i] >> 1) - iTemp[12 + i]; const int32_t kiVerSumR = iTemp[4 + i] + (iTemp[12 + i] >> 1); - pRec[i ] = WelsClip1 (pPred[i ] + ((kiVerSumL + kiVerSumR + 32) >> 6)); - pRec[iStride + i ] = WelsClip1 (pPred[iPredStride + i ] + ((kiVerDelL + kiVerDelR + 32) >> 6)); + pRec[i ] = WelsClip1 (pPred[i ] + ((kiVerSumL + kiVerSumR + 32) >> 6)); + pRec[iStride + i ] = WelsClip1 (pPred[iPredStride + i ] + ((kiVerDelL + kiVerDelR + 32) >> 6)); pRec[iDstStridex2 + i] = WelsClip1 (pPred[iPredStridex2 + i] + ((kiVerDelL - kiVerDelR + 32) >> 6)); pRec[iDstStridex3 + i] = WelsClip1 (pPred[iPredStridex3 + i] + ((kiVerSumL - kiVerSumR + 32) >> 6)); } @@ -199,10 +199,10 @@ void WelsIDctT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iP void WelsIDctFourT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct) { int32_t iDstStridex4 = iStride << 2; int32_t iPredStridex4 = iPredStride << 2; - WelsIDctT4Rec_c (pRec, iStride, pPred, iPredStride, pDct); - WelsIDctT4Rec_c (&pRec[4], iStride, &pPred[4], iPredStride, pDct + 16); - WelsIDctT4Rec_c (&pRec[iDstStridex4 ], iStride, &pPred[iPredStridex4 ], iPredStride, pDct + 32); - WelsIDctT4Rec_c (&pRec[iDstStridex4 + 4], iStride, &pPred[iPredStridex4 + 4], iPredStride, pDct + 48); + WelsIDctT4Rec_c (pRec, iStride, pPred, iPredStride, pDct); + WelsIDctT4Rec_c (&pRec[4], iStride, &pPred[4], iPredStride, pDct + 16); + WelsIDctT4Rec_c (&pRec[iDstStridex4 ], iStride, &pPred[iPredStridex4 ], iPredStride, pDct + 32); + WelsIDctT4Rec_c (&pRec[iDstStridex4 + 4], iStride, &pPred[iPredStridex4 + 4], iPredStride, pDct + 48); } diff --git a/codec/encoder/core/src/md.cpp b/codec/encoder/core/src/md.cpp index bda32809..ca7cf004 100644 --- a/codec/encoder/core/src/md.cpp +++ b/codec/encoder/core/src/md.cpp @@ -446,15 +446,11 @@ int32_t AnalysisVaaInfoIntra_c (uint8_t* pDataY, const int32_t kiLineSize) { for (; j < 16; j += 4) { num = 0; for (i = 0; i < 16; i += 4, num ++) { - pBlock[num] = pEncData[i ] + pEncData[i + 1 ] + pEncData[i + 2 ] + pEncData[i + - 3 ]; - pBlock[num] += pEncData[i + kiLineSize ] + pEncData[i + kiLineSize + 1 ] + pEncData[i + kiLineSize + 2 ] + pEncData[i + - kiLineSize + 3 ]; - pBlock[num] += pEncData[i + kiLineSize2] + pEncData[i + kiLineSize2 + 1] + pEncData[i + kiLineSize2 + 2] + pEncData[i + - kiLineSize2 + 3]; - pBlock[num] += pEncData[i + kiLineSize3] + pEncData[i + kiLineSize3 + 1] + pEncData[i + kiLineSize3 + 2] + pEncData[i + - kiLineSize3 + 3]; - pBlock[num] >>= 4; + pBlock[num] = pEncData[i ] + pEncData[i + 1 ] + pEncData[i + 2 ] + pEncData[i + 3 ]; + pBlock[num] += pEncData[i + kiLineSize ] + pEncData[i + kiLineSize + 1] + pEncData[i + kiLineSize + 2] + pEncData[i + kiLineSize + 3]; + pBlock[num] += pEncData[i + kiLineSize2] + pEncData[i + kiLineSize2 + 1] + pEncData[i + kiLineSize2 + 2] + pEncData[i + kiLineSize2 + 3]; + pBlock[num] += pEncData[i + kiLineSize3] + pEncData[i + kiLineSize3 + 1] + pEncData[i + kiLineSize3 + 2] + pEncData[i + kiLineSize3 + 3]; + pBlock[num] >>= 4; } pBlock += 4; pEncData += kiLineSize4; diff --git a/codec/encoder/core/src/svc_encode_mb.cpp b/codec/encoder/core/src/svc_encode_mb.cpp index 86689ce3..39e2d7b7 100644 --- a/codec/encoder/core/src/svc_encode_mb.cpp +++ b/codec/encoder/core/src/svc_encode_mb.cpp @@ -45,10 +45,10 @@ namespace WelsEnc { void WelsDctMb (int16_t* pRes, uint8_t* pEncMb, int32_t iEncStride, uint8_t* pBestPred, PDctFunc pfDctFourT4) { - pfDctFourT4 (pRes, pEncMb, iEncStride, pBestPred, 16); - pfDctFourT4 (pRes + 64, pEncMb + 8, iEncStride, pBestPred + 8, 16); - pfDctFourT4 (pRes + 128, pEncMb + 8 * iEncStride, iEncStride, pBestPred + 128, 16); - pfDctFourT4 (pRes + 192, pEncMb + 8 * iEncStride + 8, iEncStride, pBestPred + 136, 16); + pfDctFourT4 (pRes, pEncMb, iEncStride, pBestPred, 16); + pfDctFourT4 (pRes + 64, pEncMb + 8, iEncStride, pBestPred + 8, 16); + pfDctFourT4 (pRes + 128, pEncMb + 8 * iEncStride, iEncStride, pBestPred + 128, 16); + pfDctFourT4 (pRes + 192, pEncMb + 8 * iEncStride + 8, iEncStride, pBestPred + 136, 16); } void WelsEncRecI16x16Y (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache) { @@ -77,7 +77,7 @@ void WelsEncRecI16x16Y (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache) { for (i = 0; i < 4; i++) { pFuncList->pfQuantizationFour4x4 (pRes, pFF, pMF); - pFuncList->pfScan4x4Ac (pBlock, pRes); + pFuncList->pfScan4x4Ac (pBlock, pRes); pFuncList->pfScan4x4Ac (pBlock + 16, pRes + 16); pFuncList->pfScan4x4Ac (pBlock + 32, pRes + 32); pFuncList->pfScan4x4Ac (pBlock + 48, pRes + 48); @@ -126,12 +126,12 @@ void WelsEncRecI16x16Y (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache) { pRes[224] = aDctT4Dc[14]; pRes[240] = aDctT4Dc[15]; - pFuncList->pfIDctFourT4 (pPred, kiRecStride, pBestPred, 16, pRes); - pFuncList->pfIDctFourT4 (pPred + 8, kiRecStride, pBestPred + 8, 16, pRes + 64); - pFuncList->pfIDctFourT4 (pPred + kiRecStride * 8, kiRecStride, pBestPred + 128, 16, pRes + 128); + pFuncList->pfIDctFourT4 (pPred, kiRecStride, pBestPred, 16, pRes); + pFuncList->pfIDctFourT4 (pPred + 8, kiRecStride, pBestPred + 8, 16, pRes + 64); + pFuncList->pfIDctFourT4 (pPred + kiRecStride * 8, kiRecStride, pBestPred + 128, 16, pRes + 128); pFuncList->pfIDctFourT4 (pPred + kiRecStride * 8 + 8, kiRecStride, pBestPred + 136, 16, pRes + 192); } else if (uiCountI16x16Dc > 0) { - pFuncList->pfIDctI16x16Dc (pPred, kiRecStride, pBestPred, 16, aDctT4Dc); + pFuncList->pfIDctI16x16Dc (pPred, kiRecStride, pBestPred, 16, aDctT4Dc); } else { pFuncList->pfCopy16x16Aligned (pPred, kiRecStride, pBestPred, 16); } @@ -319,10 +319,10 @@ void WelsRecPskip (SDqLayer* pCurLayer, SWelsFuncPtrList* pFuncList, SMB* pCu int32_t* iRecStride = pCurLayer->iCsStride; uint8_t** pCsMb = &pMbCache->SPicData.pCsMb[0]; - pFuncList->pfCopy16x16Aligned (pCsMb[0], *iRecStride++, pMbCache->pSkipMb, 16); - pFuncList->pfCopy8x8Aligned (pCsMb[1], *iRecStride++, pMbCache->pSkipMb + 256, 8); - pFuncList->pfCopy8x8Aligned (pCsMb[2], *iRecStride, pMbCache->pSkipMb + 320, 8); - pFuncList->pfSetMemZeroSize8 (pCurMb->pNonZeroCount, 24); + pFuncList->pfCopy16x16Aligned (pCsMb[0], *iRecStride++, pMbCache->pSkipMb, 16); + pFuncList->pfCopy8x8Aligned (pCsMb[1], *iRecStride++, pMbCache->pSkipMb + 256, 8); + pFuncList->pfCopy8x8Aligned (pCsMb[2], *iRecStride, pMbCache->pSkipMb + 320, 8); + pFuncList->pfSetMemZeroSize8 (pCurMb->pNonZeroCount, 24); } bool WelsTryPYskip (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache) { diff --git a/codec/encoder/core/src/svc_encode_slice.cpp b/codec/encoder/core/src/svc_encode_slice.cpp index 43ea7c55..09d9250c 100644 --- a/codec/encoder/core/src/svc_encode_slice.cpp +++ b/codec/encoder/core/src/svc_encode_slice.cpp @@ -448,8 +448,8 @@ void WelsPMbChromaEncode (sWelsEncCtx* pEncCtx, SSlice* pSlice, SMB* pCurMb) { int16_t* pCurRS = pMbCache->pCoeffLevel + 256; uint8_t* pBestPred = pMbCache->pMemPredChroma; - pFunc->pfDctFourT4 (pCurRS, pMbCache->SPicData.pEncMb[1], kiEncStride, pBestPred, 8); - pFunc->pfDctFourT4 (pCurRS + 64, pMbCache->SPicData.pEncMb[2], kiEncStride, pBestPred + 64, 8); + pFunc->pfDctFourT4 (pCurRS, pMbCache->SPicData.pEncMb[1], kiEncStride, pBestPred, 8); + pFunc->pfDctFourT4 (pCurRS + 64, pMbCache->SPicData.pEncMb[2], kiEncStride, pBestPred + 64, 8); WelsEncRecUV (pFunc, pCurMb, pMbCache, pCurRS, 1); WelsEncRecUV (pFunc, pCurMb, pMbCache, pCurRS + 64, 2); diff --git a/codec/processing/src/denoise/denoise_filter.cpp b/codec/processing/src/denoise/denoise_filter.cpp index 61e7b959..8a507a91 100644 --- a/codec/processing/src/denoise/denoise_filter.cpp +++ b/codec/processing/src/denoise/denoise_filter.cpp @@ -117,9 +117,9 @@ void Gauss3x3Filter (uint8_t* pSrc, int32_t iStride) { uint8_t* pCurLine2 = pCurLine1 + iStride; uint8_t* pCurLine3 = pCurLine2 + iStride; - nSum = pCurLine1[0] + (pCurLine1[1] << 1) + pCurLine1[2] + - (pCurLine2[0] << 1) + (pCurLine2[1] << 2) + (pCurLine2[2] << 1) + - pCurLine3[0] + (pCurLine3[1] << 1) + pCurLine3[2]; + nSum = pCurLine1[0] + (pCurLine1[1] << 1) + pCurLine1[2] + + (pCurLine2[0] << 1) + (pCurLine2[1] << 2) + (pCurLine2[2] << 1) + + pCurLine3[0] + (pCurLine3[1] << 1) + pCurLine3[2]; *pSrc = nSum >> 4; } diff --git a/test/decoder/DecUT_IntraPrediction.cpp b/test/decoder/DecUT_IntraPrediction.cpp index 2449dad8..953f484e 100644 --- a/test/decoder/DecUT_IntraPrediction.cpp +++ b/test/decoder/DecUT_IntraPrediction.cpp @@ -114,7 +114,7 @@ void WelsI4x4LumaPredDDL_ref (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiDDL6 = (2 + kuiT6 + kuiT7 + (kuiT7 << 1)) >> 2; // kDDL6 const uint8_t kuiList[8] = { kuiDDL0, kuiDDL1, kuiDDL2, kuiDDL3, kuiDDL4, kuiDDL5, kuiDDL6, 0 }; - ST32 (pPred , LD32 (kuiList)); + ST32 (pPred , LD32 (kuiList)); ST32 (pPred + kiStride , LD32 (kuiList + 1)); ST32 (pPred + kiStride2, LD32 (kuiList + 2)); ST32 (pPred + kiStride3, LD32 (kuiList + 3)); @@ -140,7 +140,7 @@ void WelsI4x4LumaPredDDLTop_ref (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiDLT3 = kuiT33 >> 1; // kDLT3 const uint8_t kuiList[8] = { kuiDLT0, kuiDLT1, kuiDLT2, kuiDLT3, kuiDLT3, kuiDLT3, kuiDLT3 , kuiDLT3 }; - ST32 (pPred, LD32 (kuiList)); + ST32 (pPred, LD32 (kuiList)); ST32 (pPred + kiStride, LD32 (kuiList + 1)); ST32 (pPred + kiStride2, LD32 (kuiList + 2)); ST32 (pPred + kiStride3, LD32 (kuiList + 3)); @@ -180,7 +180,7 @@ void WelsI4x4LumaPredDDR_ref (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiDDR6 = (kuiL12 + kuiL23) >> 2; // kuiDDR6 const uint8_t kuiList[8] = { kuiDDR6, kuiDDR5, kuiDDR4, kuiDDR0, kuiDDR1, kuiDDR2, kuiDDR3, 0 }; - ST32 (pPred , LD32 (kuiList + 3)); + ST32 (pPred , LD32 (kuiList + 3)); ST32 (pPred + kiStride , LD32 (kuiList + 2)); ST32 (pPred + kiStride2, LD32 (kuiList + 1)); ST32 (pPred + kiStride3, LD32 (kuiList)); @@ -218,7 +218,7 @@ void WelsI4x4LumaPredVL_ref (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiVL9 = (kuiT45 + kuiT56) >> 2; // kuiVL9 const uint8_t kuiList[10] = { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL8, kuiVL9 }; - ST32 (pPred, LD32 (kuiList)); + ST32 (pPred, LD32 (kuiList)); ST32 (pPred + kiStride, LD32 (kuiList + 5)); ST32 (pPred + kiStride2, LD32 (kuiList + 1)); ST32 (pPred + kiStride3, LD32 (kuiList + 6)); @@ -248,7 +248,7 @@ void WelsI4x4LumaPredVLTop_ref (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiVL7 = kuiVL3; const uint8_t kuiList[10] = { kuiVL0, kuiVL1, kuiVL2, kuiVL3, kuiVL3, kuiVL4, kuiVL5, kuiVL6, kuiVL7, kuiVL7 }; - ST32 (pPred , LD32 (kuiList)); + ST32 (pPred , LD32 (kuiList)); ST32 (pPred + kiStride , LD32 (kuiList + 5)); ST32 (pPred + kiStride2, LD32 (kuiList + 1)); ST32 (pPred + kiStride3, LD32 (kuiList + 6)); @@ -280,7 +280,7 @@ void WelsI4x4LumaPredVR_ref (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiVR9 = (2 + kuiL0 + (kuiL1 << 1) + kuiL2) >> 2; // kuiVR9 const uint8_t kuiList[10] = { kuiVR8, kuiVR0, kuiVR1, kuiVR2, kuiVR3, kuiVR9, kuiVR4, kuiVR5, kuiVR6, kuiVR7 }; - ST32 (pPred , LD32 (kuiList + 1)); + ST32 (pPred , LD32 (kuiList + 1)); ST32 (pPred + kiStride , LD32 (kuiList + 6)); ST32 (pPred + kiStride2, LD32 (kuiList)); ST32 (pPred + kiStride3, LD32 (kuiList + 5)); @@ -306,7 +306,7 @@ void WelsI4x4LumaPredHU_ref (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiHU5 = (1 + kuiL23 + (kuiL3 << 1)) >> 2; const uint8_t kuiList[10] = { kuiHU0, kuiHU1, kuiHU2, kuiHU3, kuiHU4, kuiHU5, kuiL3, kuiL3, kuiL3, kuiL3 }; - ST32 (pPred , LD32 (kuiList)); + ST32 (pPred , LD32 (kuiList)); ST32 (pPred + kiStride , LD32 (kuiList + 2)); ST32 (pPred + kiStride2, LD32 (kuiList + 4)); ST32 (pPred + kiStride3, LD32 (kuiList + 6)); @@ -344,7 +344,7 @@ void WelsI4x4LumaPredHD_ref (uint8_t* pPred, const int32_t kiStride) { const uint8_t kuiHD9 = (kuiL12 + kuiL23) >> 2; const uint8_t kuiList[10] = { kuiHD8, kuiHD9, kuiHD6, kuiHD7, kuiHD4, kuiHD5, kuiHD0, kuiHD1, kuiHD2, kuiHD3 }; - ST32 (pPred , LD32 (kuiList + 6)); + ST32 (pPred , LD32 (kuiList + 6)); ST32 (pPred + kiStride , LD32 (kuiList + 4)); ST32 (pPred + kiStride2, LD32 (kuiList + 2)); ST32 (pPred + kiStride3, LD32 (kuiList));