Fix shifting tricks in deblocking for big endian

The code interprets an array of 4 uint8_t values as one uint32_t
and does shifts on the value. The same optimization can be
kept in big endian as well, but the shift has to be done in the
other direction.

This code could be made truly independent of endianness, but
that could cause some minimal performance degradation, at least
in theory.

This makes "make test" pass on big endian, assuming that
WORDS_BIGENDIAN is defined while building.
This commit is contained in:
Martin Storsjö
2014-01-29 12:49:38 +02:00
parent 47ec78843a
commit cadbec75d8
3 changed files with 22 additions and 16 deletions

View File

@@ -15,6 +15,12 @@ void_t DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, i
int8_t* pTc);
void_t DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta);
/*
 * DEBLOCK_BS_SHIFTED(x): x is four uint8_t values that were loaded from memory
 * as a single uint32_t. The result, once stored back to memory, has byte i
 * (for i = 0..2) equal to b[i] | b[i + 1]; byte 3 is left as b[3].
 *
 * The "next" byte in memory sits 8 bits higher in the word on little endian
 * and 8 bits lower on big endian, so the shift direction depends on
 * WORDS_BIGENDIAN.
 *
 * Note: the argument is evaluated twice; pass an expression without side
 * effects.
 */
#ifndef WORDS_BIGENDIAN
#define DEBLOCK_BS_SHIFTED(x) (((x) >> 8) | (x))
#else
#define DEBLOCK_BS_SHIFTED(x) (((x) << 8) | (x))
#endif
#if defined(__cplusplus)
extern "C" {
#endif//__cplusplus

View File

@@ -152,24 +152,24 @@ void_t inline DeblockingBSInsideMBAvsbase (int8_t* pNnzTab, uint8_t nBS[2][4][4]
uiNnz32b2 = * (uint32_t*) (pNnzTab + 8);
uiNnz32b3 = * (uint32_t*) (pNnzTab + 12);
* (uint32_t*)uiBsx3 = (uiNnz32b0 | (uiNnz32b0 >> 8)) << iLShiftFactor;
* (uint32_t*)uiBsx3 = DEBLOCK_BS_SHIFTED (uiNnz32b0) << iLShiftFactor;
nBS[0][1][0] = uiBsx3[0];
nBS[0][2][0] = uiBsx3[1];
nBS[0][3][0] = uiBsx3[2];
* (uint32_t*)uiBsx3 = (uiNnz32b1 | (uiNnz32b1 >> 8)) << iLShiftFactor;
* (uint32_t*)uiBsx3 = DEBLOCK_BS_SHIFTED (uiNnz32b1) << iLShiftFactor;
nBS[0][1][1] = uiBsx3[0];
nBS[0][2][1] = uiBsx3[1];
nBS[0][3][1] = uiBsx3[2];
* (uint32_t*)nBS[1][1] = (uiNnz32b0 | uiNnz32b1) << iLShiftFactor;
* (uint32_t*)uiBsx3 = (uiNnz32b2 | (uiNnz32b2 >> 8)) << iLShiftFactor;
* (uint32_t*)uiBsx3 = DEBLOCK_BS_SHIFTED (uiNnz32b2) << iLShiftFactor;
nBS[0][1][2] = uiBsx3[0];
nBS[0][2][2] = uiBsx3[1];
nBS[0][3][2] = uiBsx3[2];
* (uint32_t*)nBS[1][2] = (uiNnz32b1 | uiNnz32b2) << iLShiftFactor;
* (uint32_t*)uiBsx3 = (uiNnz32b3 | (uiNnz32b3 >> 8)) << iLShiftFactor;
* (uint32_t*)uiBsx3 = DEBLOCK_BS_SHIFTED (uiNnz32b3) << iLShiftFactor;
nBS[0][1][3] = uiBsx3[0];
nBS[0][2][3] = uiBsx3[1];
nBS[0][3][3] = uiBsx3[2];
@@ -188,22 +188,22 @@ void_t static inline DeblockingBSInsideMBNormal (PDqLayer pCurDqLayer, uint8_t n
uiNnz32b2 = * (uint32_t*) (pNnzTab + 8);
uiNnz32b3 = * (uint32_t*) (pNnzTab + 12);
* (uint32_t*)uiBsx4 = (uiNnz32b0 | (uiNnz32b0 >> 8));
* (uint32_t*)uiBsx4 = DEBLOCK_BS_SHIFTED (uiNnz32b0);
nBS[0][1][0] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 1, 0);
nBS[0][2][0] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 2, 1);
nBS[0][3][0] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 3, 2);
* (uint32_t*)uiBsx4 = (uiNnz32b1 | (uiNnz32b1 >> 8));
* (uint32_t*)uiBsx4 = DEBLOCK_BS_SHIFTED (uiNnz32b1);
nBS[0][1][1] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 5, 4);
nBS[0][2][1] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 6, 5);
nBS[0][3][1] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 7, 6);
* (uint32_t*)uiBsx4 = (uiNnz32b2 | (uiNnz32b2 >> 8));
* (uint32_t*)uiBsx4 = DEBLOCK_BS_SHIFTED (uiNnz32b2);
nBS[0][1][2] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 9, 8);
nBS[0][2][2] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 10, 9);
nBS[0][3][2] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 11, 10);
* (uint32_t*)uiBsx4 = (uiNnz32b3 | (uiNnz32b3 >> 8));
* (uint32_t*)uiBsx4 = DEBLOCK_BS_SHIFTED (uiNnz32b3);
nBS[0][1][3] = BS_EDGE (uiBsx4[0], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 13, 12);
nBS[0][2][3] = BS_EDGE (uiBsx4[1], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 14, 13);
nBS[0][3][3] = BS_EDGE (uiBsx4[2], iRefIndex, pCurDqLayer->pMv[LIST_0][iMbXy], 15, 14);

View File

@@ -164,24 +164,24 @@ void inline DeblockingBSInsideMBAvsbase (int8_t* pNnzTab, uint8_t uiBS[2][4][4],
uiNnz32b2 = * (uint32_t*) (pNnzTab + 8);
uiNnz32b3 = * (uint32_t*) (pNnzTab + 12);
* (uint32_t*)uiBsx3 = (uiNnz32b0 | (uiNnz32b0 >> 8)) << iLShiftFactor;
* (uint32_t*)uiBsx3 = DEBLOCK_BS_SHIFTED (uiNnz32b0) << iLShiftFactor;
uiBS[0][1][0] = uiBsx3[0];
uiBS[0][2][0] = uiBsx3[1];
uiBS[0][3][0] = uiBsx3[2];
* (uint32_t*)uiBsx3 = (uiNnz32b1 | (uiNnz32b1 >> 8)) << iLShiftFactor;
* (uint32_t*)uiBsx3 = DEBLOCK_BS_SHIFTED (uiNnz32b1) << iLShiftFactor;
uiBS[0][1][1] = uiBsx3[0];
uiBS[0][2][1] = uiBsx3[1];
uiBS[0][3][1] = uiBsx3[2];
* (uint32_t*)uiBS[1][1] = (uiNnz32b0 | uiNnz32b1) << iLShiftFactor;
* (uint32_t*)uiBsx3 = (uiNnz32b2 | (uiNnz32b2 >> 8)) << iLShiftFactor;
* (uint32_t*)uiBsx3 = DEBLOCK_BS_SHIFTED (uiNnz32b2) << iLShiftFactor;
uiBS[0][1][2] = uiBsx3[0];
uiBS[0][2][2] = uiBsx3[1];
uiBS[0][3][2] = uiBsx3[2];
* (uint32_t*)uiBS[1][2] = (uiNnz32b1 | uiNnz32b2) << iLShiftFactor;
* (uint32_t*)uiBsx3 = (uiNnz32b3 | (uiNnz32b3 >> 8)) << iLShiftFactor;
* (uint32_t*)uiBsx3 = DEBLOCK_BS_SHIFTED (uiNnz32b3) << iLShiftFactor;
uiBS[0][1][3] = uiBsx3[0];
uiBS[0][2][3] = uiBsx3[1];
uiBS[0][3][3] = uiBsx3[2];
@@ -198,22 +198,22 @@ void inline DeblockingBSInsideMBNormal (SMB* pCurMb, uint8_t uiBS[2][4][4], int8
uiNnz32b2 = * (uint32_t*) (pNnzTab + 8);
uiNnz32b3 = * (uint32_t*) (pNnzTab + 12);
* (uint32_t*)uiBsx4 = (uiNnz32b0 | (uiNnz32b0 >> 8));
* (uint32_t*)uiBsx4 = DEBLOCK_BS_SHIFTED (uiNnz32b0);
uiBS[0][1][0] = BS_EDGE (uiBsx4[0], iRefIdx, pCurMb->sMv, 1, 0);
uiBS[0][2][0] = BS_EDGE (uiBsx4[1], iRefIdx, pCurMb->sMv, 2, 1);
uiBS[0][3][0] = BS_EDGE (uiBsx4[2], iRefIdx, pCurMb->sMv, 3, 2);
* (uint32_t*)uiBsx4 = (uiNnz32b1 | (uiNnz32b1 >> 8));
* (uint32_t*)uiBsx4 = DEBLOCK_BS_SHIFTED (uiNnz32b1);
uiBS[0][1][1] = BS_EDGE (uiBsx4[0], iRefIdx, pCurMb->sMv, 5, 4);
uiBS[0][2][1] = BS_EDGE (uiBsx4[1], iRefIdx, pCurMb->sMv, 6, 5);
uiBS[0][3][1] = BS_EDGE (uiBsx4[2], iRefIdx, pCurMb->sMv, 7, 6);
* (uint32_t*)uiBsx4 = (uiNnz32b2 | (uiNnz32b2 >> 8));
* (uint32_t*)uiBsx4 = DEBLOCK_BS_SHIFTED (uiNnz32b2);
uiBS[0][1][2] = BS_EDGE (uiBsx4[0], iRefIdx, pCurMb->sMv, 9, 8);
uiBS[0][2][2] = BS_EDGE (uiBsx4[1], iRefIdx, pCurMb->sMv, 10, 9);
uiBS[0][3][2] = BS_EDGE (uiBsx4[2], iRefIdx, pCurMb->sMv, 11, 10);
* (uint32_t*)uiBsx4 = (uiNnz32b3 | (uiNnz32b3 >> 8));
* (uint32_t*)uiBsx4 = DEBLOCK_BS_SHIFTED (uiNnz32b3);
uiBS[0][1][3] = BS_EDGE (uiBsx4[0], iRefIdx, pCurMb->sMv, 13, 12);
uiBS[0][2][3] = BS_EDGE (uiBsx4[1], iRefIdx, pCurMb->sMv, 14, 13);
uiBS[0][3][3] = BS_EDGE (uiBsx4[2], iRefIdx, pCurMb->sMv, 15, 14);