[Encoder] CABAC optimizations

~2.4x speedup (time attributed to all CABAC-related functions) on x86
(Ivy Bridge) with GCC version 4.9.2 (Debian 4.9.2-10).

~1.3x overall faster encode on a quick 720p30 6Mbps test.

Reviewed at https://rbcommons.com/s/OpenH264/r/1347/
This commit is contained in:
Sindre Aamås 2015-10-12 17:59:08 +02:00
parent b700b67bba
commit ed133d4c3d
3 changed files with 163 additions and 151 deletions

View File

@ -50,29 +50,33 @@ namespace WelsEnc {
#define WELS_QP_MAX 51
typedef uint64_t cabac_low_t;
enum { CABAC_LOW_WIDTH = sizeof (cabac_low_t) / sizeof (uint8_t) * 8 };
typedef struct TagStateCtx {
uint8_t m_uiState;
uint8_t m_uiValMps;
// Packed representation of state and MPS as state << 1 | MPS.
uint8_t m_uiStateMps;
uint8_t Mps() const { return m_uiStateMps & 1; }
uint8_t State() const { return m_uiStateMps >> 1; }
void Set (uint8_t uiState, uint8_t uiMps) { m_uiStateMps = uiState * 2 + uiMps; }
} SStateCtx;
typedef struct TagCabacCtx {
uint32_t m_uiLow;
cabac_low_t m_uiLow;
int32_t m_iLowBitCnt;
int32_t m_iRenormCnt;
uint32_t m_uiRange;
SStateCtx m_sStateCtx[WELS_CONTEXT_COUNT];
uint8_t* m_pBufStart;
uint8_t* m_pBufEnd;
uint8_t* m_pBufCur;
uint8_t m_iBitsOutstanding;
uint32_t m_uData;
uint32_t m_uiBitsUsed;
uint32_t m_iFirstFlag;
uint32_t m_uiBinCountsInNalUnits;
} SCabacCtx;
void WelsCabacContextInit (void* pCtx, SCabacCtx* pCbCtx, int32_t iModel);
void WelsCabacEncodeInit (SCabacCtx* pCbCtx, uint8_t* pBuf, uint8_t* pEnd);
void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin);
void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin);
inline void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin);
inline void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin);
void WelsCabacEncodeTerminate (SCabacCtx* pCbCtx, uint32_t uiBin);
void WelsCabacEncodeUeBypass (SCabacCtx* pCbCtx, int32_t iExpBits, uint32_t uiVal);
void WelsCabacEncodeFlush (SCabacCtx* pCbCtx);
@ -81,5 +85,43 @@ int32_t WriteBlockResidualCabac (void* pEncCtx, int16_t* pCoffLevel, int32_t i
int32_t iCalRunLevelFlag,
int32_t iResidualProperty, int8_t iNC, SBitStringAux* pBs);
// private functions used by public inline functions.
void WelsCabacEncodeDecisionLps_ (SCabacCtx* pCbCtx, int32_t iCtx);
void WelsCabacEncodeUpdateLowNontrivial_ (SCabacCtx* pCbCtx);
// Applies the shift accumulated in m_iRenormCnt to m_uiLow and clears the
// pending count.
//
// When the bits already tracked in m_uiLow plus the pending shift still fit
// below CABAC_LOW_WIDTH, the shift is applied in place. Otherwise the work is
// handed to WelsCabacEncodeUpdateLowNontrivial_.
inline void WelsCabacEncodeUpdateLow_ (SCabacCtx* pCbCtx) {
  const int32_t kiPendingShift = pCbCtx->m_iRenormCnt;
  const int32_t kiBitsAfterShift = pCbCtx->m_iLowBitCnt + kiPendingShift;
  if (kiBitsAfterShift >= CABAC_LOW_WIDTH) {
    // Not enough headroom left in m_uiLow: take the out-of-line path.
    WelsCabacEncodeUpdateLowNontrivial_ (pCbCtx);
  } else {
    pCbCtx->m_uiLow <<= kiPendingShift;
    pCbCtx->m_iLowBitCnt = kiBitsAfterShift;
  }
  pCbCtx->m_iRenormCnt = 0;
}
// inline function definitions.
// Encodes one context-coded bin.
//
// pCbCtx: encoder state; its range, pending renorm count and the context
//         entry at iCtx are updated.
// iCtx:   index into pCbCtx->m_sStateCtx.
// uiBin:  bin value to encode.
//
// A bin equal to the context's current MPS is handled inline here; any other
// value is forwarded to WelsCabacEncodeDecisionLps_.
void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin) {
  if (uiBin != pCbCtx->m_sStateCtx[iCtx].Mps()) {
    WelsCabacEncodeDecisionLps_ (pCbCtx, iCtx);
    return;
  }

  const int32_t kiStateIdx = pCbCtx->m_sStateCtx[iCtx].State();
  const uint32_t kuiOldRange = pCbCtx->m_uiRange;
  const uint32_t kuiLpsRange = g_kuiCabacRangeLps[kiStateIdx][ (kuiOldRange & 0xff) >> 6];
  const uint32_t kuiMpsRange = kuiOldRange - kuiLpsRange;

  // (range >> 8) ^ 1 is 0 when bit 8 of the reduced range is set and 1 when
  // the value is below 256.
  // NOTE(review): relies on the range being a 9-bit value here - confirm.
  const int32_t kiShift = (kuiMpsRange >> 8) ^ 1;

  pCbCtx->m_uiRange = kuiMpsRange << kiShift;
  pCbCtx->m_iRenormCnt += kiShift;
  pCbCtx->m_sStateCtx[iCtx].Set (g_kuiStateTransTable[kiStateIdx][1], uiBin);
}
// Encodes one bypass bin: queues one more bit of shift, brings m_uiLow up to
// date, then adds the current range to m_uiLow when the bin selects it.
void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin) {
  // Unsigned wrap-around: all ones for uiBin == 1, zero for uiBin == 0.
  const uint32_t kuiSelectRange = 0u - uiBin;
  ++pCbCtx->m_iRenormCnt;
  WelsCabacEncodeUpdateLow_ (pCbCtx);
  pCbCtx->m_uiLow += pCbCtx->m_uiRange & kuiSelectRange;
}
}
#endif

View File

@ -42,10 +42,25 @@
#include "macros.h"
#include "set_mb_syn_cabac.h"
#include "encoder.h"
#include "golomb_common.h"
namespace {

// Shift-amount lookup used by WelsCabacEncodeDecisionLps_, indexed by
// (uiRangeLps >> 3). Entries step down from 6 to 1 as the index doubles.
const int8_t g_kiClz5Table[32] = {
  6, 5, 4, 4, 3, 3, 3, 3,
  2, 2, 2, 2, 2, 2, 2, 2,
  1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1
};

// Adds one to the byte string [pBufStart, pBufCur), treating the byte just
// before pBufCur as the least significant. Walks backwards while bytes wrap
// from 0xff to 0x00 and stops at the first byte that does not wrap, or at
// pBufStart.
void PropagateCarry (uint8_t* pBufCur, uint8_t* pBufStart) {
  while (pBufCur > pBufStart) {
    --pBufCur;
    if (++*pBufCur != 0)
      return;
  }
}

} // anonymous namespace
namespace WelsEnc {
void WelsCabacInit (void* pCtx) {
sWelsEncCtx* pEncCtx = (sWelsEncCtx*)pCtx;
for (int32_t iModel = 0; iModel < 4; iModel++) {
@ -63,8 +78,7 @@ void WelsCabacInit (void* pCtx) {
uiStateIdx = iPreCtxState - 64;
uiValMps = 1;
}
pEncCtx->sWelsCabacContexts[iModel][iQp][iIdx].m_uiState = uiStateIdx;
pEncCtx->sWelsCabacContexts[iModel][iQp][iIdx].m_uiValMps = uiValMps;
pEncCtx->sWelsCabacContexts[iModel][iQp][iIdx].Set (uiStateIdx, uiValMps);
}
}
}
@ -79,121 +93,76 @@ void WelsCabacContextInit (void* pCtx, SCabacCtx* pCbCtx, int32_t iModel) {
void WelsCabacEncodeInit (SCabacCtx* pCbCtx, uint8_t* pBuf, uint8_t* pEnd) {
pCbCtx->m_uiLow = 0;
pCbCtx->m_iLowBitCnt = 9;
pCbCtx->m_iRenormCnt = 0;
pCbCtx->m_uiRange = 510;
pCbCtx->m_iBitsOutstanding = 0;
pCbCtx->m_uData = 0;
pCbCtx->m_uiBitsUsed = 0;
pCbCtx->m_iFirstFlag = 1;
pCbCtx->m_pBufStart = pBuf;
pCbCtx->m_pBufEnd = pEnd;
pCbCtx->m_pBufCur = pBuf;
pCbCtx->m_uiBinCountsInNalUnits = 0;
}
void WelsCabacPutBit (SCabacCtx* pCbCtx, uint32_t iValue) {
if (pCbCtx->m_iFirstFlag != 0) {
pCbCtx->m_iFirstFlag = 0;
} else {
pCbCtx->m_uData = (pCbCtx->m_uData << 1) | iValue;
pCbCtx->m_uiBitsUsed++;
}
if (pCbCtx->m_iBitsOutstanding == 0) {
while (pCbCtx->m_uiBitsUsed >= 8) {
pCbCtx->m_uiBitsUsed -= 8;
uint32_t uiByte = pCbCtx->m_uData >> (pCbCtx->m_uiBitsUsed);
if (pCbCtx->m_uiBitsUsed == 0)
pCbCtx->m_uData = 0;
else
pCbCtx->m_uData &= (uint32_t) ((0xFFFFFFFF) >> (32 - pCbCtx->m_uiBitsUsed));
*pCbCtx->m_pBufCur ++ = uiByte;
}
} else {
void WelsCabacEncodeUpdateLowNontrivial_ (SCabacCtx* pCbCtx) {
int32_t iLowBitCnt = pCbCtx->m_iLowBitCnt;
int32_t iRenormCnt = pCbCtx->m_iRenormCnt;
cabac_low_t uiLow = pCbCtx->m_uiLow;
while (pCbCtx->m_iBitsOutstanding > 0) {
pCbCtx->m_uData = (pCbCtx->m_uData << 1) | (1 - iValue);
pCbCtx->m_iBitsOutstanding--;
pCbCtx->m_uiBitsUsed++;
while (pCbCtx->m_uiBitsUsed >= 8) {
pCbCtx->m_uiBitsUsed -= 8;
uint32_t uiByte = pCbCtx->m_uData >> (pCbCtx->m_uiBitsUsed);
if (pCbCtx->m_uiBitsUsed == 0)
pCbCtx->m_uData = 0;
else
pCbCtx->m_uData &= (uint32_t) ((0xFFFFFFFF) >> (32 - pCbCtx->m_uiBitsUsed));
*pCbCtx->m_pBufCur ++ = uiByte;
}
}
}
}
void WelsCabacEncodeRenorm (SCabacCtx* pCbCtx) {
while (pCbCtx->m_uiRange < 256) {
if (pCbCtx->m_uiLow < 256) {
WelsCabacPutBit (pCbCtx, 0);
} else {
if (pCbCtx->m_uiLow >= 512) {
pCbCtx->m_uiLow -= 512;
WelsCabacPutBit (pCbCtx, 1);
} else {
pCbCtx->m_uiLow -= 256;
pCbCtx->m_iBitsOutstanding++;
}
}
pCbCtx->m_uiRange <<= 1;
pCbCtx->m_uiLow <<= 1;
}
}
void WelsCabacEncodeDecision (SCabacCtx* pCbCtx, int32_t iCtx, uint32_t uiBin) {
uint8_t uiState = pCbCtx->m_sStateCtx[iCtx].m_uiState;
uint8_t uiValMps = pCbCtx->m_sStateCtx[iCtx].m_uiValMps;
uint32_t uiRangeLps = g_kuiCabacRangeLps[uiState][ (pCbCtx->m_uiRange >> 6) & 3];
do {
uint8_t* pBufCur = pCbCtx->m_pBufCur;
const int32_t kiInc = CABAC_LOW_WIDTH - 1 - iLowBitCnt;
pCbCtx->m_uiRange -= uiRangeLps;
if (uiBin != uiValMps) { //LPS
pCbCtx->m_uiLow += pCbCtx->m_uiRange;
pCbCtx->m_uiRange = uiRangeLps;
if (uiState == 0)
uiValMps = 1 - uiValMps;
pCbCtx->m_sStateCtx[iCtx].m_uiState = g_kuiStateTransTable[uiState][0];
pCbCtx->m_sStateCtx[iCtx].m_uiValMps = uiValMps;
} else {
pCbCtx->m_sStateCtx[iCtx].m_uiState = g_kuiStateTransTable[uiState][1];
}
WelsCabacEncodeRenorm (pCbCtx);
pCbCtx->m_uiBinCountsInNalUnits++;
uiLow <<= kiInc;
if (uiLow & cabac_low_t (1) << (CABAC_LOW_WIDTH - 1))
PropagateCarry (pBufCur, pCbCtx->m_pBufStart);
if (CABAC_LOW_WIDTH > 32) {
WRITE_BE_32 (pBufCur, uiLow >> 31);
pBufCur += 4;
}
*pBufCur++ = uiLow >> 23;
*pBufCur++ = uiLow >> 15;
iRenormCnt -= kiInc;
iLowBitCnt = 15;
uiLow &= (1u << iLowBitCnt) - 1;
pCbCtx->m_pBufCur = pBufCur;
} while (iLowBitCnt + iRenormCnt > CABAC_LOW_WIDTH - 1);
pCbCtx->m_iLowBitCnt = iLowBitCnt + iRenormCnt;
pCbCtx->m_uiLow = uiLow << iRenormCnt;
}
void WelsCabacEncodeBypassOne (SCabacCtx* pCbCtx, uint32_t uiBin) {
pCbCtx->m_uiLow <<= 1;
if (uiBin) {
pCbCtx->m_uiLow += pCbCtx->m_uiRange;
}
if (pCbCtx->m_uiLow >= 1024) {
WelsCabacPutBit (pCbCtx, 1);
pCbCtx->m_uiLow -= 1024;
} else {
if (pCbCtx->m_uiLow < 512)
WelsCabacPutBit (pCbCtx, 0);
else {
pCbCtx->m_uiLow -= 512;
pCbCtx->m_iBitsOutstanding++;
}
}
pCbCtx->m_uiBinCountsInNalUnits++;
// Out-of-line half of WelsCabacEncodeDecision, called when the bin differs
// from the MPS stored in context iCtx.
//
// Moves the context to its [0] transition state (flipping the stored MPS when
// the current state index is 0), flushes the pending shift into m_uiLow, adds
// the range left after removing the LPS sub-range to m_uiLow, and makes the
// LPS sub-range the new range, pre-shifted by the table-provided amount.
void WelsCabacEncodeDecisionLps_ (SCabacCtx* pCbCtx, int32_t iCtx) {
  const int32_t kiStateIdx = pCbCtx->m_sStateCtx[iCtx].State();
  const uint32_t kuiOldRange = pCbCtx->m_uiRange;
  const uint32_t kuiLpsRange = g_kuiCabacRangeLps[kiStateIdx][ (kuiOldRange & 0xff) >> 6];
  const uint32_t kuiMpsRange = kuiOldRange - kuiLpsRange;

  // Context update: the MPS toggles only when leaving state 0.
  const uint8_t kuiOldMps = pCbCtx->m_sStateCtx[iCtx].Mps();
  pCbCtx->m_sStateCtx[iCtx].Set (g_kuiStateTransTable[kiStateIdx][0], kuiOldMps ^ (kiStateIdx == 0));

  // The pending shift must be applied before the offset is added to m_uiLow.
  WelsCabacEncodeUpdateLow_ (pCbCtx);
  pCbCtx->m_uiLow += kuiMpsRange;

  // The shift for the new range is only recorded here; m_uiLow receives it on
  // the next WelsCabacEncodeUpdateLow_ call.
  const int32_t kiShift = g_kiClz5Table[kuiLpsRange >> 3];
  pCbCtx->m_uiRange = kuiLpsRange << kiShift;
  pCbCtx->m_iRenormCnt = kiShift;
}
void WelsCabacEncodeTerminate (SCabacCtx* pCbCtx, uint32_t uiBin) {
pCbCtx->m_uiRange -= 2;
if (uiBin) {
WelsCabacEncodeUpdateLow_ (pCbCtx);
pCbCtx->m_uiLow += pCbCtx->m_uiRange;
pCbCtx->m_uiRange = 2;
WelsCabacEncodeRenorm (pCbCtx);
WelsCabacPutBit (pCbCtx, ((pCbCtx->m_uiLow >> 9) & 1));
int32_t iLastTwoBits = (((pCbCtx->m_uiLow >> 7) & 3) | 1);
pCbCtx->m_uData = (pCbCtx->m_uData << 2) | iLastTwoBits;
pCbCtx->m_uiBitsUsed += 2;
const int32_t kiRenormAmount = 7;
pCbCtx->m_uiRange = 2 << kiRenormAmount;
pCbCtx->m_iRenormCnt = kiRenormAmount;
WelsCabacEncodeUpdateLow_ (pCbCtx);
pCbCtx->m_uiLow |= 0x80;
} else {
WelsCabacEncodeRenorm (pCbCtx);
const int32_t kiRenormAmount = pCbCtx->m_uiRange >> 8 ^ 1;
pCbCtx->m_uiRange = pCbCtx->m_uiRange << kiRenormAmount;
pCbCtx->m_iRenormCnt += kiRenormAmount;
}
pCbCtx->m_uiBinCountsInNalUnits++;
}
void WelsCabacEncodeUeBypass (SCabacCtx* pCbCtx, int32_t iExpBits, uint32_t uiVal) {
int32_t iSufS = uiVal;
@ -215,22 +184,18 @@ void WelsCabacEncodeUeBypass (SCabacCtx* pCbCtx, int32_t iExpBits, uint32_t uiVa
void WelsCabacEncodeFlush (SCabacCtx* pCbCtx) {
WelsCabacEncodeTerminate (pCbCtx, 1);
while (pCbCtx->m_uiBitsUsed > 0) {
if (pCbCtx->m_uiBitsUsed > 8) {
pCbCtx->m_uiBitsUsed -= 8;
uint32_t uiByte = pCbCtx->m_uData >> (pCbCtx->m_uiBitsUsed);
pCbCtx->m_uData &= (uint32_t) ((0xFFFFFFFF) >> (32 - pCbCtx->m_uiBitsUsed));
*pCbCtx->m_pBufCur ++ = uiByte;
} else {
if (pCbCtx->m_uiBitsUsed == 8) {
*pCbCtx->m_pBufCur ++ = pCbCtx->m_uData & 0xff;
} else {
*pCbCtx->m_pBufCur ++ = (pCbCtx->m_uData << (8 - pCbCtx->m_uiBitsUsed));
}
pCbCtx->m_uiBitsUsed = 0;
}
}
cabac_low_t uiLow = pCbCtx->m_uiLow;
int32_t iLowBitCnt = pCbCtx->m_iLowBitCnt;
uint8_t* pBufCur = pCbCtx->m_pBufCur;
uiLow <<= CABAC_LOW_WIDTH - 1 - iLowBitCnt;
if (uiLow & cabac_low_t (1) << (CABAC_LOW_WIDTH - 1))
PropagateCarry (pBufCur, pCbCtx->m_pBufStart);
for (; (iLowBitCnt -= 8) >= 0; uiLow <<= 8)
*pBufCur++ = uiLow >> (CABAC_LOW_WIDTH - 9);
pCbCtx->m_pBufCur = pBufCur;
}
uint8_t* WelsCabacEncodeGetPtr (SCabacCtx* pCbCtx) {

View File

@ -41,7 +41,9 @@
#include "set_mb_syn_cabac.h"
#include "svc_enc_golomb.h"
namespace WelsEnc {
using namespace WelsEnc;
namespace {
static const uint16_t uiSignificantCoeffFlagOffset[5] = {0, 15, 29, 44, 47};
static const uint16_t uiLastCoeffFlagOffset[5] = {0, 15, 29, 44, 47};
@ -455,21 +457,17 @@ void WelsWriteBlockResidualCabac (SMbCache* pMbCache, SMB* pCurMb, uint32_t iMb
ECtxBlockCat eCtxBlockCat, int16_t iIdx, int16_t iNonZeroCount, int16_t* pBlock, int16_t iEndIdx) {
int32_t iCtx = WelsGetMbCtxCabac (pMbCache, pCurMb, iMbWidth, eCtxBlockCat, iIdx);
if (iNonZeroCount) {
ENFORCE_STACK_ALIGN_1D (int16_t, iAbsLevel, 16, 16);
ENFORCE_STACK_ALIGN_1D (int16_t, iSignLevel, 16, 16);
int16_t iLevel[16];
const int32_t iCtxSig = 105 + uiSignificantCoeffFlagOffset[eCtxBlockCat];
const int32_t iCtxLast = 166 + uiLastCoeffFlagOffset[eCtxBlockCat];
const int32_t iCtxLevel = 227 + uiCoeffAbsLevelMinus1Offset[eCtxBlockCat];
int32_t iNonZeroIdx = 0;
int32_t i = 0;
int32_t iNumAbsLevelEq1 = 0;
int32_t iNumAbsLevelGt1 = 0;
WelsCabacEncodeDecision (pCabacCtx, iCtx, 1);
while (1) {
if (pBlock[i]) {
iSignLevel[iNonZeroIdx] = pBlock[i] < 0;
iAbsLevel[iNonZeroIdx] = WELS_ABS (pBlock[i]) - 1;
iLevel[iNonZeroIdx] = pBlock[i];
iNonZeroIdx++;
WelsCabacEncodeDecision (pCabacCtx, iCtxSig + i, 1);
@ -483,33 +481,38 @@ void WelsWriteBlockResidualCabac (SMbCache* pMbCache, SMB* pCurMb, uint32_t iMb
WelsCabacEncodeDecision (pCabacCtx, iCtxSig + i, 0);
i++;
if (i == iEndIdx) {
iSignLevel[iNonZeroIdx] = pBlock[i] < 0;
iAbsLevel[iNonZeroIdx] = WELS_ABS (pBlock[i]) - 1;
iLevel[iNonZeroIdx] = pBlock[i];
iNonZeroIdx++;
break;
}
}
int32_t iNumAbsLevelGt1 = 0;
int32_t iCtx1 = iCtxLevel + 1;
do {
int32_t iPrefix = 0;
iNonZeroIdx--;
iPrefix = WELS_MIN (iAbsLevel[iNonZeroIdx], 14);
iPrefix = WELS_ABS (iLevel[iNonZeroIdx]) - 1;
if (iPrefix) {
iCtx = iCtxLevel + ((iNumAbsLevelGt1 != 0) ? 0 : WELS_MIN (4, 1 + iNumAbsLevelEq1));
iPrefix = WELS_MIN (iPrefix, 14);
iCtx = WELS_MIN (iCtxLevel + 4, iCtx1);
WelsCabacEncodeDecision (pCabacCtx, iCtx, 1);
iCtx = iCtxLevel + 5 + WELS_MIN (4 - (eCtxBlockCat == CHROMA_DC), iNumAbsLevelGt1);
for (i = 0; i < iPrefix - 1; i++)
iNumAbsLevelGt1++;
iCtx = iCtxLevel + 4 + WELS_MIN (5 - (eCtxBlockCat == CHROMA_DC), iNumAbsLevelGt1);
for (i = 1; i < iPrefix; i++)
WelsCabacEncodeDecision (pCabacCtx, iCtx, 1);
if (iPrefix < 14)
if (WELS_ABS (iLevel[iNonZeroIdx]) < 15)
WelsCabacEncodeDecision (pCabacCtx, iCtx, 0);
else
WelsCabacEncodeUeBypass (pCabacCtx, 0, iAbsLevel[iNonZeroIdx] - 14);
iNumAbsLevelGt1++;
WelsCabacEncodeUeBypass (pCabacCtx, 0, WELS_ABS (iLevel[iNonZeroIdx]) - 15);
iCtx1 = iCtxLevel;
} else {
iCtx = iCtxLevel + ((iNumAbsLevelGt1 != 0) ? 0 : WELS_MIN (4, 1 + iNumAbsLevelEq1));
iCtx = WELS_MIN (iCtxLevel + 4, iCtx1);
WelsCabacEncodeDecision (pCabacCtx, iCtx, 0);
iNumAbsLevelEq1++;
iCtx1 += iNumAbsLevelGt1 == 0;
}
WelsCabacEncodeBypassOne (pCabacCtx, iSignLevel[iNonZeroIdx]);
WelsCabacEncodeBypassOne (pCabacCtx, iLevel[iNonZeroIdx] < 0);
} while (iNonZeroIdx > 0);
} else {
@ -519,12 +522,10 @@ void WelsWriteBlockResidualCabac (SMbCache* pMbCache, SMB* pCurMb, uint32_t iMb
}
// Counts the non-zero coefficients in a 2x2 block.
//
// pBlock: pointer to at least four int16_t coefficients; only elements 0..3
//         are read.
// Returns the number of those four coefficients that are non-zero (0..4).
//
// Each comparison yields 0 or 1, so the sum is the count without a loop. This
// is the single return path; the earlier loop-and-counter form left a second,
// unreachable return statement behind it.
int32_t WelsCalNonZeroCount2x2Block (int16_t* pBlock) {
  return (pBlock[0] != 0)
         + (pBlock[1] != 0)
         + (pBlock[2] != 0)
         + (pBlock[3] != 0);
}
int32_t WelsWriteMbResidualCabac (SWelsFuncPtrList* pFuncList, SSlice* pSlice, SMbCache* sMbCacheInfo, SMB* pCurMb,
SCabacCtx* pCabacCtx,
@ -618,6 +619,10 @@ int32_t WelsWriteMbResidualCabac (SWelsFuncPtrList* pFuncList, SSlice* pSlice, S
return 0;
}
} // anon ns.
namespace WelsEnc {
void WelsInitSliceCabac (sWelsEncCtx* pEncCtx, SSlice* pSlice) {
/* alignment needed */
SBitStringAux* pBs = pSlice->pSliceBsa;