Merge pull request #1973 from huili2/sub8

Extend several functions to support sub-8x8 block sizes (4x4, 8x4, 4x8), especially in the motion estimation (ME) part
This commit is contained in:
huili2 2015-06-02 14:44:06 +08:00
commit f76325edc7
14 changed files with 165 additions and 14 deletions

View File

@ -38,7 +38,9 @@
/****************************************************************************
* Copy functions
****************************************************************************/
void WelsCopy4x4 (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
void WelsCopy4x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
void WelsCopy8x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
void WelsCopy4x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
void WelsCopy8x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS);
void WelsCopy8x16_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS); //
void WelsCopy16x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS); //

View File

@ -41,8 +41,8 @@ int32_t WelsSampleSad16x16_c (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSad16x8_c (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSad8x16_c (uint8_t*, int32_t, uint8_t*, int32_t);
int32_t WelsSampleSad8x8_c (uint8_t*, int32_t, uint8_t*, int32_t);
//int32_t WelsSampleSad8x4( uint8_t *, int32_t, uint8_t *, int32_t );
//int32_t WelsSampleSad4x8( uint8_t *, int32_t, uint8_t *, int32_t );
int32_t WelsSampleSad8x4_c( uint8_t *, int32_t, uint8_t *, int32_t );
int32_t WelsSampleSad4x8_c( uint8_t *, int32_t, uint8_t *, int32_t );
int32_t WelsSampleSad4x4_c (uint8_t*, int32_t, uint8_t*, int32_t);
@ -52,6 +52,8 @@ void WelsSampleSadFour16x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSam
void WelsSampleSadFour8x16_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
void WelsSampleSadFour8x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
void WelsSampleSadFour4x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
void WelsSampleSadFour8x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
void WelsSampleSadFour4x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad);
#if defined(__cplusplus)
extern "C" {

View File

@ -45,7 +45,7 @@
/****************************************************************************
* Copy functions
****************************************************************************/
void WelsCopy4x4 (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
void WelsCopy4x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
const int32_t kiSrcStride2 = iStrideS << 1;
const int32_t kiSrcStride3 = iStrideS + kiSrcStride2;
const int32_t kiDstStride2 = iStrideD << 1;
@ -56,6 +56,14 @@ void WelsCopy4x4 (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrid
ST32 (pDst + kiDstStride2, LD32 (pSrc + kiSrcStride2));
ST32 (pDst + kiDstStride3, LD32 (pSrc + kiSrcStride3));
}
/*
 * Copies a block 8 bytes wide and 4 rows high from pSrc to pDst.
 * The work is delegated to WelsCopy4x4_c: first the left 4x4 half, then the
 * right 4x4 half that starts 4 bytes further along each row.
 *
 * pDst / iStrideD: destination block origin and its row stride.
 * pSrc / iStrideS: source block origin and its row stride.
 */
void WelsCopy8x4_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
  uint8_t* const pDstRightHalf = pDst + 4;
  uint8_t* const pSrcRightHalf = pSrc + 4;

  WelsCopy4x4_c (pDst, iStrideD, pSrc, iStrideS);
  WelsCopy4x4_c (pDstRightHalf, iStrideD, pSrcRightHalf, iStrideS);
}
/*
 * Copies a block 4 bytes wide and 8 rows high from pSrc to pDst.
 * The work is delegated to WelsCopy4x4_c: first the upper 4x4 half, then the
 * lower 4x4 half that starts four rows (4 * stride) below the origin.
 *
 * pDst / iStrideD: destination block origin and its row stride.
 * pSrc / iStrideS: source block origin and its row stride.
 */
void WelsCopy4x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
  uint8_t* const pDstLowerHalf = pDst + (iStrideD << 2);
  uint8_t* const pSrcLowerHalf = pSrc + (iStrideS << 2);

  WelsCopy4x4_c (pDst, iStrideD, pSrc, iStrideS);
  WelsCopy4x4_c (pDstLowerHalf, iStrideD, pSrcLowerHalf, iStrideS);
}
void WelsCopy8x8_c (uint8_t* pDst, int32_t iStrideD, uint8_t* pSrc, int32_t iStrideS) {
int32_t i;
for (i = 0; i < 4; i++) {

View File

@ -59,6 +59,20 @@ int32_t WelsSampleSad4x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSampl
return iSadSum;
}
/*
 * SAD of an 8-wide, 4-high block, computed as the sum of WelsSampleSad4x4_c
 * over the left 4x4 half and the right 4x4 half (4 bytes further along the row).
 *
 * pSample1 / iStride1: first block origin and its row stride.
 * pSample2 / iStride2: second block origin and its row stride.
 * Returns the sum of the two 4x4 results.
 */
int32_t WelsSampleSad8x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
  const int32_t kiLeftSad  = WelsSampleSad4x4_c (pSample1, iStride1, pSample2, iStride2);
  const int32_t kiRightSad = WelsSampleSad4x4_c (pSample1 + 4, iStride1, pSample2 + 4, iStride2);

  return kiLeftSad + kiRightSad;
}
/*
 * SAD of a 4-wide, 8-high block, computed as the sum of WelsSampleSad4x4_c
 * over the upper 4x4 half and the lower 4x4 half (four rows further down).
 *
 * pSample1 / iStride1: first block origin and its row stride.
 * pSample2 / iStride2: second block origin and its row stride.
 * Returns the sum of the two 4x4 results.
 */
int32_t WelsSampleSad4x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
  uint8_t* const pLower1 = pSample1 + (iStride1 << 2);
  uint8_t* const pLower2 = pSample2 + (iStride2 << 2);

  const int32_t kiUpperSad = WelsSampleSad4x4_c (pSample1, iStride1, pSample2, iStride2);
  const int32_t kiLowerSad = WelsSampleSad4x4_c (pLower1, iStride1, pLower2, iStride2);

  return kiUpperSad + kiLowerSad;
}
int32_t WelsSampleSad8x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
int32_t iSadSum = 0;
int32_t i = 0;
@ -137,3 +151,15 @@ void WelsSampleSadFour4x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSamp
* (pSad + 2) = WelsSampleSad4x4_c (iSample1, iStride1, (iSample2 - 1), iStride2);
* (pSad + 3) = WelsSampleSad4x4_c (iSample1, iStride1, (iSample2 + 1), iStride2);
}
/*
 * Computes four 8x4 SADs of the block at iSample1 against the four direct
 * neighbours of the block at iSample2 and stores them in pSad:
 *   pSad[0] - iSample2 moved one row up    (iSample2 - iStride2)
 *   pSad[1] - iSample2 moved one row down  (iSample2 + iStride2)
 *   pSad[2] - iSample2 moved one byte left  (iSample2 - 1)
 *   pSad[3] - iSample2 moved one byte right (iSample2 + 1)
 * pSad must have room for four int32_t values.
 */
void WelsSampleSadFour8x4_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) {
  uint8_t* const pRefUp    = iSample2 - iStride2;
  uint8_t* const pRefDown  = iSample2 + iStride2;
  uint8_t* const pRefLeft  = iSample2 - 1;
  uint8_t* const pRefRight = iSample2 + 1;

  pSad[0] = WelsSampleSad8x4_c (iSample1, iStride1, pRefUp, iStride2);
  pSad[1] = WelsSampleSad8x4_c (iSample1, iStride1, pRefDown, iStride2);
  pSad[2] = WelsSampleSad8x4_c (iSample1, iStride1, pRefLeft, iStride2);
  pSad[3] = WelsSampleSad8x4_c (iSample1, iStride1, pRefRight, iStride2);
}
/*
 * Computes four 4x8 SADs of the block at iSample1 against the four direct
 * neighbours of the block at iSample2 and stores them in pSad:
 *   pSad[0] - iSample2 moved one row up    (iSample2 - iStride2)
 *   pSad[1] - iSample2 moved one row down  (iSample2 + iStride2)
 *   pSad[2] - iSample2 moved one byte left  (iSample2 - 1)
 *   pSad[3] - iSample2 moved one byte right (iSample2 + 1)
 * pSad must have room for four int32_t values.
 */
void WelsSampleSadFour4x8_c (uint8_t* iSample1, int32_t iStride1, uint8_t* iSample2, int32_t iStride2, int32_t* pSad) {
  uint8_t* const pRefUp    = iSample2 - iStride2;
  uint8_t* const pRefDown  = iSample2 + iStride2;
  uint8_t* const pRefLeft  = iSample2 - 1;
  uint8_t* const pRefRight = iSample2 + 1;

  pSad[0] = WelsSampleSad4x8_c (iSample1, iStride1, pRefUp, iStride2);
  pSad[1] = WelsSampleSad4x8_c (iSample1, iStride1, pRefDown, iStride2);
  pSad[2] = WelsSampleSad4x8_c (iSample1, iStride1, pRefLeft, iStride2);
  pSad[3] = WelsSampleSad4x8_c (iSample1, iStride1, pRefRight, iStride2);
}

View File

@ -79,7 +79,7 @@ ALIGNED_DECLARE (int8_t, iIntraPredMode[48], 16);
// must follow with iNonZeroCoeffCount!
int32_t iSadCost[4]; //avail 1; unavail 0
SMVUnitXY sMbMvp[MB_BLOCK8x8_NUM];// for write bs
SMVUnitXY sMbMvp[MB_BLOCK4x4_NUM];// for write bs
//for residual decoding (recovery) at the side of Encoder
int16_t* pCoeffLevel; // tmep

View File

@ -109,6 +109,9 @@ struct {
SWelsME sMe8x8[4];
SWelsME sMe16x8[2];
SWelsME sMe8x16[2];
SWelsME sMe4x4[4][4];
SWelsME sMe8x4[4][2];
SWelsME sMe4x8[4][2];
// SMVUnitXY i_mvbs[MB_BLOCK8x8_NUM]; //scaled MVB
} sMe;

View File

@ -142,9 +142,9 @@ BLOCK_16x8 = 1,
BLOCK_8x16 = 2,
BLOCK_8x8 = 3,
BLOCK_4x4 = 4,
// BLOCK_8x4 = 5,
// BLOCK_4x8 = 6,
BLOCK_SIZE_ALL = 5
BLOCK_8x4 = 5,
BLOCK_4x8 = 6,
BLOCK_SIZE_ALL = 7
};
typedef enum {

View File

@ -155,7 +155,7 @@ typedef void (*PCalculateBlockFeatureOfFrame) (uint8_t* pRef, const int32_t kiWi
typedef int32_t (*PCalculateSingleBlockFeature) (uint8_t* pRef, const int32_t kiRefStride);
typedef void (*PUpdateFMESwitch) (SDqLayer* pCurLayer);
#define MAX_BLOCK_TYPE 5 // prev 7
#define MAX_BLOCK_TYPE BLOCK_SIZE_ALL
typedef struct TagSampleDealingFunc {
PSampleSadSatdCostFunc pfSampleSad[MAX_BLOCK_TYPE];
PSampleSadSatdCostFunc pfSampleSatd[MAX_BLOCK_TYPE];
@ -235,8 +235,10 @@ struct TagWelsFuncPointerList {
PCopyFunc pfCopy8x8Aligned; //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c md.c
PCopyFunc pfCopy16x8NotAligned; //for MeRefineFracPixel 16x8 based
PCopyFunc pfCopy8x16Aligned; //for MeRefineFracPixel 8x16 based
PCopyFunc pfCopy4x4; //not sure if aligned or not, need further tune
PCopyFunc pfCopy8x4; //not sure if aligned or not, need further tune
PCopyFunc pfCopy4x8; //not sure if aligned or not, need further tune
//svc_encode_mb.c encode_mb_aux.c
PDctFunc pfDctT4;
PDctFunc pfDctFourT4;

View File

@ -467,7 +467,9 @@ void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
pFuncList->pfCopy16x16NotAligned = WelsCopy16x16_c;
pFuncList->pfCopy16x8NotAligned = WelsCopy16x8_c;
pFuncList->pfCopy8x16Aligned = WelsCopy8x16_c;
pFuncList->pfCopy4x4 = WelsCopy4x4_c;
pFuncList->pfCopy8x4 = WelsCopy8x4_c;
pFuncList->pfCopy4x8 = WelsCopy4x8_c;
pFuncList->pfQuantizationHadamard2x2 = WelsHadamardQuant2x2_c;
pFuncList->pfQuantizationHadamard2x2Skip = WelsHadamardQuant2x2Skip_c;
pFuncList->pfTransformHadamard4x4Dc = WelsHadamardT4Dc_c;

View File

@ -2982,7 +2982,9 @@ void PreprocessSliceCoding (sWelsEncCtx* pCtx) {
pFuncList->pfSearchMethod[BLOCK_16x8] =
pFuncList->pfSearchMethod[BLOCK_8x16] =
pFuncList->pfSearchMethod[BLOCK_8x8] =
pFuncList->pfSearchMethod[BLOCK_4x4] = WelsDiamondSearch;
pFuncList->pfSearchMethod[BLOCK_4x4] =
pFuncList->pfSearchMethod[BLOCK_8x4] =
pFuncList->pfSearchMethod[BLOCK_4x8] = WelsDiamondSearch;
pFuncList->pfFirstIntraMode = WelsMdFirstIntraMode;
pFuncList->sSampleDealingFuncs.pfMeCost = pCtx->pFuncList->sSampleDealingFuncs.pfSampleSatd;
pFuncList->pfSetScrollingMv = SetScrollingMvToMdNull;

View File

@ -95,6 +95,21 @@ int32_t WelsSampleSatd4x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSamp
return ((iSatdSum + 1) >> 1);
}
/*
 * SATD of an 8-wide, 4-high block, computed as the sum of WelsSampleSatd4x4_c
 * over the left 4x4 half and the right 4x4 half (4 bytes further along the row).
 *
 * pSample1 / iStride1: first block origin and its row stride.
 * pSample2 / iStride2: second block origin and its row stride.
 * Returns the sum of the two 4x4 results.
 */
int32_t WelsSampleSatd8x4_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
  const int32_t kiLeftSatd  = WelsSampleSatd4x4_c (pSample1, iStride1, pSample2, iStride2);
  const int32_t kiRightSatd = WelsSampleSatd4x4_c (pSample1 + 4, iStride1, pSample2 + 4, iStride2);

  return kiLeftSatd + kiRightSatd;
}
/*
 * SATD of a 4-wide, 8-high block, computed as the sum of WelsSampleSatd4x4_c
 * over the upper 4x4 half and the lower 4x4 half (four rows further down).
 *
 * pSample1 / iStride1: first block origin and its row stride.
 * pSample2 / iStride2: second block origin and its row stride.
 * Returns the sum of the two 4x4 results.
 */
int32_t WelsSampleSatd4x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
  uint8_t* const pLower1 = pSample1 + (iStride1 << 2);
  uint8_t* const pLower2 = pSample2 + (iStride2 << 2);

  const int32_t kiUpperSatd = WelsSampleSatd4x4_c (pSample1, iStride1, pSample2, iStride2);
  const int32_t kiLowerSatd = WelsSampleSatd4x4_c (pLower1, iStride1, pLower2, iStride2);

  return kiUpperSatd + kiLowerSatd;
}
int32_t WelsSampleSatd8x8_c (uint8_t* pSample1, int32_t iStride1, uint8_t* pSample2, int32_t iStride2) {
int32_t iSatdSum = 0;
@ -325,6 +340,8 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x16 ] = WelsSampleSad8x16_c;
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x8 ] = WelsSampleSad8x8_c;
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x4 ] = WelsSampleSad4x4_c;
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_8x4 ] = WelsSampleSad8x4_c;
pFuncList->sSampleDealingFuncs.pfSampleSad[BLOCK_4x8 ] = WelsSampleSad4x8_c;
//pfSampleSatd init
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_16x16] = WelsSampleSatd16x16_c;
@ -332,12 +349,16 @@ void WelsInitSampleSadFunc (SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag) {
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x16 ] = WelsSampleSatd8x16_c;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x8 ] = WelsSampleSatd8x8_c;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x4 ] = WelsSampleSatd4x4_c;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_8x4 ] = WelsSampleSatd8x4_c;
pFuncList->sSampleDealingFuncs.pfSampleSatd[BLOCK_4x8 ] = WelsSampleSatd4x8_c;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x16] = WelsSampleSadFour16x16_c;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_16x8] = WelsSampleSadFour16x8_c;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x16] = WelsSampleSadFour8x16_c;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x8] = WelsSampleSadFour8x8_c;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x4] = WelsSampleSadFour4x4_c;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_8x4] = WelsSampleSadFour8x4_c;
pFuncList->sSampleDealingFuncs.pfSample4Sad[BLOCK_4x8] = WelsSampleSadFour4x8_c;
pFuncList->sSampleDealingFuncs.pfIntra4x4Combined3Satd = NULL;
pFuncList->sSampleDealingFuncs.pfIntra8x8Combined3Satd = NULL;

View File

@ -174,7 +174,7 @@ void WelsEncRecI4x4Y (sWelsEncCtx* pEncCtx, SMB* pCurMb, SMbCache* pMbCache, uin
pFuncList->pfDequantization4x4 (pResI4x4, g_kuiDequantCoeff[uiQp]);
pFuncList->pfIDctT4 (pPredI4x4, iRecStride, pBestPred, 4, pResI4x4);
} else
WelsCopy4x4 (pPredI4x4, iRecStride, pBestPred, 4);
pFuncList->pfCopy4x4 (pPredI4x4, iRecStride, pBestPred, 4);
}
void WelsEncInterY (SWelsFuncPtrList* pFuncList, SMB* pCurMb, SMbCache* pMbCache) {

View File

@ -241,7 +241,9 @@ TEST(EncodeMbAuxTest, function) { \
EXPECT_EQ(ref_dst[i*iDStride+j], dst[i*iDStride+j]); \
}
GENERATE_UT_FOR_COPY (4, 4, WelsCopy4x4);
GENERATE_UT_FOR_COPY (4, 4, WelsCopy4x4_c);
GENERATE_UT_FOR_COPY (8, 4, WelsCopy8x4_c);
GENERATE_UT_FOR_COPY (4, 8, WelsCopy4x8_c);
GENERATE_UT_FOR_COPY (8, 8, WelsCopy8x8_c);
GENERATE_UT_FOR_COPY (8, 16, WelsCopy8x16_c);
GENERATE_UT_FOR_COPY (16, 8, WelsCopy16x8_c);

View File

@ -188,6 +188,42 @@ TEST_F (SadSatdCFuncTest, WelsSampleSad4x4_c) {
EXPECT_EQ (WelsSampleSad4x4_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB, m_iStrideB), iSumSad);
}
// Checks WelsSampleSad8x4_c against a straightforward per-pixel reference:
// the sum of absolute differences over 4 rows of 8 pixels.
TEST_F (SadSatdCFuncTest, WelsSampleSad8x4_c) {
  // Randomize the first four rows of both buffers (source A first, then B).
  for (int k = 0; k < (m_iStrideA << 2); ++k)
    m_pPixSrcA[k] = rand() % 256;
  for (int k = 0; k < (m_iStrideB << 2); ++k)
    m_pPixSrcB[k] = rand() % 256;

  // Reference SAD, addressing each row directly from the buffer origin.
  int32_t iExpectedSad = 0;
  for (int iRow = 0; iRow < 4; ++iRow) {
    const uint8_t* pRowA = m_pPixSrcA + iRow * m_iStrideA;
    const uint8_t* pRowB = m_pPixSrcB + iRow * m_iStrideB;
    for (int iCol = 0; iCol < 8; ++iCol)
      iExpectedSad += abs (pRowA[iCol] - pRowB[iCol]);
  }

  EXPECT_EQ (WelsSampleSad8x4_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB, m_iStrideB), iExpectedSad);
}
// Checks WelsSampleSad4x8_c against a straightforward per-pixel reference:
// the sum of absolute differences over 8 rows of 4 pixels.
TEST_F (SadSatdCFuncTest, WelsSampleSad4x8_c) {
  // A 4x8 block covers 8 rows, so 8 rows (stride << 3) of each buffer must be
  // randomized. Filling only 4 rows would leave the lower half of the block
  // compared on whatever data happened to be in the buffers.
  for (int i = 0; i < (m_iStrideA << 3); i++)
    m_pPixSrcA[i] = rand() % 256;
  for (int i = 0; i < (m_iStrideB << 3); i++)
    m_pPixSrcB[i] = rand() % 256;

  uint8_t* pPixA = m_pPixSrcA;
  uint8_t* pPixB = m_pPixSrcB;

  // Reference SAD: walk the block row by row, 4 pixels per row.
  int32_t iSumSad = 0;
  for (int i = 0; i < 8; i++) {
    for (int j = 0; j < 4; j++)
      iSumSad += abs (pPixA[j] - pPixB[j]);

    pPixA += m_iStrideA;
    pPixB += m_iStrideB;
  }

  EXPECT_EQ (WelsSampleSad4x8_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB, m_iStrideB), iSumSad);
}
TEST_F (SadSatdCFuncTest, WelsSampleSad8x8_c) {
for (int i = 0; i < (m_iStrideA << 3); i++)
m_pPixSrcA[i] = rand() % 256;
@ -444,6 +480,51 @@ TEST_F (SadSatdCFuncTest, WelsSampleSadFour4x4_c) {
EXPECT_EQ (m_pSad[0] + m_pSad[1] + m_pSad[2] + m_pSad[3], iSumSad);
}
// Checks WelsSampleSadFour8x4_c: the sum of the four SADs it reports must equal
// a per-pixel reference summed over the left, right, upper and lower neighbours
// of the reference block.
TEST_F (SadSatdCFuncTest, WelsSampleSadFour8x4_c) {
  // Randomize eight rows of both buffers (source A first, then B).
  for (int k = 0; k < (m_iStrideA << 3); ++k)
    m_pPixSrcA[k] = rand() % 256;
  for (int k = 0; k < (m_iStrideB << 3); ++k)
    m_pPixSrcB[k] = rand() % 256;

  // The reference block starts one row into B so that the row above it exists.
  uint8_t* const pRefCenter = m_pPixSrcB + m_iStrideB;

  int32_t iExpectedTotal = 0;
  for (int iRow = 0; iRow < 4; ++iRow) {
    const uint8_t* pRowA = m_pPixSrcA + iRow * m_iStrideA;
    const uint8_t* pRowB = pRefCenter + iRow * m_iStrideB;
    for (int iCol = 0; iCol < 8; ++iCol) {
      iExpectedTotal += abs (pRowA[iCol] - pRowB[iCol - 1]);           // left neighbour
      iExpectedTotal += abs (pRowA[iCol] - pRowB[iCol + 1]);           // right neighbour
      iExpectedTotal += abs (pRowA[iCol] - pRowB[iCol - m_iStrideB]);  // upper neighbour
      iExpectedTotal += abs (pRowA[iCol] - pRowB[iCol + m_iStrideB]);  // lower neighbour
    }
  }

  WelsSampleSadFour8x4_c (m_pPixSrcA, m_iStrideA, pRefCenter, m_iStrideB, m_pSad);
  EXPECT_EQ (m_pSad[0] + m_pSad[1] + m_pSad[2] + m_pSad[3], iExpectedTotal);
}
// Checks WelsSampleSadFour4x8_c: the sum of the four SADs it reports must equal
// a per-pixel reference summed over the left, right, upper and lower neighbours
// of the reference block.
TEST_F (SadSatdCFuncTest, WelsSampleSadFour4x8_c) {
  // The current block occupies rows 0..7 of A.
  for (int i = 0; i < (m_iStrideA << 3); i++)
    m_pPixSrcA[i] = rand() % 256;
  // The reference block starts at row 1 of B and is 8 rows high; with the
  // one-row-up and one-row-down neighbours the comparison touches rows 0..9,
  // so 10 rows are randomized here. Filling only 8 rows would leave rows 8 and
  // 9 as whatever data happened to be in the buffer.
  // NOTE(review): assumes the fixture allocates at least 10 rows for
  // m_pPixSrcB; the reference loop below already reads that far.
  for (int i = 0; i < (m_iStrideB * 10); i++)
    m_pPixSrcB[i] = rand() % 256;

  uint8_t* pPixA = m_pPixSrcA;
  uint8_t* pPixB = m_pPixSrcB + m_iStrideB;

  // Reference: accumulate the SAD against all four neighbours in one pass.
  int32_t iSumSad = 0;
  for (int i = 0; i < 8; i++) {
    for (int j = 0; j < 4; j++) {
      iSumSad += abs (pPixA[j] - pPixB[j - 1]);           // left neighbour
      iSumSad += abs (pPixA[j] - pPixB[j + 1]);           // right neighbour
      iSumSad += abs (pPixA[j] - pPixB[j - m_iStrideB]);  // upper neighbour
      iSumSad += abs (pPixA[j] - pPixB[j + m_iStrideB]);  // lower neighbour
    }
    pPixA += m_iStrideA;
    pPixB += m_iStrideB;
  }

  WelsSampleSadFour4x8_c (m_pPixSrcA, m_iStrideA, m_pPixSrcB + m_iStrideB, m_iStrideB, m_pSad);
  EXPECT_EQ (m_pSad[0] + m_pSad[1] + m_pSad[2] + m_pSad[3], iSumSad);
}
class SadSatdAssemblyFuncTest : public testing::Test {
public: