Add ASM related functions for ME cross search

Add asm level functions

Add asm code for ME

Modify format

Add unit test for asm code.

Modify function name and format.

Remove unused comment

Modify targets file

Add macro protection for SSE41 function test

Modify according to review request.
Licai Guo 2014-03-28 10:22:11 +08:00
parent 94cabe10d5
commit 5c60e8f868
10 changed files with 980 additions and 14 deletions

View File

@@ -81,6 +81,7 @@
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_transpose_matrix.h; sourceTree = "<group>"; };
4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
4CE4432118B6FFA00017DF25 /* welsencTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = welsencTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -281,6 +282,7 @@
4CE446A918BC605C0017DF25 /* inc */ = {
isa = PBXGroup;
children = (
4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */,
4CE446AA18BC605C0017DF25 /* as264_common.h */,
4CE446AB18BC605C0017DF25 /* au_set.h */,
4CE446AC18BC605C0017DF25 /* bit_stream.h */,

View File

@@ -199,11 +199,24 @@ void LineFullSearch_c( void *pFunc, void *vpMe,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch );
#ifdef X86_ASM
extern "C"
{
uint32_t SampleSad8x8Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
uint32_t SampleSad16x16Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
}
void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch );
void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch );
#endif
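// Note (illustrative, not part of this commit): the SampleSad*Hor8 declarations
// above leave their parameters unnamed. A plain-C sketch of the contract they
// implement, with hypothetical names: accumulate the block SAD plus a
// precomputed per-offset cost over eight consecutive horizontal reference
// offsets, return the minimum, and report the winning offset index.
//
// uint32_t SampleSad8x8Hor8_sketch (uint8_t* pEnc, int32_t iEncStride,
//                                   uint8_t* pRef, int32_t iRefStride,
//                                   uint16_t* pBaseCost, int32_t* pIndexMinCost) {
//   uint32_t uiBestCost = 0xFFFFFFFF;
//   for (int32_t iOffset = 0; iOffset < 8; ++ iOffset) {
//     uint32_t uiCost = pBaseCost[iOffset];
//     for (int32_t y = 0; y < 8; ++ y)
//       for (int32_t x = 0; x < 8; ++ x) {
//         const int32_t kiDiff = pEnc[y * iEncStride + x] - pRef[y * iRefStride + x + iOffset];
//         uiCost += (kiDiff >= 0) ? kiDiff : -kiDiff;
//       }
//     if (uiCost < uiBestCost) {
//       uiBestCost = uiCost;
//       *pIndexMinCost = iOffset;
//     }
//   }
//   return uiBestCost;
// }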
void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList, SDqLayer* pCurLayer, SWelsME * pMe, const SSlice* pSlice);
// Feature Search Basics

View File

@@ -87,6 +87,7 @@
#define PARA_SET_TYPE_SUBSETSPS 1
#define PARA_SET_TYPE_PPS 2
#define MAX_VERTICAL_MV_RANGE 1024 //TODO: used to allocate enough memory for the transpose
#define MAX_FRAME_RATE 30 // maximal frame rate to support
#define MIN_FRAME_RATE 1 // minimal frame rate to support

View File

@@ -134,6 +134,7 @@ typedef int32_t (*PIntraPred16x16Combined3Func) (uint8_t*, int32_t, uint8_t*, in
typedef int32_t (*PIntraPred8x8Combined3Func) (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
uint8_t*, uint8_t*);
typedef uint32_t (*PSampleSadHor8Func)( uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t* );
typedef void (*PMotionSearchFunc) (SWelsFuncPtrList* pFuncList, void* pCurDqLayer, void* pMe,
void* pSlice);
typedef void (*PSearchMethodFunc) (SWelsFuncPtrList* pFuncList, void* pMe, void* pSlice, const int32_t kiEncStride, const int32_t kiRefStride);
@@ -202,14 +203,16 @@ struct TagWelsFuncPointerList {
PGetIntraPredFunc pfGetLumaI4x4Pred[I4_PRED_A];
PGetIntraPredFunc pfGetChromaPred[C_PRED_A];
PSampleSadHor8Func pfSampleSadHor8[2]; // 0: for 16x16 square; 1: for 8x8 square
PMotionSearchFunc
pfMotionSearch[BLOCK_STATIC_IDC_ALL]; //svc_encode_slice.c svc_mode_decision.c svc_enhance_layer_md.c svc_base_layer_md.c
PSearchMethodFunc pfSearchMethod[BLOCK_SIZE_ALL];
PCalculateSatdFunc pfCalculateSatd;
PCheckDirectionalMv pfCheckDirectionalMv;
PCalculateBlockFeatureOfFrame pfCalculateBlockFeatureOfFrame[2];//0 - for 8x8, 1 for 16x16
PCalculateSingleBlockFeature pfCalculateSingleBlockFeature[2];//0 - for 8x8, 1 for 16x16
PLineFullSearchFunc pfVerticalFullSearch;
PLineFullSearchFunc pfHorizontalFullSearch;
PCopyFunc pfCopy16x16Aligned; //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
PCopyFunc pfCopy16x16NotAligned; //md.c

View File

@@ -0,0 +1,54 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
#define WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
#include "typedefs.h"
namespace WelsSVCEnc {
#ifdef X86_ASM
extern "C"
{
void TransposeMatrixBlocksx16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
void TransposeMatrixBlock16x16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
void TransposeMatrixBlocksx8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
void TransposeMatrixBlock8x8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
}
#endif
typedef void (*PTransposeMatrixBlockFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
typedef void (*PTransposeMatrixBlocksFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
}// end of namespace declaration
#endif//WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
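// Note (illustrative, not part of this commit): a scalar model of the
// single-block transpose these routines compute; the Blocksx* variants repeat
// it for kiBlocksNum vertically stacked source blocks, writing the transposed
// blocks side by side in the destination.
//
// void TransposeMatrixBlock16x16_sketch (void* pDst, const int32_t kiDstStride,
//                                        void* pSrc, const int32_t kiSrcStride) {
//   uint8_t* pDstByte = (uint8_t*) pDst;
//   uint8_t* pSrcByte = (uint8_t*) pSrc;
//   for (int32_t i = 0; i < 16; ++ i)
//     for (int32_t j = 0; j < 16; ++ j)
//       pDstByte[j * kiDstStride + i] = pSrcByte[i * kiSrcStride + j];
// }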

View File

@@ -41,6 +41,7 @@
#include "cpu_core.h"
#include "ls_defines.h"
#include "svc_motion_estimate.h"
#include "wels_transpose_matrix.h"
namespace WelsSVCEnc {
@@ -65,8 +66,14 @@ void WelsInitMeFunc( SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
pFuncList->pfCheckDirectionalMv = CheckDirectionalMv;
//for cross search
pFuncList->pfVerticalFullSearch = LineFullSearch_c;
pFuncList->pfHorizontalFullSearch = LineFullSearch_c;
#if defined (X86_ASM)
if ( uiCpuFlag & WELS_CPU_SSE41 ) {
pFuncList->pfSampleSadHor8[0] = SampleSad8x8Hor8_sse41;
pFuncList->pfSampleSadHor8[1] = SampleSad16x16Hor8_sse41;
pFuncList->pfVerticalFullSearch = VerticalFullSearchUsingSSE41;
pFuncList->pfHorizontalFullSearch = HorizontalFullSearchUsingSSE41;
}
//for feature search
@@ -75,6 +82,7 @@ void WelsInitMeFunc( SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
//TODO: possibly special-case widths that are multiples of 8, so as to accelerate those cases?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
#endif
}
}
@@ -302,18 +310,147 @@ bool CheckDirectionalMvFalse(PSampleSadSatdCostFunc pSad, void * vpMe,
/////////////////////////
// Cross Search Basics
/////////////////////////
#if defined (X86_ASM)
void CalcMvdCostx8_c( uint16_t *pMvdCost, const int32_t kiStartMv, uint16_t* pMvdTable, const uint16_t kiFixedCost )
{
uint16_t *pBaseCost = pMvdCost;
const int32_t kiOffset = (kiStartMv<<2);
uint16_t *pMvd = pMvdTable+kiOffset;
for (int32_t i = 0; i < 8; ++ i) {
pBaseCost[i] = ((*pMvd) + kiFixedCost);
pMvd += 4;
}
}
void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch ) {
SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
SWelsME *pMe = static_cast<SWelsME *>(vpMe);
uint8_t* kpEncMb = pMe->pEncMb;
const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixY;
uint8_t* pRef = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiRefStride];
const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
const int32_t kiEdgeBlocks = kIsBlock16x16 ? 16 : 8;
PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
PTransposeMatrixBlockFunc TransposeMatrixBlock = kIsBlock16x16 ? TransposeMatrixBlock16x16_sse2 : TransposeMatrixBlock8x8_mmx;
PTransposeMatrixBlocksFunc TransposeMatrixBlocks= kIsBlock16x16 ? TransposeMatrixBlocksx16_sse2 : TransposeMatrixBlocksx8_mmx;
const int32_t kiDiff = kiMaxPos - kiMinPos;
const int32_t kiRowNum = WELS_ALIGN((kiDiff - kiEdgeBlocks + 1), kiEdgeBlocks);
const int32_t kiBlocksNum = kIsBlock16x16 ? (kiRowNum>>4) : (kiRowNum>>3);
int32_t iCountLoop8 = (kiRowNum-kiEdgeBlocks) >> 3;
const int32_t kiRemainingVectors = kiDiff - (iCountLoop8<<3);
const int32_t kiMatrixStride = MAX_VERTICAL_MV_RANGE;
ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixRef, 16, kiMatrixStride, 16 ); // transpose matrix result for ref
ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixEnc, 16, 16, 16 ); // transpose matrix result for enc
assert(kiRowNum <= kiMatrixStride); // ensure the stack matrices are large enough
TransposeMatrixBlock( &uiMatrixEnc[0][0], 16, kpEncMb, kiEncStride );
TransposeMatrixBlocks( &uiMatrixRef[0][0], kiMatrixStride, pRef, kiRefStride, kiBlocksNum );
ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
int32_t iTargetPos = kiMinPos;
int16_t iBestPos = pMe->sMv.iMvX;
uint32_t uiBestCost = pMe->uiSadCost;
uint32_t uiCostMin;
int32_t iIndexMinPos;
kpEncMb = &uiMatrixEnc[0][0];
pRef = &uiMatrixRef[0][0];
while(iCountLoop8 > 0) {
CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
uiCostMin = pSampleSadHor8( kpEncMb, 16, pRef, kiMatrixStride, uiBaseCost, &iIndexMinPos );
if (uiCostMin < uiBestCost) {
uiBestCost = uiCostMin;
iBestPos = iTargetPos+iIndexMinPos;
}
iTargetPos += 8;
pRef += 8;
-- iCountLoop8;
}
if (kiRemainingVectors > 0) {
kpEncMb = pMe->pEncMb;
pRef = &pMe->pColoRefMb[(iTargetPos - kiCurMeBlockPix)*kiRefStride];
while (iTargetPos < kiMaxPos) {
const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
if (uiSadCost < uiBestCost) {
uiBestCost = uiSadCost;
iBestPos = iTargetPos;
}
pRef += kiRefStride;
++iTargetPos;
}
}
if (uiBestCost < pMe->uiSadCost) {
SMVUnitXY sBestMv;
sBestMv.iMvX = 0;
sBestMv.iMvY = iBestPos - kiCurMeBlockPix;
UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY*kiRefStride], pMe );
}
}
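// Note (illustrative, not part of this commit): there is no dedicated vertical
// SAD kernel; the function above relies on a transpose identity. For an
// encode block E and reference R at vertical offset y,
//   sum_{r,c} |E[r][c] - R[y+r][c]| == sum_{c,r} |Et[c][r] - Rt[c][y+r]|
// where Et/Rt are the transposes, so the vertical scan over y becomes a
// horizontal scan over uiMatrixEnc/uiMatrixRef and the same pfSampleSadHor8
// kernel serves both search directions.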
void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch )
{
SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
SWelsME *pMe = static_cast<SWelsME *>(vpMe);
uint8_t *kpEncMb = pMe->pEncMb;
const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixX;
uint8_t *pRef = &pMe->pColoRefMb[kiMinPos - kiCurMeBlockPix];
const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
const int32_t kiNumVector = kiMaxPos - kiMinPos;
int32_t iCountLoop8 = kiNumVector >> 3;
const int32_t kiRemainingLoop8 = kiNumVector & 7;
int32_t iTargetPos = kiMinPos;
int16_t iBestPos = pMe->sMv.iMvX;
uint32_t uiBestCost = pMe->uiSadCost;
uint32_t uiCostMin;
int32_t iIndexMinPos;
while(iCountLoop8 > 0) {
CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
uiCostMin = pSampleSadHor8( kpEncMb, kiEncStride, pRef, kiRefStride, uiBaseCost, &iIndexMinPos );
if (uiCostMin < uiBestCost) {
uiBestCost = uiCostMin;
iBestPos = iTargetPos+iIndexMinPos;
}
iTargetPos += 8;
pRef += 8;
-- iCountLoop8;
}
if ( kiRemainingLoop8 > 0 ) {
while (iTargetPos < kiMaxPos) {
const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
if (uiSadCost < uiBestCost) {
uiBestCost = uiSadCost;
iBestPos = iTargetPos;
}
++pRef;
++iTargetPos;
}
}
if (uiBestCost < pMe->uiSadCost) {
SMVUnitXY sBestMv;
sBestMv.iMvX = iBestPos - kiCurMeBlockPix;
sBestMv.iMvY = 0;
UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY], pMe );
}
}
#endif
void LineFullSearch_c( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch ) {
SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
SWelsME *pMe = static_cast<SWelsME *>(vpMe);
@@ -346,8 +483,8 @@ void LineFullSearch_c( void *pFunc, void *vpMe,
void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList, SWelsME * pMe,
const SSlice* pSlice, const int32_t kiEncStride, const int32_t kiRefStride) {
PLineFullSearchFunc pfVerticalFullSearchFunc = pFuncList->pfVerticalFullSearch;
PLineFullSearchFunc pfHorizontalFullSearchFunc = pFuncList->pfHorizontalFullSearch;
const int32_t iCurMeBlockPixX = pMe->iCurMeBlockPixX;
const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
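// Note (illustrative, not part of this commit): MVD costs are tabulated in
// quarter-pel units, hence the <<2 above and in the searches:
//   uiCost = kiFixedMvd + pMvdTable[iPos << 2]; // full-pel position iPos
// CalcMvdCostx8_c steps its table pointer by 4 per candidate for the same reason.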

View File

@@ -0,0 +1,395 @@
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*************************************************************************/
%include "asm_inc.asm"
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE_8x8B_MMX 10
MMX_XSwap bw, %1, %2, %8
MMX_XSwap bw, %3, %4, %2
MMX_XSwap bw, %5, %6, %4
movq %6, %9
movq %10, %4
MMX_XSwap bw, %7, %6, %4
MMX_XSwap wd, %1, %3, %6
MMX_XSwap wd, %8, %2, %3
MMX_XSwap wd, %5, %7, %2
movq %7, %10
movq %10, %3
MMX_XSwap wd, %7, %4, %3
MMX_XSwap dq, %1, %5, %4
MMX_XSwap dq, %6, %2, %5
MMX_XSwap dq, %8, %7, %2
movq %7, %10
movq %10, %5
MMX_XSwap dq, %7, %3, %5
movq %3, %10
%endmacro
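; note (illustrative, not part of this commit): this is the classic three-round
; interleave transpose: byte-, word-, then dword-granularity unpacks double the
; element width and halve the row count each round, which is why the outputs
; land in the permuted register order noted above. one byte round in scalar
; terms, for 8-byte rows a and b:
;   lo[2*i]   = a[i],   lo[2*i+1] = b[i]     ; punpcklbw a, b
;   hi[2*i]   = a[4+i], hi[2*i+1] = b[4+i]   ; punpckhbw a, b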
;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
movq [%1], mm0 ; result of line 1, x8 bytes
movq [%1+%2], mm3 ; result of line 2
lea %1, [%1+2*%2]
movq [%1], mm5 ; result of line 3
movq [%1+%2], mm2 ; result of line 4
lea %1, [%1+2*%2]
movq [%1], mm7 ; result of line 5
movq [%1+%2], mm1 ; result of line 6
lea %1, [%1+2*%2]
movq [%1], mm6 ; result of line 7
movq [%1+%2], mm4 ; result of line 8
%endmacro
;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
movq [%1], mm0 ; result of line 1, x8 bytes
movq [%1+%2], mm3 ; result of line 2
lea %3, [%1+2*%2]
movq [%3], mm5 ; result of line 3
movq [%3+%2], mm2 ; result of line 4
lea %3, [%3+2*%2]
movq [%3], mm7 ; result of line 5
movq [%3+%2], mm1 ; result of line 6
lea %3, [%3+2*%2]
movq [%3], mm6 ; result of line 7
movq [%3+%2], mm4 ; result of line 8
%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
; for transpose 16x8
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
%macro TRANSPOSE_8x16B_SSE2 10
SSE2_XSawp bw, %1, %2, %8
SSE2_XSawp bw, %3, %4, %2
SSE2_XSawp bw, %5, %6, %4
movdqa %6, %9
movdqa %10, %4
SSE2_XSawp bw, %7, %6, %4
SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %10
movdqa %10, %3
SSE2_XSawp wd, %7, %4, %3
SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %10
movdqa %10, %5
SSE2_XSawp dq, %7, %3, %5
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %10
movdqa %10, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %10
%endmacro ; end of TRANSPOSE_8x16B_SSE2
%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
movq [%1], xmm4 ; result of line 1, x8 bytes
movq [%1+%2], xmm2 ; result of line 2
lea %1, [%1+2*%2]
movq [%1], xmm3 ; result of line 3
movq [%1+%2], xmm7 ; result of line 4
lea %1, [%1+2*%2]
movq [%1], xmm5 ; result of line 5
movq [%1+%2], xmm1 ; result of line 6
lea %1, [%1+2*%2]
movq [%1], xmm6 ; result of line 7
movq [%1+%2], xmm0 ; result of line 8
lea %1, [%1+2*%2]
movhpd [%1], xmm4 ; result of line 9
movhpd [%1+%2], xmm2 ; result of line 10
lea %1, [%1+2*%2]
movhpd [%1], xmm3 ; result of line 11
movhpd [%1+%2], xmm7 ; result of line 12
lea %1, [%1+2*%2]
movhpd [%1], xmm5 ; result of line 13
movhpd [%1+%2], xmm1 ; result of line 14
lea %1, [%1+2*%2]
movhpd [%1], xmm6 ; result of line 15
movhpd [%1+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2
%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
movq [%1], xmm4 ; result of line 1, x8 bytes
movq [%1+%2], xmm2 ; result of line 2
lea %3, [%1+2*%2]
movq [%3], xmm3 ; result of line 3
movq [%3+%2], xmm7 ; result of line 4
lea %3, [%3+2*%2]
movq [%3], xmm5 ; result of line 5
movq [%3+%2], xmm1 ; result of line 6
lea %3, [%3+2*%2]
movq [%3], xmm6 ; result of line 7
movq [%3+%2], xmm0 ; result of line 8
lea %3, [%3+2*%2]
movhpd [%3], xmm4 ; result of line 9
movhpd [%3+%2], xmm2 ; result of line 10
lea %3, [%3+2*%2]
movhpd [%3], xmm3 ; result of line 11
movhpd [%3+%2], xmm7 ; result of line 12
lea %3, [%3+2*%2]
movhpd [%3], xmm5 ; result of line 13
movhpd [%3+%2], xmm1 ; result of line 14
lea %3, [%3+2*%2]
movhpd [%3], xmm6 ; result of line 15
movhpd [%3+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
SECTION .text
WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
mov r4, r7
and r4, 0Fh
sub r7, 10h
sub r7, r4
lea r5, [r3+r3*2]
; top 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+r3*2]
movdqa xmm3, [r2+r5]
lea r2, [r2+r3*4]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+r3*2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
TRANSPOSE8x16_WRITE_SSE2 r0, r1
; bottom 8x16 block
lea r2, [r2+r3*4]
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+r3*2]
movdqa xmm3, [r2+r5]
lea r2, [r2+r3*4]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+r3*2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
mov r5, r1
sal r5, 4
sub r0, r5
lea r0, [r0+r1*2+8]
TRANSPOSE8x16_WRITE_SSE2 r0, r1
add r7, r4
add r7, 10h
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
push r5
push r6
%assign push_num 2
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
mov r5, r7
and r5, 0Fh
sub r7, 10h
sub r7, r5
TRANSPOSE_LOOP_SSE2:
; explicitly load the next loop's data
lea r6, [r2+r3*8]
push r4
%rep 8
mov r4, [r6]
mov r4, [r6+r3]
lea r6, [r6+r3*2]
%endrep
pop r4
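; note (illustrative, not part of this commit): the %rep block above issues
; dummy loads from [r6] and [r6+r3] purely to warm the cache with the next
; block's source rows; the loaded values are discarded, and the r4 loop
; counter is saved/restored around its use as scratch.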
; top 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm2, [r2]
movdqa xmm3, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
lea r2, [r2+r3*2]
; bottom 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm2, [r2]
movdqa xmm3, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
lea r2, [r2+r3*2]
lea r0, [r0+16]
dec r4
jg near TRANSPOSE_LOOP_SSE2
add r7, r5
add r7, 10h
POP_XMM
LOAD_5_PARA_POP
pop r6
pop r5
ret
WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
sub r7, 8
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
TRANSPOSE8x8_WRITE_MMX r0, r1
emms
add r7, 8
LOAD_4_PARA_POP
ret
WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
push r5
push r6
%assign push_num 2
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
sub r7, 8
lea r5, [r2+r3*8]
TRANSPOSE_BLOCKS_X8_LOOP_MMX:
; explicitly load the next loop's data
%rep 4
mov r6, [r5]
mov r6, [r5+r3]
lea r5, [r5+r3*2]
%endrep
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
lea r0, [r0+8]
lea r2, [r2+2*r3]
dec r4
jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
emms
add r7, 8
LOAD_5_PARA_POP
pop r6
pop r5
ret

View File

@@ -0,0 +1,225 @@
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*************************************************************************/
%include "asm_inc.asm"
SECTION .text
;**********************************************************************************************************************************
;
; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
; \note:
; src must be 16-byte aligned; alignment of ref is optional
; \return value:
; returns the minimal SAD cost; the corresponding offset index is written to index_min_cost
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
movdqa xmm0, [%1]
movdqu xmm1, [%2]
movdqu xmm2, [%2+8h]
movdqa xmm3, xmm1
movdqa xmm4, xmm2
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm3, xmm0, 5 ; 101 B
paddw xmm7, xmm3 ; accumulate cost
mpsadbw xmm2, xmm0, 2 ; 010 B
paddw xmm7, xmm2 ; accumulate cost
mpsadbw xmm4, xmm0, 7 ; 111 B
paddw xmm7, xmm4 ; accumulate cost
add %1, %3
add %2, %4
%endmacro ; end of SAD_16x16_LINE_SSE41
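; note (illustrative, not part of this commit): mpsadbw computes eight SADs of
; one 4-byte source group against eight successive reference offsets; the
; immediate selects which 4-byte groups are paired. the four invocations
; (000b, 101b, 010b, 111b, with the second ref load at +8) cover all four
; groups of a 16-byte row, so xmm7 accumulates, per line, the scalar model:
;   for (j = 0; j < 8; ++j)
;     for (x = 0; x < 16; ++x)
;       cost[j] += abs (enc[x] - ref[x + j]);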
%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
movdqa xmm0, [%1]
movdqu xmm1, [%2]
movdqu xmm2, [%2+8h]
movdqa xmm3, xmm1
movdqa xmm4, xmm2
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm3, xmm0, 5 ; 101 B
paddw xmm7, xmm3 ; accumulate cost
mpsadbw xmm2, xmm0, 2 ; 010 B
paddw xmm7, xmm2 ; accumulate cost
mpsadbw xmm4, xmm0, 7 ; 111 B
paddw xmm7, xmm4 ; accumulate cost
%endmacro ; end of SAD_16x16_LINE_SSE41E
WELS_EXTERN SampleSad16x16Hor8_sse41
;push ebx
;push esi
;mov eax, [esp+12] ; src
;mov ecx, [esp+16] ; stride_src
;mov ebx, [esp+20] ; ref
;mov edx, [esp+24] ; stride_ref
;mov esi, [esp+28] ; base_cost
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm7, xmm7
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41E r0, r2, r1, r3
pxor xmm0, xmm0
movdqa xmm6, xmm7
punpcklwd xmm6, xmm0
punpckhwd xmm7, xmm0
movdqa xmm5, [r4]
movdqa xmm4, xmm5
punpcklwd xmm4, xmm0
punpckhwd xmm5, xmm0
paddd xmm4, xmm6
paddd xmm5, xmm7
movdqa xmm3, xmm4
pminud xmm3, xmm5
pshufd xmm2, xmm3, 01001110B
pminud xmm2, xmm3
pshufd xmm3, xmm2, 10110001B
pminud xmm2, xmm3
movd retrd, xmm2
pcmpeqd xmm4, xmm2
movmskps r2d, xmm4
bsf r1d, r2d
jnz near WRITE_INDEX
pcmpeqd xmm5, xmm2
movmskps r2d, xmm5
bsf r1d, r2d
add r1d, 4
WRITE_INDEX:
mov [r5], r1d
POP_XMM
LOAD_6_PARA_POP
ret
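; note (illustrative, not part of this commit): 16 rows of 16-wide SADs can
; overflow 16-bit lanes, so the epilogue widens the eight word accumulators to
; dwords (punpcklwd/punpckhwd against zero), adds the widened base costs,
; reduces with pminud/pshufd, then recovers the winning lane with
; pcmpeqd + movmskps + bsf. scalar model:
;   min = total[0]; idx = 0;
;   for (j = 1; j < 8; ++j) if (total[j] < min) { min = total[j]; idx = j; }
;   *index_min_cost = idx; return min;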
;**********************************************************************************************************************************
;
; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
; \note:
; 16-byte alignment of src and ref is optional, since inter 8x8 blocks need not be aligned
; \return value:
; returns the minimal SAD cost; the corresponding offset index is written to index_min_cost
;
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
movdqu xmm0, [%1]
movdqu xmm1, [%2]
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm2, xmm0, 5 ; 101 B
paddw xmm7, xmm2 ; accumulate cost
add %1, %3
add %2, %4
%endmacro ; end of SAD_8x8_LINE_SSE41
%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
movdqu xmm0, [%1]
movdqu xmm1, [%2]
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm2, xmm0, 5 ; 101 B
paddw xmm7, xmm2 ; accumulate cost
%endmacro ; end of SAD_8x8_LINE_SSE41E
WELS_EXTERN SampleSad8x8Hor8_sse41
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm7, [r4] ; load base cost list
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41E r0, r2, r1, r3
phminposuw xmm0, xmm7 ; horizontal search for the minimal sad cost and its index
movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
mov r1d, retrd
and retrd, 0xFFFF
sar r1d, 16
mov [r5], r1d
POP_XMM
LOAD_6_PARA_POP
ret
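; note (illustrative, not part of this commit): the 8x8 case fits in 16-bit
; lanes (8 rows of 8-wide SADs plus base costs), so phminposuw does the whole
; reduction at once, packing the minimal unsigned word into bits 15:0 and its
; lane index into bits 18:16; the and/sar pair above splits them:
;   return packed & 0xFFFF; *index_min_cost = packed >> 16;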

View File

@@ -40,8 +40,10 @@ ENCODER_ASM_SRCS=\
$(ENCODER_SRCDIR)/core/x86/coeff.asm\
$(ENCODER_SRCDIR)/core/x86/dct.asm\
$(ENCODER_SRCDIR)/core/x86/intra_pred.asm\
$(ENCODER_SRCDIR)/core/x86/matrix_transpose.asm\
$(ENCODER_SRCDIR)/core/x86/memzero.asm\
$(ENCODER_SRCDIR)/core/x86/quant.asm\
$(ENCODER_SRCDIR)/core/x86/sample_sc.asm\
$(ENCODER_SRCDIR)/core/x86/score.asm\
ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.$(OBJ))

View File

@@ -5,6 +5,7 @@
#include "sample.h"
#include "svc_motion_estimate.h"
#include "wels_func_ptr_def.h"
#include "cpu.h"
using namespace WelsSVCEnc;
@@ -43,19 +44,21 @@ public:
m_iMaxSearchBlock = 16;
m_uiMvdTableSize = (1 + (648 << 1));
pMa = new CMemoryAlign(0);
m_pRefPic = static_cast<uint8_t *>
(pMa->WelsMalloc(m_iWidth*m_iHeight, "RefPic"));
ASSERT_TRUE( NULL != m_pRefPic );
m_pSrcBlock = static_cast<uint8_t *>
(pMa->WelsMalloc(m_iMaxSearchBlock*m_iMaxSearchBlock, "SrcBlock"));
ASSERT_TRUE( NULL != m_pSrcBlock );
m_pMvdCostTable=new uint16_t[52*m_uiMvdTableSize];
ASSERT_TRUE( NULL != m_pMvdCostTable );
}
virtual void TearDown() {
delete [] m_pMvdCostTable;
pMa->WelsFree( m_pRefPic, "RefPic");
pMa->WelsFree( m_pSrcBlock, "SrcBlock");
delete pMa;
}
public:
uint8_t *m_pRefPic;
@@ -66,6 +69,7 @@ public:
int32_t m_iWidth;
int32_t m_iHeight;
int32_t m_iMaxSearchBlock;
CMemoryAlign *pMa;
};
@@ -243,4 +247,134 @@ TEST_F(MotionEstimateTest, TestHorizontalSearch) {
ASSERT_TRUE(iTryTimes > 0);
//it is possible that the ref at a different position is identical, but that should have a low probability
}
}
#ifdef X86_ASM
TEST_F(MotionEstimateTest, TestVerticalSearch_SSE41)
{
const int32_t kiMaxBlock16Sad = 72000;//a rough number
SWelsFuncPtrList sFuncList;
SWelsME sMe;
srand((uint32_t)time(NULL));
const uint8_t kuiQp = rand()%52;
InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
SMVUnitXY sTargetMv;
WelsInitSampleSadFunc( &sFuncList, 0 );//test c functions
WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1);
uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
sMe.iCurMeBlockPixX = (m_iWidth/2);
sMe.iCurMeBlockPixY = (m_iHeight/2);
bool bDataGeneratorSucceed = false;
bool bFoundMatch = false;
int32_t iTryTimes=100;
sTargetMv.iMvX = 0;
sTargetMv.iMvY = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iHeight-INTPEL_NEEDED_MARGIN);
bDataGeneratorSucceed = false;
bFoundMatch = false;
while (!bFoundMatch && (iTryTimes--)>0) {
if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
continue;
bDataGeneratorSucceed = true;
CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
//clean the sMe status
sMe.uiBlockSize = rand()%5;
sMe.pEncMb = m_pSrcBlock;
sMe.pRefMb = pRefPicCenter;
sMe.pColoRefMb = pRefPicCenter;
sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX; //do the offset here
uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
VerticalFullSearchUsingSSE41 ( &sFuncList, &sMe,
pMvdCostY, pMvdCostX[ iCurMeBlockQpelPixX ],
m_iMaxSearchBlock, m_iWidth,
INTPEL_NEEDED_MARGIN,
m_iHeight-INTPEL_NEEDED_MARGIN, true );
//the final selection may be affected by the MVD cost, i.e. a smaller MvY may win
bFoundMatch = (sMe.sMv.iMvX==0
&&(sMe.sMv.iMvY==sTargetMv.iMvY||abs(sMe.sMv.iMvY)<abs(sTargetMv.iMvY)));
//printf("TestVerticalSearch Target: %d,%d\n", sTargetMv.iMvX, sTargetMv.iMvY);
}
if (bDataGeneratorSucceed) {
//if the data generator never succeeds, there is no point in checking iTryTimes
ASSERT_TRUE(iTryTimes > 0);
//it is possible that the ref at a different position is identical, but that should have a low probability
}
}
TEST_F(MotionEstimateTest, TestHorizontalSearch_SSE41)
{
const int32_t kiMaxBlock16Sad = 72000;//a rough number
SWelsFuncPtrList sFuncList;
SWelsME sMe;
srand((uint32_t)time(NULL));
const uint8_t kuiQp = rand()%52;
InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
SMVUnitXY sTargetMv;
WelsInitSampleSadFunc( &sFuncList, 0 );//test c functions
WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1);
uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
sMe.iCurMeBlockPixX = (m_iWidth/2);
sMe.iCurMeBlockPixY = (m_iHeight/2);
bool bDataGeneratorSucceed = false;
bool bFoundMatch = false;
int32_t iTryTimes=100;
sTargetMv.iMvX = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iWidth-INTPEL_NEEDED_MARGIN);
sTargetMv.iMvY = 0;
bDataGeneratorSucceed = false;
bFoundMatch = false;
while (!bFoundMatch && (iTryTimes--)>0) {
if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
continue;
bDataGeneratorSucceed = true;
CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
//clean the sMe status
sMe.uiBlockSize = rand()%5;
sMe.pEncMb = m_pSrcBlock;
sMe.pRefMb = pRefPicCenter;
sMe.pColoRefMb = pRefPicCenter;
sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX; //do the offset here
uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
HorizontalFullSearchUsingSSE41 ( &sFuncList, &sMe,
pMvdCostX, pMvdCostY[ iCurMeBlockQpelPixY ],
m_iMaxSearchBlock, m_iWidth,
INTPEL_NEEDED_MARGIN,
m_iWidth-INTPEL_NEEDED_MARGIN, false );
//the final selection may be affected by the MVD cost, i.e. a smaller MvX may win
bFoundMatch = (sMe.sMv.iMvY==0
&&(sMe.sMv.iMvX==sTargetMv.iMvX||abs(sMe.sMv.iMvX)<abs(sTargetMv.iMvX)));
//printf("TestHorizontalSearch Target: %d,%d\n", sTargetMv.iMvX, sTargetMv.iMvY);
}
if (bDataGeneratorSucceed) {
//if the data generator never succeeds, there is no point in checking iTryTimes
ASSERT_TRUE(iTryTimes > 0);
//it is possible that the ref at a different position is identical, but that should have a low probability
}
}
#endif
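Note: to exercise just the new cases, the gtest suite can be filtered by name, e.g. ./codec_unittest --gtest_filter='MotionEstimateTest.*SSE41' (the binary name here is an assumption; substitute the project's actual unit-test executable).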