Add ASM related functions for ME cross search

Add asm level functions

Add asm code for ME

Modify format

Add unit test for asm code.

Modify function name and format.

Remove unused comment

Modify targets file

Add macro protection for SSE41 function test

Modify according to review request.
Licai Guo 2014-03-28 10:22:11 +08:00
parent 94cabe10d5
commit 5c60e8f868
10 changed files with 980 additions and 14 deletions

View File

@@ -81,6 +81,7 @@
4C34066918C57D0400DFA14A /* memory_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = memory_neon.S; sourceTree = "<group>"; };
4C34066A18C57D0400DFA14A /* pixel_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = pixel_neon.S; sourceTree = "<group>"; };
4C34066B18C57D0400DFA14A /* reconstruct_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = reconstruct_neon.S; sourceTree = "<group>"; };
4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = wels_transpose_matrix.h; sourceTree = "<group>"; };
4CE4431118B6FFA00017DF25 /* libwelsenc.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libwelsenc.a; sourceTree = BUILT_PRODUCTS_DIR; };
4CE4431418B6FFA00017DF25 /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; };
4CE4432118B6FFA00017DF25 /* welsencTests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = welsencTests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -281,6 +282,7 @@
4CE446A918BC605C0017DF25 /* inc */ = {
isa = PBXGroup;
children = (
4CDBFB9D18E5068D0025A767 /* wels_transpose_matrix.h */,
4CE446AA18BC605C0017DF25 /* as264_common.h */,
4CE446AB18BC605C0017DF25 /* au_set.h */,
4CE446AC18BC605C0017DF25 /* bit_stream.h */,

View File

@@ -199,11 +199,24 @@ void LineFullSearch_c( void *pFunc, void *vpMe,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch );
#ifdef X86_ASM
extern "C"
{
uint32_t SampleSad8x8Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
uint32_t SampleSad16x16Hor8_sse41 (uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t*);
}
void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch );
void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch );
#endif
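// Note (illustrative, not part of this commit): the SampleSad*Hor8 declarations
// above leave their parameters unnamed. A plain-C sketch of the contract they
// implement, with hypothetical names: accumulate the block SAD plus a
// precomputed per-offset cost over eight consecutive horizontal reference
// offsets, return the minimum, and report the winning offset index.
//
// uint32_t SampleSad8x8Hor8_sketch (uint8_t* pEnc, int32_t iEncStride,
//                                   uint8_t* pRef, int32_t iRefStride,
//                                   uint16_t* pBaseCost, int32_t* pIndexMinCost) {
//   uint32_t uiBestCost = 0xFFFFFFFF;
//   for (int32_t iOffset = 0; iOffset < 8; ++ iOffset) {
//     uint32_t uiCost = pBaseCost[iOffset];
//     for (int32_t y = 0; y < 8; ++ y)
//       for (int32_t x = 0; x < 8; ++ x) {
//         const int32_t kiDiff = pEnc[y * iEncStride + x] - pRef[y * iRefStride + x + iOffset];
//         uiCost += (kiDiff >= 0) ? kiDiff : -kiDiff;
//       }
//     if (uiCost < uiBestCost) {
//       uiBestCost = uiCost;
//       *pIndexMinCost = iOffset;
//     }
//   }
//   return uiBestCost;
// }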
void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList, SDqLayer* pCurLayer, SWelsME * pMe, const SSlice* pSlice);
// Feature Search Basics

View File

@@ -87,6 +87,7 @@
#define PARA_SET_TYPE_SUBSETSPS 1
#define PARA_SET_TYPE_PPS 2
#define MAX_VERTICAL_MV_RANGE 1024 //TODO: used to allocate enough memory for the transpose
#define MAX_FRAME_RATE 30 // maximal frame rate to support
#define MIN_FRAME_RATE 1 // minimal frame rate to support

View File

@@ -134,6 +134,7 @@ typedef int32_t (*PIntraPred16x16Combined3Func) (uint8_t*, int32_t, uint8_t*, in
typedef int32_t (*PIntraPred8x8Combined3Func) (uint8_t*, int32_t, uint8_t*, int32_t, int32_t*, int32_t, uint8_t*,
uint8_t*, uint8_t*);
typedef uint32_t (*PSampleSadHor8Func)( uint8_t*, int32_t, uint8_t*, int32_t, uint16_t*, int32_t* );
typedef void (*PMotionSearchFunc) (SWelsFuncPtrList* pFuncList, void* pCurDqLayer, void* pMe,
void* pSlice);
typedef void (*PSearchMethodFunc) (SWelsFuncPtrList* pFuncList, void* pMe, void* pSlice, const int32_t kiEncStride, const int32_t kiRefStride);
@@ -202,14 +203,16 @@ struct TagWelsFuncPointerList {
PGetIntraPredFunc pfGetLumaI4x4Pred[I4_PRED_A];
PGetIntraPredFunc pfGetChromaPred[C_PRED_A];
PSampleSadHor8Func pfSampleSadHor8[2]; // 0: for 16x16 square; 1: for 8x8 square
PMotionSearchFunc
pfMotionSearch[BLOCK_STATIC_IDC_ALL]; //svc_encode_slice.c svc_mode_decision.c svc_enhance_layer_md.c svc_base_layer_md.c
PSearchMethodFunc pfSearchMethod[BLOCK_SIZE_ALL];
PCalculateSatdFunc pfCalculateSatd;
PCheckDirectionalMv pfCheckDirectionalMv;
PCalculateBlockFeatureOfFrame pfCalculateBlockFeatureOfFrame[2];//0 - for 8x8, 1 for 16x16
PCalculateSingleBlockFeature pfCalculateSingleBlockFeature[2];//0 - for 8x8, 1 for 16x16
PLineFullSearchFunc pfVerticalFullSearch;
PLineFullSearchFunc pfHorizontalFullSearch;
PCopyFunc pfCopy16x16Aligned; //svc_encode_slice.c svc_mode_decision.c svc_base_layer_md.c
PCopyFunc pfCopy16x16NotAligned; //md.c

View File

@@ -0,0 +1,54 @@
/*!
* \copy
* Copyright (c) 2013, Cisco Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
#define WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
#include "typedefs.h"
namespace WelsSVCEnc {
#ifdef X86_ASM
extern "C"
{
void TransposeMatrixBlocksx16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
void TransposeMatrixBlock16x16_sse2( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
void TransposeMatrixBlocksx8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
void TransposeMatrixBlock8x8_mmx( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
}
#endif
typedef void (*PTransposeMatrixBlockFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride );
typedef void (*PTransposeMatrixBlocksFunc)( void *pDst, const int32_t kiDstStride, void *pSrc, const int32_t kiSrcStride, const int32_t kiBlocksNum );
}// end of namespace declaration
#endif//WELS_RUBY_ENCODER_TRANSPOSE_MATRIX_H__
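// Note (illustrative, not part of this commit): a scalar model of the
// single-block transpose these routines compute; the Blocksx* variants repeat
// it for kiBlocksNum vertically stacked source blocks, writing the transposed
// blocks side by side in the destination.
//
// void TransposeMatrixBlock16x16_sketch (void* pDst, const int32_t kiDstStride,
//                                        void* pSrc, const int32_t kiSrcStride) {
//   uint8_t* pDstByte = (uint8_t*) pDst;
//   uint8_t* pSrcByte = (uint8_t*) pSrc;
//   for (int32_t i = 0; i < 16; ++ i)
//     for (int32_t j = 0; j < 16; ++ j)
//       pDstByte[j * kiDstStride + i] = pSrcByte[i * kiSrcStride + j];
// }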

View File

@@ -41,6 +41,7 @@
#include "cpu_core.h"
#include "ls_defines.h"
#include "svc_motion_estimate.h"
#include "wels_transpose_matrix.h"
namespace WelsSVCEnc {
@@ -65,8 +66,14 @@ void WelsInitMeFunc( SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
pFuncList->pfCheckDirectionalMv = CheckDirectionalMv;
//for cross search
pFuncList->pfVerticalFullSearch = LineFullSearch_c;
pFuncList->pfHorizontalFullSearch = LineFullSearch_c;
#if defined (X86_ASM)
if ( uiCpuFlag & WELS_CPU_SSE41 ) {
pFuncList->pfSampleSadHor8[0] = SampleSad8x8Hor8_sse41;
pFuncList->pfSampleSadHor8[1] = SampleSad16x16Hor8_sse41;
pFuncList->pfVerticalFullSearch = VerticalFullSearchUsingSSE41;
pFuncList->pfHorizontalFullSearch = HorizontalFullSearchUsingSSE41;
}
//for feature search
@@ -75,6 +82,7 @@ void WelsInitMeFunc( SWelsFuncPtrList* pFuncList, uint32_t uiCpuFlag, bool bScre
//TODO: possibly special-case widths that are multiples of 8, so as to accelerate those cases?
pFuncList->pfCalculateSingleBlockFeature[0] = SumOf8x8SingleBlock_c;
pFuncList->pfCalculateSingleBlockFeature[1] = SumOf16x16SingleBlock_c;
#endif
}
}
@@ -302,18 +310,147 @@ bool CheckDirectionalMvFalse(PSampleSadSatdCostFunc pSad, void * vpMe,
/////////////////////////
// Cross Search Basics
/////////////////////////
#if defined (X86_ASM)
void CalcMvdCostx8_c( uint16_t *pMvdCost, const int32_t kiStartMv, uint16_t* pMvdTable, const uint16_t kiFixedCost )
{
uint16_t *pBaseCost = pMvdCost;
const int32_t kiOffset = (kiStartMv<<2);
uint16_t *pMvd = pMvdTable+kiOffset;
for (int32_t i = 0; i < 8; ++ i) {
pBaseCost[i] = ((*pMvd) + kiFixedCost);
pMvd += 4;
}
}
void VerticalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch ) {
SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
SWelsME *pMe = static_cast<SWelsME *>(vpMe);
uint8_t* kpEncMb = pMe->pEncMb;
const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixY;
uint8_t* pRef = &pMe->pColoRefMb[(kiMinPos - kiCurMeBlockPix)*kiRefStride];
const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
const int32_t kiEdgeBlocks = kIsBlock16x16 ? 16 : 8;
PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
PTransposeMatrixBlockFunc TransposeMatrixBlock = kIsBlock16x16 ? TransposeMatrixBlock16x16_sse2 : TransposeMatrixBlock8x8_mmx;
PTransposeMatrixBlocksFunc TransposeMatrixBlocks= kIsBlock16x16 ? TransposeMatrixBlocksx16_sse2 : TransposeMatrixBlocksx8_mmx;
const int32_t kiDiff = kiMaxPos - kiMinPos;
const int32_t kiRowNum = WELS_ALIGN((kiDiff - kiEdgeBlocks + 1), kiEdgeBlocks);
const int32_t kiBlocksNum = kIsBlock16x16 ? (kiRowNum>>4) : (kiRowNum>>3);
int32_t iCountLoop8 = (kiRowNum-kiEdgeBlocks) >> 3;
const int32_t kiRemainingVectors = kiDiff - (iCountLoop8<<3);
const int32_t kiMatrixStride = MAX_VERTICAL_MV_RANGE;
ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixRef, 16, kiMatrixStride, 16 ); // transpose matrix result for ref
ENFORCE_STACK_ALIGN_2D( uint8_t, uiMatrixEnc, 16, 16, 16 ); // transpose matrix result for enc
assert(kiRowNum <= kiMatrixStride); // ensure the stack matrices are large enough
TransposeMatrixBlock( &uiMatrixEnc[0][0], 16, kpEncMb, kiEncStride );
TransposeMatrixBlocks( &uiMatrixRef[0][0], kiMatrixStride, pRef, kiRefStride, kiBlocksNum );
ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
int32_t iTargetPos = kiMinPos;
int16_t iBestPos = pMe->sMv.iMvX;
uint32_t uiBestCost = pMe->uiSadCost;
uint32_t uiCostMin;
int32_t iIndexMinPos;
kpEncMb = &uiMatrixEnc[0][0];
pRef = &uiMatrixRef[0][0];
while(iCountLoop8 > 0) {
CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
uiCostMin = pSampleSadHor8( kpEncMb, 16, pRef, kiMatrixStride, uiBaseCost, &iIndexMinPos );
if (uiCostMin < uiBestCost) {
uiBestCost = uiCostMin;
iBestPos = iTargetPos+iIndexMinPos;
}
iTargetPos += 8;
pRef += 8;
-- iCountLoop8;
}
if (kiRemainingVectors > 0) {
kpEncMb = pMe->pEncMb;
pRef = &pMe->pColoRefMb[(iTargetPos - kiCurMeBlockPix)*kiRefStride];
while (iTargetPos < kiMaxPos) {
const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
if (uiSadCost < uiBestCost) {
uiBestCost = uiSadCost;
iBestPos = iTargetPos;
}
pRef += kiRefStride;
++iTargetPos;
}
}
if (uiBestCost < pMe->uiSadCost) {
SMVUnitXY sBestMv;
sBestMv.iMvX = 0;
sBestMv.iMvY = iBestPos - kiCurMeBlockPix;
UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY*kiRefStride], pMe );
}
}
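// Note (illustrative, not part of this commit): there is no dedicated vertical
// SAD kernel; the function above relies on a transpose identity. For an
// encode block E and reference R at vertical offset y,
//   sum_{r,c} |E[r][c] - R[y+r][c]| == sum_{c,r} |Et[c][r] - Rt[c][y+r]|
// where Et/Rt are the transposes, so the vertical scan over y becomes a
// horizontal scan over uiMatrixEnc/uiMatrixRef and the same pfSampleSadHor8
// kernel serves both search directions.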
void HorizontalFullSearchUsingSSE41( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch )
{
SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
SWelsME *pMe = static_cast<SWelsME *>(vpMe);
uint8_t *kpEncMb = pMe->pEncMb;
const int32_t kiCurMeBlockPix = pMe->iCurMeBlockPixX;
uint8_t *pRef = &pMe->pColoRefMb[kiMinPos - kiCurMeBlockPix];
const int32_t kIsBlock16x16 = pMe->uiBlockSize == BLOCK_16x16;
PSampleSadHor8Func pSampleSadHor8 = pFuncList->pfSampleSadHor8[kIsBlock16x16];
PSampleSadSatdCostFunc pSad = pFuncList->sSampleDealingFuncs.pfSampleSad[pMe->uiBlockSize];
ENFORCE_STACK_ALIGN_1D( uint16_t, uiBaseCost, 8, 16 );
const int32_t kiNumVector = kiMaxPos - kiMinPos;
int32_t iCountLoop8 = kiNumVector >> 3;
const int32_t kiRemainingLoop8 = kiNumVector & 7;
int32_t iTargetPos = kiMinPos;
int16_t iBestPos = pMe->sMv.iMvX;
uint32_t uiBestCost = pMe->uiSadCost;
uint32_t uiCostMin;
int32_t iIndexMinPos;
while(iCountLoop8 > 0) {
CalcMvdCostx8_c(uiBaseCost, iTargetPos, pMvdTable, kiFixedMvd);
uiCostMin = pSampleSadHor8( kpEncMb, kiEncStride, pRef, kiRefStride, uiBaseCost, &iIndexMinPos );
if (uiCostMin < uiBestCost) {
uiBestCost = uiCostMin;
iBestPos = iTargetPos+iIndexMinPos;
}
iTargetPos += 8;
pRef += 8;
-- iCountLoop8;
}
if ( kiRemainingLoop8 > 0 ) {
while (iTargetPos < kiMaxPos) {
const uint16_t pMvdCost = pMvdTable[iTargetPos<<2];
uint32_t uiSadCost = pSad( kpEncMb, kiEncStride, pRef, kiRefStride ) + (kiFixedMvd + pMvdCost);
if (uiSadCost < uiBestCost) {
uiBestCost = uiSadCost;
iBestPos = iTargetPos;
}
++pRef;
++iTargetPos;
}
}
if (uiBestCost < pMe->uiSadCost) {
SMVUnitXY sBestMv;
sBestMv.iMvX = iBestPos - kiCurMeBlockPix;
sBestMv.iMvY = 0;
UpdateMeResults( sBestMv, uiBestCost, &pMe->pColoRefMb[sBestMv.iMvY], pMe );
}
}
#endif
void LineFullSearch_c( void *pFunc, void *vpMe,
uint16_t* pMvdTable, const int32_t kiFixedMvd,
const int32_t kiEncStride, const int32_t kiRefStride,
const int32_t kiMinPos, const int32_t kiMaxPos,
const bool bVerticalSearch ) {
SWelsFuncPtrList *pFuncList = static_cast<SWelsFuncPtrList *>(pFunc);
SWelsME *pMe = static_cast<SWelsME *>(vpMe);
@@ -346,8 +483,8 @@ void LineFullSearch_c( void *pFunc, void *vpMe,
void WelsMotionCrossSearch(SWelsFuncPtrList *pFuncList, SWelsME * pMe,
const SSlice* pSlice, const int32_t kiEncStride, const int32_t kiRefStride) {
PLineFullSearchFunc pfVerticalFullSearchFunc = pFuncList->pfVerticalFullSearch;
PLineFullSearchFunc pfHorizontalFullSearchFunc = pFuncList->pfHorizontalFullSearch;
const int32_t iCurMeBlockPixX = pMe->iCurMeBlockPixX;
const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
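// Note (illustrative, not part of this commit): MVD costs are tabulated in
// quarter-pel units, hence the <<2 above and in the searches:
//   uiCost = kiFixedMvd + pMvdTable[iPos << 2]; // full-pel position iPos
// CalcMvdCostx8_c steps its table pointer by 4 per candidate for the same reason.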

View File

@@ -0,0 +1,395 @@
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*************************************************************************/
%include "asm_inc.asm"
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE_8x8B_MMX 10
MMX_XSwap bw, %1, %2, %8
MMX_XSwap bw, %3, %4, %2
MMX_XSwap bw, %5, %6, %4
movq %6, %9
movq %10, %4
MMX_XSwap bw, %7, %6, %4
MMX_XSwap wd, %1, %3, %6
MMX_XSwap wd, %8, %2, %3
MMX_XSwap wd, %5, %7, %2
movq %7, %10
movq %10, %3
MMX_XSwap wd, %7, %4, %3
MMX_XSwap dq, %1, %5, %4
MMX_XSwap dq, %6, %2, %5
MMX_XSwap dq, %8, %7, %2
movq %7, %10
movq %10, %5
MMX_XSwap dq, %7, %3, %5
movq %3, %10
%endmacro
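; note (illustrative, not part of this commit): this is the classic three-round
; interleave transpose: byte-, word-, then dword-granularity unpacks double the
; element width and halve the row count each round, which is why the outputs
; land in the permuted register order noted above. one byte round in scalar
; terms, for 8-byte rows a and b:
;   lo[2*i]   = a[i],   lo[2*i+1] = b[i]     ; punpcklbw a, b
;   hi[2*i]   = a[4+i], hi[2*i+1] = b[4+i]   ; punpckhbw a, b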
;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
movq [%1], mm0 ; result of line 1, x8 bytes
movq [%1+%2], mm3 ; result of line 2
lea %1, [%1+2*%2]
movq [%1], mm5 ; result of line 3
movq [%1+%2], mm2 ; result of line 4
lea %1, [%1+2*%2]
movq [%1], mm7 ; result of line 5
movq [%1+%2], mm1 ; result of line 6
lea %1, [%1+2*%2]
movq [%1], mm6 ; result of line 7
movq [%1+%2], mm4 ; result of line 8
%endmacro
;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
movq [%1], mm0 ; result of line 1, x8 bytes
movq [%1+%2], mm3 ; result of line 2
lea %3, [%1+2*%2]
movq [%3], mm5 ; result of line 3
movq [%3+%2], mm2 ; result of line 4
lea %3, [%3+2*%2]
movq [%3], mm7 ; result of line 5
movq [%3+%2], mm1 ; result of line 6
lea %3, [%3+2*%2]
movq [%3], mm6 ; result of line 7
movq [%3+%2], mm4 ; result of line 8
%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX
; for transpose 16x8
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
%macro TRANSPOSE_8x16B_SSE2 10
SSE2_XSawp bw, %1, %2, %8
SSE2_XSawp bw, %3, %4, %2
SSE2_XSawp bw, %5, %6, %4
movdqa %6, %9
movdqa %10, %4
SSE2_XSawp bw, %7, %6, %4
SSE2_XSawp wd, %1, %3, %6
SSE2_XSawp wd, %8, %2, %3
SSE2_XSawp wd, %5, %7, %2
movdqa %7, %10
movdqa %10, %3
SSE2_XSawp wd, %7, %4, %3
SSE2_XSawp dq, %1, %5, %4
SSE2_XSawp dq, %6, %2, %5
SSE2_XSawp dq, %8, %7, %2
movdqa %7, %10
movdqa %10, %5
SSE2_XSawp dq, %7, %3, %5
SSE2_XSawp qdq, %1, %8, %3
SSE2_XSawp qdq, %4, %2, %8
SSE2_XSawp qdq, %6, %7, %2
movdqa %7, %10
movdqa %10, %1
SSE2_XSawp qdq, %7, %5, %1
movdqa %5, %10
%endmacro ; end of TRANSPOSE_8x16B_SSE2
%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
movq [%1], xmm4 ; result of line 1, x8 bytes
movq [%1+%2], xmm2 ; result of line 2
lea %1, [%1+2*%2]
movq [%1], xmm3 ; result of line 3
movq [%1+%2], xmm7 ; result of line 4
lea %1, [%1+2*%2]
movq [%1], xmm5 ; result of line 5
movq [%1+%2], xmm1 ; result of line 6
lea %1, [%1+2*%2]
movq [%1], xmm6 ; result of line 7
movq [%1+%2], xmm0 ; result of line 8
lea %1, [%1+2*%2]
movhpd [%1], xmm4 ; result of line 9
movhpd [%1+%2], xmm2 ; result of line 10
lea %1, [%1+2*%2]
movhpd [%1], xmm3 ; result of line 11
movhpd [%1+%2], xmm7 ; result of line 12
lea %1, [%1+2*%2]
movhpd [%1], xmm5 ; result of line 13
movhpd [%1+%2], xmm1 ; result of line 14
lea %1, [%1+2*%2]
movhpd [%1], xmm6 ; result of line 15
movhpd [%1+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2
%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
movq [%1], xmm4 ; result of line 1, x8 bytes
movq [%1+%2], xmm2 ; result of line 2
lea %3, [%1+2*%2]
movq [%3], xmm3 ; result of line 3
movq [%3+%2], xmm7 ; result of line 4
lea %3, [%3+2*%2]
movq [%3], xmm5 ; result of line 5
movq [%3+%2], xmm1 ; result of line 6
lea %3, [%3+2*%2]
movq [%3], xmm6 ; result of line 7
movq [%3+%2], xmm0 ; result of line 8
lea %3, [%3+2*%2]
movhpd [%3], xmm4 ; result of line 9
movhpd [%3+%2], xmm2 ; result of line 10
lea %3, [%3+2*%2]
movhpd [%3], xmm3 ; result of line 11
movhpd [%3+%2], xmm7 ; result of line 12
lea %3, [%3+2*%2]
movhpd [%3], xmm5 ; result of line 13
movhpd [%3+%2], xmm1 ; result of line 14
lea %3, [%3+2*%2]
movhpd [%3], xmm6 ; result of line 15
movhpd [%3+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2
SECTION .text
WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
push r4
push r5
%assign push_num 2
LOAD_4_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
mov r4, r7
and r4, 0Fh
sub r7, 10h
sub r7, r4
lea r5, [r3+r3*2]
; top 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+r3*2]
movdqa xmm3, [r2+r5]
lea r2, [r2+r3*4]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+r3*2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
TRANSPOSE8x16_WRITE_SSE2 r0, r1
; bottom 8x16 block
lea r2, [r2+r3*4]
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
movdqa xmm2, [r2+r3*2]
movdqa xmm3, [r2+r5]
lea r2, [r2+r3*4]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
movdqa xmm6, [r2+r3*2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]
mov r5, r1
sal r5, 4
sub r0, r5
lea r0, [r0+r1*2+8]
TRANSPOSE8x16_WRITE_SSE2 r0, r1
add r7, r4
add r7, 10h
POP_XMM
LOAD_4_PARA_POP
pop r5
pop r4
ret
WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
push r5
push r6
%assign push_num 2
LOAD_5_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
mov r5, r7
and r5, 0Fh
sub r7, 10h
sub r7, r5
TRANSPOSE_LOOP_SSE2:
; explicitly load the next loop's data
lea r6, [r2+r3*8]
push r4
%rep 8
mov r4, [r6]
mov r4, [r6+r3]
lea r6, [r6+r3*2]
%endrep
pop r4
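; note (illustrative, not part of this commit): the %rep block above issues
; dummy loads from [r6] and [r6+r3] purely to warm the cache with the next
; block's source rows; the loaded values are discarded, and the r4 loop
; counter is saved/restored around its use as scratch.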
; top 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm2, [r2]
movdqa xmm3, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
lea r2, [r2+r3*2]
; bottom 8x16 block
movdqa xmm0, [r2]
movdqa xmm1, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm2, [r2]
movdqa xmm3, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm4, [r2]
movdqa xmm5, [r2+r3]
lea r2, [r2+r3*2]
movdqa xmm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
lea r2, [r2+r3*2]
lea r0, [r0+16]
dec r4
jg near TRANSPOSE_LOOP_SSE2
add r7, r5
add r7, 10h
POP_XMM
LOAD_5_PARA_POP
pop r6
pop r5
ret
WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
%assign push_num 0
LOAD_4_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
sub r7, 8
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
TRANSPOSE8x8_WRITE_MMX r0, r1
emms
add r7, 8
LOAD_4_PARA_POP
ret
WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
push r5
push r6
%assign push_num 2
LOAD_5_PARA
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
SIGN_EXTENSION r4, r4d
sub r7, 8
lea r5, [r2+r3*8]
TRANSPOSE_BLOCKS_X8_LOOP_MMX:
; explicitly load the next loop's data
%rep 4
mov r6, [r5]
mov r6, [r5+r3]
lea r5, [r5+r3*2]
%endrep
movq mm0, [r2]
movq mm1, [r2+r3]
lea r2, [r2+2*r3]
movq mm2, [r2]
movq mm3, [r2+r3]
lea r2, [r2+2*r3]
movq mm4, [r2]
movq mm5, [r2+r3]
lea r2, [r2+2*r3]
movq mm6, [r2]
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]
TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
lea r0, [r0+8]
lea r2, [r2+2*r3]
dec r4
jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX
emms
add r7, 8
LOAD_5_PARA_POP
pop r6
pop r5
ret

View File

@@ -0,0 +1,225 @@
;*!
;* \copy
;* Copyright (c) 2009-2013, Cisco Systems
;* All rights reserved.
;*
;* Redistribution and use in source and binary forms, with or without
;* modification, are permitted provided that the following conditions
;* are met:
;*
;* * Redistributions of source code must retain the above copyright
;* notice, this list of conditions and the following disclaimer.
;*
;* * Redistributions in binary form must reproduce the above copyright
;* notice, this list of conditions and the following disclaimer in
;* the documentation and/or other materials provided with the
;* distribution.
;*
;* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
;* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
;* COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
;* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
;* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
;* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
;* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
;* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*************************************************************************/
%include "asm_inc.asm"
SECTION .text
;**********************************************************************************************************************************
;
; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
; \note:
; src must be 16-byte aligned; alignment of ref is optional
; \return value:
; returns the minimal SAD cost; the corresponding offset index is written to index_min_cost
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
movdqa xmm0, [%1]
movdqu xmm1, [%2]
movdqu xmm2, [%2+8h]
movdqa xmm3, xmm1
movdqa xmm4, xmm2
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm3, xmm0, 5 ; 101 B
paddw xmm7, xmm3 ; accumulate cost
mpsadbw xmm2, xmm0, 2 ; 010 B
paddw xmm7, xmm2 ; accumulate cost
mpsadbw xmm4, xmm0, 7 ; 111 B
paddw xmm7, xmm4 ; accumulate cost
add %1, %3
add %2, %4
%endmacro ; end of SAD_16x16_LINE_SSE41
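; note (illustrative, not part of this commit): mpsadbw computes eight SADs of
; one 4-byte source group against eight successive reference offsets; the
; immediate selects which 4-byte groups are paired. the four invocations
; (000b, 101b, 010b, 111b, with the second ref load at +8) cover all four
; groups of a 16-byte row, so xmm7 accumulates, per line, the scalar model:
;   for (j = 0; j < 8; ++j)
;     for (x = 0; x < 16; ++x)
;       cost[j] += abs (enc[x] - ref[x + j]);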
%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
movdqa xmm0, [%1]
movdqu xmm1, [%2]
movdqu xmm2, [%2+8h]
movdqa xmm3, xmm1
movdqa xmm4, xmm2
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm3, xmm0, 5 ; 101 B
paddw xmm7, xmm3 ; accumulate cost
mpsadbw xmm2, xmm0, 2 ; 010 B
paddw xmm7, xmm2 ; accumulate cost
mpsadbw xmm4, xmm0, 7 ; 111 B
paddw xmm7, xmm4 ; accumulate cost
%endmacro ; end of SAD_16x16_LINE_SSE41E
WELS_EXTERN SampleSad16x16Hor8_sse41
;push ebx
;push esi
;mov eax, [esp+12] ; src
;mov ecx, [esp+16] ; stride_src
;mov ebx, [esp+20] ; ref
;mov edx, [esp+24] ; stride_ref
;mov esi, [esp+28] ; base_cost
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
pxor xmm7, xmm7
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41 r0, r2, r1, r3
SAD_16x16_LINE_SSE41E r0, r2, r1, r3
pxor xmm0, xmm0
movdqa xmm6, xmm7
punpcklwd xmm6, xmm0
punpckhwd xmm7, xmm0
movdqa xmm5, [r4]
movdqa xmm4, xmm5
punpcklwd xmm4, xmm0
punpckhwd xmm5, xmm0
paddd xmm4, xmm6
paddd xmm5, xmm7
movdqa xmm3, xmm4
pminud xmm3, xmm5
pshufd xmm2, xmm3, 01001110B
pminud xmm2, xmm3
pshufd xmm3, xmm2, 10110001B
pminud xmm2, xmm3
movd retrd, xmm2
pcmpeqd xmm4, xmm2
movmskps r2d, xmm4
bsf r1d, r2d
jnz near WRITE_INDEX
pcmpeqd xmm5, xmm2
movmskps r2d, xmm5
bsf r1d, r2d
add r1d, 4
WRITE_INDEX:
mov [r5], r1d
POP_XMM
LOAD_6_PARA_POP
ret
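; note (illustrative, not part of this commit): 16 rows of 16-wide SADs can
; overflow 16-bit lanes, so the epilogue widens the eight word accumulators to
; dwords (punpcklwd/punpckhwd against zero), adds the widened base costs,
; reduces with pminud/pshufd, then recovers the winning lane with
; pcmpeqd + movmskps + bsf. scalar model:
;   min = total[0]; idx = 0;
;   for (j = 1; j < 8; ++j) if (total[j] < min) { min = total[j]; idx = j; }
;   *index_min_cost = idx; return min;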
;**********************************************************************************************************************************
;
; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
; \note:
; 16-byte alignment of src and ref is optional, since inter 8x8 blocks need not be aligned
; \return value:
; returns the minimal SAD cost; the corresponding offset index is written to index_min_cost
;
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 store sad costs
%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
movdqu xmm0, [%1]
movdqu xmm1, [%2]
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm2, xmm0, 5 ; 101 B
paddw xmm7, xmm2 ; accumulate cost
add %1, %3
add %2, %4
%endmacro ; end of SAD_8x8_LINE_SSE41
%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
movdqu xmm0, [%1]
movdqu xmm1, [%2]
movdqa xmm2, xmm1
mpsadbw xmm1, xmm0, 0 ; 000 B
paddw xmm7, xmm1 ; accumulate cost
mpsadbw xmm2, xmm0, 5 ; 101 B
paddw xmm7, xmm2 ; accumulate cost
%endmacro ; end of SAD_8x8_LINE_SSE41E
WELS_EXTERN SampleSad8x8Hor8_sse41
%assign push_num 0
LOAD_6_PARA
PUSH_XMM 8
SIGN_EXTENSION r1, r1d
SIGN_EXTENSION r3, r3d
movdqa xmm7, [r4] ; load base cost list
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41 r0, r2, r1, r3
SAD_8x8_LINE_SSE41E r0, r2, r1, r3
phminposuw xmm0, xmm7 ; horizontal search for the minimal sad cost and its index
movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
mov r1d, retrd
and retrd, 0xFFFF
sar r1d, 16
mov [r5], r1d
POP_XMM
LOAD_6_PARA_POP
ret
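; note (illustrative, not part of this commit): the 8x8 case fits in 16-bit
; lanes (8 rows of 8-wide SADs plus base costs), so phminposuw does the whole
; reduction at once, packing the minimal unsigned word into bits 15:0 and its
; lane index into bits 18:16; the and/sar pair above splits them:
;   return packed & 0xFFFF; *index_min_cost = packed >> 16;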

View File

@@ -40,8 +40,10 @@ ENCODER_ASM_SRCS=\
$(ENCODER_SRCDIR)/core/x86/coeff.asm\
$(ENCODER_SRCDIR)/core/x86/dct.asm\
$(ENCODER_SRCDIR)/core/x86/intra_pred.asm\
$(ENCODER_SRCDIR)/core/x86/matrix_transpose.asm\
$(ENCODER_SRCDIR)/core/x86/memzero.asm\
$(ENCODER_SRCDIR)/core/x86/quant.asm\
$(ENCODER_SRCDIR)/core/x86/sample_sc.asm\
$(ENCODER_SRCDIR)/core/x86/score.asm\
ENCODER_OBJS += $(ENCODER_ASM_SRCS:.asm=.$(OBJ))

View File

@@ -5,6 +5,7 @@
#include "sample.h"
#include "svc_motion_estimate.h"
#include "wels_func_ptr_def.h"
#include "cpu.h"
using namespace WelsSVCEnc;
@@ -43,19 +44,21 @@ public:
m_iMaxSearchBlock = 16;
m_uiMvdTableSize = (1 + (648 << 1));
pMa = new CMemoryAlign(0);
m_pRefPic = static_cast<uint8_t *>
(pMa->WelsMalloc(m_iWidth*m_iHeight, "RefPic"));
ASSERT_TRUE( NULL != m_pRefPic );
m_pSrcBlock = static_cast<uint8_t *>
(pMa->WelsMalloc(m_iMaxSearchBlock*m_iMaxSearchBlock, "SrcBlock"));
ASSERT_TRUE( NULL != m_pSrcBlock );
m_pMvdCostTable=new uint16_t[52*m_uiMvdTableSize];
ASSERT_TRUE( NULL != m_pMvdCostTable );
}
virtual void TearDown() {
delete [] m_pMvdCostTable;
pMa->WelsFree( m_pRefPic, "RefPic");
pMa->WelsFree( m_pSrcBlock, "SrcBlock");
delete pMa;
}
public:
uint8_t *m_pRefPic;
@@ -66,6 +69,7 @@ public:
int32_t m_iWidth;
int32_t m_iHeight;
int32_t m_iMaxSearchBlock;
CMemoryAlign *pMa;
};
@@ -243,4 +247,134 @@ TEST_F(MotionEstimateTest, TestHorizontalSearch) {
ASSERT_TRUE(iTryTimes > 0);
//it is possible that the ref at a different position is identical, but that should have a low probability
}
}
#ifdef X86_ASM
TEST_F(MotionEstimateTest, TestVerticalSearch_SSE41)
{
const int32_t kiMaxBlock16Sad = 72000;//a rough number
SWelsFuncPtrList sFuncList;
SWelsME sMe;
srand((uint32_t)time(NULL));
const uint8_t kuiQp = rand()%52;
InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
SMVUnitXY sTargetMv;
WelsInitSampleSadFunc( &sFuncList, 0 );//test c functions
WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1);
uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
sMe.iCurMeBlockPixX = (m_iWidth/2);
sMe.iCurMeBlockPixY = (m_iHeight/2);
bool bDataGeneratorSucceed = false;
bool bFoundMatch = false;
int32_t iTryTimes=100;
sTargetMv.iMvX = 0;
sTargetMv.iMvY = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iHeight-INTPEL_NEEDED_MARGIN);
bDataGeneratorSucceed = false;
bFoundMatch = false;
while (!bFoundMatch && (iTryTimes--)>0) {
if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
continue;
bDataGeneratorSucceed = true;
CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
//clean the sMe status
sMe.uiBlockSize = rand()%5;
sMe.pEncMb = m_pSrcBlock;
sMe.pRefMb = pRefPicCenter;
sMe.pColoRefMb = pRefPicCenter;
sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX; //do the offset here
uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
VerticalFullSearchUsingSSE41 ( &sFuncList, &sMe,
pMvdCostY, pMvdCostX[ iCurMeBlockQpelPixX ],
m_iMaxSearchBlock, m_iWidth,
INTPEL_NEEDED_MARGIN,
m_iHeight-INTPEL_NEEDED_MARGIN, true );
//the final selection may be affected by the MVD cost, i.e. a smaller MvY may win
bFoundMatch = (sMe.sMv.iMvX==0
&&(sMe.sMv.iMvY==sTargetMv.iMvY||abs(sMe.sMv.iMvY)<abs(sTargetMv.iMvY)));
//printf("TestVerticalSearch Target: %d,%d\n", sTargetMv.iMvX, sTargetMv.iMvY);
}
if (bDataGeneratorSucceed) {
//if the data generator never succeeds, there is no point in checking iTryTimes
ASSERT_TRUE(iTryTimes > 0);
//it is possible that the ref at a different position is identical, but that should have a low probability
}
}
TEST_F(MotionEstimateTest, TestHorizontalSearch_SSE41)
{
const int32_t kiMaxBlock16Sad = 72000;//a rough number
SWelsFuncPtrList sFuncList;
SWelsME sMe;
srand((uint32_t)time(NULL));
const uint8_t kuiQp = rand()%52;
InitMe(kuiQp, 648, m_uiMvdTableSize, m_pMvdCostTable, &sMe);
SMVUnitXY sTargetMv;
WelsInitSampleSadFunc( &sFuncList, 0 );//test c functions
WelsInitMeFunc(&sFuncList, WELS_CPU_SSE41, 1);
uint8_t *pRefPicCenter = m_pRefPic+(m_iHeight/2)*m_iWidth+(m_iWidth/2);
sMe.iCurMeBlockPixX = (m_iWidth/2);
sMe.iCurMeBlockPixY = (m_iHeight/2);
bool bDataGeneratorSucceed = false;
bool bFoundMatch = false;
int32_t iTryTimes=100;
sTargetMv.iMvX = WELS_MAX(INTPEL_NEEDED_MARGIN, rand()%m_iWidth-INTPEL_NEEDED_MARGIN);
sTargetMv.iMvY = 0;
bDataGeneratorSucceed = false;
bFoundMatch = false;
while (!bFoundMatch && (iTryTimes--)>0) {
if (!YUVPixelDataGenerator( m_pRefPic, m_iWidth, m_iHeight, m_iWidth ))
continue;
bDataGeneratorSucceed = true;
CopyTargetBlock( m_pSrcBlock, 16, sTargetMv, m_iWidth, pRefPicCenter);
//clean the sMe status
sMe.uiBlockSize = rand()%5;
sMe.pEncMb = m_pSrcBlock;
sMe.pRefMb = pRefPicCenter;
sMe.pColoRefMb = pRefPicCenter;
sMe.sMv.iMvX = sMe.sMv.iMvY = 0;
sMe.uiSadCost = sMe.uiSatdCost = kiMaxBlock16Sad;
const int32_t iCurMeBlockPixX = sMe.iCurMeBlockPixX;
const int32_t iCurMeBlockQpelPixX = ((iCurMeBlockPixX)<<2);
const int32_t iCurMeBlockPixY = sMe.iCurMeBlockPixY;
const int32_t iCurMeBlockQpelPixY = ((iCurMeBlockPixY)<<2);
uint16_t* pMvdCostX = sMe.pMvdCost - iCurMeBlockQpelPixX - sMe.sMvp.iMvX; //do the offset here
uint16_t* pMvdCostY = sMe.pMvdCost - iCurMeBlockQpelPixY - sMe.sMvp.iMvY;
HorizontalFullSearchUsingSSE41 ( &sFuncList, &sMe,
pMvdCostX, pMvdCostY[ iCurMeBlockQpelPixY ],
m_iMaxSearchBlock, m_iWidth,
INTPEL_NEEDED_MARGIN,
m_iWidth-INTPEL_NEEDED_MARGIN, false );
//the final selection may be affected by the MVD cost, i.e. a smaller MvX may win
bFoundMatch = (sMe.sMv.iMvY==0
&&(sMe.sMv.iMvX==sTargetMv.iMvX||abs(sMe.sMv.iMvX)<abs(sTargetMv.iMvX)));
//printf("TestHorizontalSearch Target: %d,%d\n", sTargetMv.iMvX, sTargetMv.iMvY);
}
if (bDataGeneratorSucceed) {
//if the data generator never succeeds, there is no point in checking iTryTimes
ASSERT_TRUE(iTryTimes > 0);
//it is possible that the ref at a different position is identical, but that should have a low probability
}
}
#endif
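Note: to exercise just the new cases, the gtest suite can be filtered by name, e.g. ./codec_unittest --gtest_filter='MotionEstimateTest.*SSE41' (the binary name here is an assumption; substitute the project's actual unit-test executable).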