[Decoder/x86] IDCT one block at a time with SSE2
At lower bitrates, it is overall faster to conditionally IDCT one block at a time with SSE2 on Haswell, and likely on other common architectures as well. At higher bitrates, it is faster to use the wider routine that IDCTs four blocks at a time. To avoid potential performance regressions compared to MMX, stick with single-block IDCTs for SSE2. There is still a performance advantage over MMX, because the single-block SSE2 routine is faster than the corresponding MMX routine. For AVX2, keep processing four blocks at a time, since that approach appears to be consistently faster on Haswell.
This commit is contained in:
parent
98042f1600
commit
b6c4a5447c
@ -509,21 +509,12 @@ WELS_EXTERN WelsDctFourT4_sse2
|
|||||||
LOAD_5_PARA_POP
|
LOAD_5_PARA_POP
|
||||||
ret
|
ret
|
||||||
|
|
||||||
;***********************************************************************
|
|
||||||
; void IdctFourResAddPred_sse2(uint8_t* pPred, int32_t iStride, const int16_t* pDct, const int8_t* pNzc);
|
|
||||||
;***********************************************************************
|
|
||||||
WELS_EXTERN IdctFourResAddPred_sse2
|
|
||||||
%assign push_num 0
|
|
||||||
LOAD_3_PARA_TO_5_PARA_IDCT
|
|
||||||
jmp prefixed(WelsIDctFourT4Rec_sse2.begin)
|
|
||||||
|
|
||||||
;***********************************************************************
|
;***********************************************************************
|
||||||
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
|
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
|
||||||
;***********************************************************************
|
;***********************************************************************
|
||||||
WELS_EXTERN WelsIDctFourT4Rec_sse2
|
WELS_EXTERN WelsIDctFourT4Rec_sse2
|
||||||
%assign push_num 0
|
%assign push_num 0
|
||||||
LOAD_5_PARA
|
LOAD_5_PARA
|
||||||
.begin:
|
|
||||||
PUSH_XMM 8
|
PUSH_XMM 8
|
||||||
SIGN_EXTENSION r1, r1d
|
SIGN_EXTENSION r1, r1d
|
||||||
SIGN_EXTENSION r3, r3d
|
SIGN_EXTENSION r3, r3d
|
||||||
|
@ -49,7 +49,6 @@ extern "C" {
|
|||||||
void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
|
void IdctResAddPred_mmx (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
|
||||||
void IdctResAddPred_sse2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
|
void IdctResAddPred_sse2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
|
||||||
void IdctResAddPred_avx2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
|
void IdctResAddPred_avx2 (uint8_t* pPred, const int32_t kiStride, int16_t* pRs);
|
||||||
void IdctFourResAddPred_sse2 (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc);
|
|
||||||
void IdctFourResAddPred_avx2 (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc);
|
void IdctFourResAddPred_avx2 (uint8_t* pPred, int32_t iStride, int16_t* pRs, const int8_t* pNzc);
|
||||||
#endif//X86_ASM
|
#endif//X86_ASM
|
||||||
|
|
||||||
|
@ -992,7 +992,7 @@ void InitPredFunc (PWelsDecoderContext pCtx, uint32_t uiCpuFlag) {
|
|||||||
}
|
}
|
||||||
if (uiCpuFlag & WELS_CPU_SSE2) {
|
if (uiCpuFlag & WELS_CPU_SSE2) {
|
||||||
pCtx->pIdctResAddPredFunc = IdctResAddPred_sse2;
|
pCtx->pIdctResAddPredFunc = IdctResAddPred_sse2;
|
||||||
pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_sse2;
|
pCtx->pIdctFourResAddPredFunc = IdctFourResAddPred_<IdctResAddPred_sse2>;
|
||||||
|
|
||||||
pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_sse2;
|
pCtx->pGetI16x16LumaPredFunc[I16_PRED_DC] = WelsDecoderI16x16LumaPredDc_sse2;
|
||||||
pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_sse2;
|
pCtx->pGetI16x16LumaPredFunc[I16_PRED_P] = WelsDecoderI16x16LumaPredPlane_sse2;
|
||||||
|
@ -139,7 +139,6 @@ GENERATE_IDCTRESADDPRED (IdctResAddPred_c, 0)
|
|||||||
GENERATE_IDCTRESADDPRED (IdctResAddPred_mmx, WELS_CPU_MMXEXT)
|
GENERATE_IDCTRESADDPRED (IdctResAddPred_mmx, WELS_CPU_MMXEXT)
|
||||||
GENERATE_IDCTRESADDPRED (IdctResAddPred_sse2, WELS_CPU_SSE2)
|
GENERATE_IDCTRESADDPRED (IdctResAddPred_sse2, WELS_CPU_SSE2)
|
||||||
GENERATE_IDCTRESADDPRED (IdctResAddPred_avx2, WELS_CPU_AVX2)
|
GENERATE_IDCTRESADDPRED (IdctResAddPred_avx2, WELS_CPU_AVX2)
|
||||||
GENERATE_IDCTFOURRESADDPRED (IdctFourResAddPred_sse2, WELS_CPU_SSE2)
|
|
||||||
GENERATE_IDCTFOURRESADDPRED (IdctFourResAddPred_avx2, WELS_CPU_AVX2)
|
GENERATE_IDCTFOURRESADDPRED (IdctFourResAddPred_avx2, WELS_CPU_AVX2)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user