x86: hevc_mc: split calls differently
In some cases, 2 or 3 calls are made to functions for unusual widths. Instead, perform 2 calls for different widths to split the workload.

The 8+16 and 4+8 widths, for 8 bits and more than 8 bits respectively, can't be processed that way without modifications: some calls use unaligned buffers, and adding branches to handle this yielded no micro-benchmark benefit.

For block_w == 12 (around 1% of the pixels of the sequence):

Before:
12758 decicycles in epel_uni, 4093 runs, 3 skips
19389 decicycles in qpel_uni, 8187 runs, 5 skips
22699 decicycles in epel_bi, 32743 runs, 25 skips
34736 decicycles in qpel_bi, 32733 runs, 35 skips

After:
11929 decicycles in epel_uni, 4096 runs, 0 skips
18131 decicycles in qpel_uni, 8184 runs, 8 skips
20065 decicycles in epel_bi, 32750 runs, 18 skips
31458 decicycles in qpel_bi, 32753 runs, 15 skips

Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
parent
38e2aa3759
commit
3e892b2bcd
@ -123,6 +123,45 @@ void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dst
|
|||||||
mc_rep_uni_func(name, bitd, step, W, opt); \
|
mc_rep_uni_func(name, bitd, step, W, opt); \
|
||||||
mc_rep_bi_func(name, bitd, step, W, opt)
|
mc_rep_bi_func(name, bitd, step, W, opt)
|
||||||
|
|
||||||
|
/*
 * mc_rep_func2(name, bitd, step1, step2, W, opt)
 *
 * Defines ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() as two consecutive calls:
 *   1. ff_hevc_put_hevc_<name><step1>_<bitd>_<opt>() on the original dst/src;
 *   2. ff_hevc_put_hevc_<name><step2>_<bitd>_<opt>() with both pointers
 *      advanced past the first step1 samples.
 *
 * dst is int16_t *, so it advances by step1 elements. src is uint8_t *, so it
 * advances by step1 * ((bitd + 7) / 8) bytes; (bitd + 7) / 8 rounds the bit
 * depth up to whole bytes (1 for bitd = 8, 2 for bitd = 10).
 *
 * _srcstride, height, mx, my and width are forwarded unchanged to both calls.
 * NOTE(review): width is not reduced to step1/step2; presumably the
 * fixed-width callees do not depend on it -- confirm against their definitions.
 *
 * The two callees are not declared in the lines shown here; they must be
 * visible wherever the macro is expanded.
 */
#define mc_rep_func2(name, bitd, step1, step2, W, opt) \
|
||||||
|
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst, \
|
||||||
|
uint8_t *src, ptrdiff_t _srcstride, int height, \
|
||||||
|
intptr_t mx, intptr_t my, int width) \
|
||||||
|
{ \
|
||||||
|
ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
|
||||||
|
ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)), \
|
||||||
|
_srcstride, height, mx, my, width); \
|
||||||
|
}
|
||||||
|
/*
 * mc_rep_uni_func2(name, bitd, step1, step2, W, opt)
 *
 * Defines ff_hevc_put_hevc_uni_<name><W>_<bitd>_<opt>() as two consecutive
 * calls:
 *   1. ff_hevc_put_hevc_uni_<name><step1>_<bitd>_<opt>() on the original
 *      dst/src;
 *   2. ff_hevc_put_hevc_uni_<name><step2>_<bitd>_<opt>() with both pointers
 *      advanced past the first step1 samples.
 *
 * dst and src are both uint8_t *, so each advances by
 * step1 * ((bitd + 7) / 8) bytes; (bitd + 7) / 8 rounds the bit depth up to
 * whole bytes (1 for bitd = 8, 2 for bitd = 10).
 *
 * dststride, _srcstride, height, mx, my and width are forwarded unchanged to
 * both calls.
 * NOTE(review): width is not reduced to step1/step2; presumably the
 * fixed-width callees do not depend on it -- confirm against their definitions.
 */
#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
|
||||||
|
void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \
|
||||||
|
uint8_t *src, ptrdiff_t _srcstride, int height, \
|
||||||
|
intptr_t mx, intptr_t my, int width) \
|
||||||
|
{ \
|
||||||
|
ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width);\
|
||||||
|
ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
|
||||||
|
src + (step1 * ((bitd + 7) / 8)), _srcstride, \
|
||||||
|
height, mx, my, width); \
|
||||||
|
}
|
||||||
|
/*
 * mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
 *
 * Defines ff_hevc_put_hevc_bi_<name><W>_<bitd>_<opt>() as two consecutive
 * calls:
 *   1. ff_hevc_put_hevc_bi_<name><step1>_<bitd>_<opt>() on the original
 *      dst/src/src2;
 *   2. ff_hevc_put_hevc_bi_<name><step2>_<bitd>_<opt>() with all three
 *      pointers advanced past the first step1 samples.
 *
 * dst and src are uint8_t *, so each advances by step1 * ((bitd + 7) / 8)
 * bytes; (bitd + 7) / 8 rounds the bit depth up to whole bytes (1 for
 * bitd = 8, 2 for bitd = 10). src2 is int16_t *, so it advances by step1
 * elements.
 *
 * dststride, _srcstride, height, mx, my and width are forwarded unchanged to
 * both calls.
 * NOTE(review): width is not reduced to step1/step2; presumably the
 * fixed-width callees do not depend on it -- confirm against their definitions.
 */
#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
|
||||||
|
void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
|
||||||
|
ptrdiff_t _srcstride, int16_t* src2, \
|
||||||
|
int height, intptr_t mx, intptr_t my, int width) \
|
||||||
|
{ \
|
||||||
|
ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
|
||||||
|
ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
|
||||||
|
src + (step1 * ((bitd + 7) / 8)), _srcstride, \
|
||||||
|
src2 + step1, height, mx, my, width); \
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * mc_rep_funcs(name, bitd, step, W, opt)
 *
 * Convenience wrapper: expands, with identical arguments, to
 * mc_rep_func(), mc_rep_uni_func() and mc_rep_bi_func() in that order.
 * Those three macros are defined outside the lines shown here.
 *
 * The last expansion carries no trailing semicolon; the invocation site
 * supplies it, e.g. "mc_rep_funcs(epel_hv, 8, 8, 32, sse4);".
 */
#define mc_rep_funcs(name, bitd, step, W, opt) \
|
||||||
|
mc_rep_func(name, bitd, step, W, opt); \
|
||||||
|
mc_rep_uni_func(name, bitd, step, W, opt); \
|
||||||
|
mc_rep_bi_func(name, bitd, step, W, opt)
|
||||||
|
|
||||||
|
/*
 * mc_rep_funcs2(name, bitd, step1, step2, W, opt)
 *
 * Convenience wrapper: expands, with identical arguments, to
 * mc_rep_func2(), mc_rep_uni_func2() and mc_rep_bi_func2() in that order,
 * i.e. the two-call variants of the plain, uni and bi functions.
 *
 * The last expansion carries no trailing semicolon; the invocation site
 * supplies it, e.g. "mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4);", which
 * builds the width-12 functions from the width-8 and width-4 ones.
 */
#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
|
||||||
|
mc_rep_func2(name, bitd, step1, step2, W, opt); \
|
||||||
|
mc_rep_uni_func2(name, bitd, step1, step2, W, opt); \
|
||||||
|
mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
|
||||||
|
|
||||||
#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
|
#if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
|
||||||
|
|
||||||
@ -180,7 +219,7 @@ mc_rep_funcs(epel_hv, 8, 8, 48, sse4);
|
|||||||
mc_rep_funcs(epel_hv, 8, 8, 32, sse4);
|
mc_rep_funcs(epel_hv, 8, 8, 32, sse4);
|
||||||
mc_rep_funcs(epel_hv, 8, 8, 24, sse4);
|
mc_rep_funcs(epel_hv, 8, 8, 24, sse4);
|
||||||
mc_rep_funcs(epel_hv, 8, 8, 16, sse4);
|
mc_rep_funcs(epel_hv, 8, 8, 16, sse4);
|
||||||
mc_rep_funcs(epel_hv, 8, 4, 12, sse4);
|
mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4);
|
||||||
mc_rep_funcs(epel_hv,10, 8, 64, sse4);
|
mc_rep_funcs(epel_hv,10, 8, 64, sse4);
|
||||||
mc_rep_funcs(epel_hv,10, 8, 48, sse4);
|
mc_rep_funcs(epel_hv,10, 8, 48, sse4);
|
||||||
mc_rep_funcs(epel_hv,10, 8, 32, sse4);
|
mc_rep_funcs(epel_hv,10, 8, 32, sse4);
|
||||||
@ -231,7 +270,7 @@ mc_rep_funcs(qpel_hv, 8, 8, 48, sse4);
|
|||||||
mc_rep_funcs(qpel_hv, 8, 8, 32, sse4);
|
mc_rep_funcs(qpel_hv, 8, 8, 32, sse4);
|
||||||
mc_rep_funcs(qpel_hv, 8, 8, 24, sse4);
|
mc_rep_funcs(qpel_hv, 8, 8, 24, sse4);
|
||||||
mc_rep_funcs(qpel_hv, 8, 8, 16, sse4);
|
mc_rep_funcs(qpel_hv, 8, 8, 16, sse4);
|
||||||
mc_rep_funcs(qpel_hv, 8, 4, 12, sse4);
|
mc_rep_funcs2(qpel_hv,8, 8, 4, 12, sse4);
|
||||||
mc_rep_funcs(qpel_hv,10, 8, 64, sse4);
|
mc_rep_funcs(qpel_hv,10, 8, 64, sse4);
|
||||||
mc_rep_funcs(qpel_hv,10, 8, 48, sse4);
|
mc_rep_funcs(qpel_hv,10, 8, 48, sse4);
|
||||||
mc_rep_funcs(qpel_hv,10, 8, 32, sse4);
|
mc_rep_funcs(qpel_hv,10, 8, 32, sse4);
|
||||||
|
Loading…
Reference in New Issue
Block a user