h264dsp: Factorize code into a new function, h264_find_start_code_candidate
This performs the start code search which was previously part of
h264_find_frame_end() - the most CPU intensive part of the function.

By itself, this results in a performance regression:

                  Before            After
                  Mean     StdDev   Mean     StdDev   Change
Overall time      2925.6   26.2     3068.5   31.7     -4.7%

but this can more than be made up for by platform-optimised
implementations of the function.

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
parent
7a82022ee2
commit
218d6844b3
@ -47,30 +47,9 @@ static int h264_find_frame_end(H264Context *h, const uint8_t *buf,
|
||||
|
||||
for (i = 0; i < buf_size; i++) {
|
||||
if (state == 7) {
|
||||
#if HAVE_FAST_UNALIGNED
|
||||
/* we check i < buf_size instead of i + 3 / 7 because it is
|
||||
* simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE
|
||||
* bytes at the end.
|
||||
*/
|
||||
#if HAVE_FAST_64BIT
|
||||
while (i < buf_size &&
|
||||
!((~*(const uint64_t *)(buf + i) &
|
||||
(*(const uint64_t *)(buf + i) - 0x0101010101010101ULL)) &
|
||||
0x8080808080808080ULL))
|
||||
i += 8;
|
||||
#else
|
||||
while (i < buf_size &&
|
||||
!((~*(const uint32_t *)(buf + i) &
|
||||
(*(const uint32_t *)(buf + i) - 0x01010101U)) &
|
||||
0x80808080U))
|
||||
i += 4;
|
||||
#endif
|
||||
#endif
|
||||
for (; i < buf_size; i++)
|
||||
if (!buf[i]) {
|
||||
state = 2;
|
||||
break;
|
||||
}
|
||||
i += h->h264dsp.h264_find_start_code_candidate(buf + i, buf_size - i);
|
||||
if (i < buf_size)
|
||||
state = 2;
|
||||
} else if (state <= 2) {
|
||||
if (buf[i] == 1)
|
||||
state ^= 5; // 2->7, 1->4, 0->5
|
||||
|
@ -53,6 +53,34 @@
|
||||
#include "h264addpx_template.c"
|
||||
#undef BIT_DEPTH
|
||||
|
||||
/**
 * Portable C search for the first zero byte in a buffer.
 *
 * Scans buf from index 0 and returns the index of the first zero byte
 * found. If no zero byte is found within size bytes, the return value
 * is >= size (it may exceed size when the word-at-a-time path is
 * compiled in, because that path advances in whole-word steps).
 *
 * @param buf  bytes to scan
 * @param size number of bytes to examine
 * @return index of a zero byte, or a value >= size if none was found
 */
static int h264_find_start_code_candidate_c(const uint8_t *buf, int size)
{
    int pos = 0;

#if HAVE_FAST_UNALIGNED
    /* Only pos < size is checked, rather than pos + 3 / pos + 7, because
     * it is simpler and there must be FF_INPUT_BUFFER_PADDING_SIZE bytes
     * at the end of the buffer, so the final word read may run past size.
     */
#if HAVE_FAST_64BIT
    while (pos < size) {
        const uint64_t word = *(const uint64_t *)(buf + pos);

        /* Non-zero exactly when at least one byte of word is zero. */
        if ((~word & (word - 0x0101010101010101ULL)) & 0x8080808080808080ULL)
            break;
        pos += 8;
    }
#else
    while (pos < size) {
        const uint32_t word = *(const uint32_t *)(buf + pos);

        /* Non-zero exactly when at least one byte of word is zero. */
        if ((~word & (word - 0x01010101U)) & 0x80808080U)
            break;
        pos += 4;
    }
#endif
#endif

    /* Byte-wise scan: pins down the exact zero byte inside the word
     * flagged above, or does the whole search when the fast path is
     * not compiled in.
     */
    while (pos < size && buf[pos])
        pos++;

    return pos;
}
|
||||
|
||||
av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
|
||||
const int chroma_format_idc)
|
||||
{
|
||||
@ -133,6 +161,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
|
||||
H264_DSP(8);
|
||||
break;
|
||||
}
|
||||
c->h264_find_start_code_candidate = h264_find_start_code_candidate_c;
|
||||
|
||||
if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
|
||||
if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc);
|
||||
|
@ -105,6 +105,15 @@ typedef struct H264DSPContext {
|
||||
/* bypass-transform */
|
||||
void (*h264_add_pixels8_clear)(uint8_t *dst, int16_t *block, int stride);
|
||||
void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t *block, int stride);
|
||||
|
||||
/**
|
||||
* Search buf from the start for up to size bytes. Return the index
|
||||
* of a zero byte, or >= size if not found. Ideally, use lookahead
|
||||
* to filter out any zero bytes that are known to not be followed by
|
||||
* one or more further zero bytes and a one byte. Better still, filter
|
||||
* out any bytes that form the trailing_zero_8bits syntax element too.
|
||||
*/
|
||||
int (*h264_find_start_code_candidate)(const uint8_t *buf, int size);
|
||||
} H264DSPContext;
|
||||
|
||||
void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
|
||||
|
Loading…
x
Reference in New Issue
Block a user