Add 4-tap version of 2nd-pass ARMv6 MC filter.

The existing code applied a 6-tap filter with 0's on either end.
We're already paying the branch penalty to avoid computing the two
 extra columns needed as input to this filter.
We might as well save time computing the filter as well.
This reduces the inner loop from 21 instructions to 16, the number
 of loads per iteration from 4 to 1, and the number of multiplies
 from 7 to 4.
The gain in overall decoding performance, however, is small (less
 than 1%).

This change also means we are now valgrind-clean on ARMv6, which is
 its real purpose.
The errors reported here were valgrind's fault (it does not detect
 that 0 times an uninitialized value is initialized), but Julian
 Seward says it would slow down valgrind considerably to make such
 checks.
Speeding up libvpx instead, even by a small amount, seems a much
 better idea if only to enable proper valgrind checking of the
 rest of the codec.

Change-Id: Ifb376ea195e086b60f61daf1097d8910c4d8ff16
This commit is contained in:
Timothy B. Terriberry 2010-09-27 17:18:18 -07:00
parent 305be4e417
commit 18dc92fd66
2 changed files with 86 additions and 6 deletions

View File

@ -11,6 +11,7 @@
EXPORT |vp8_filter_block2d_first_pass_armv6|
EXPORT |vp8_filter_block2d_second_pass_armv6|
EXPORT |vp8_filter4_block2d_second_pass_armv6|
EXPORT |vp8_filter_block2d_first_pass_only_armv6|
EXPORT |vp8_filter_block2d_second_pass_only_armv6|
@ -192,6 +193,64 @@
ENDP
;---------------------------------
; r0 short *src_ptr,
; r1 unsigned char *output_ptr,
; r2 unsigned int output_pitch,
; r3 unsigned int cnt,
; stack const short *vp8_filter
;---------------------------------
|vp8_filter4_block2d_second_pass_armv6| PROC
stmdb sp!, {r4 - r11, lr}
ldr r11, [sp, #36] ; vp8_filter address
mov r7, r3, lsl #16 ; height is top part of counter
ldr r4, [r11] ; load up packed filter coefficients
add lr, r1, r3 ; save final destination pointer
ldr r5, [r11, #4]
ldr r6, [r11, #8]
pkhbt r12, r5, r4 ; pack the filter differently
pkhbt r11, r6, r5
mov r4, #0x40 ; rounding factor (for smlad{x})
|height_loop_2nd_4|
ldrd r8, [r0, #-4] ; load the data
orr r7, r7, r3, lsr #1 ; loop counter
|width_loop_2nd_4|
ldr r10, [r0, #4]!
smladx r6, r9, r12, r4 ; apply filter
pkhbt r8, r9, r8
smlad r5, r8, r12, r4
pkhbt r8, r10, r9
smladx r6, r10, r11, r6
sub r7, r7, #1
smlad r5, r8, r11, r5
mov r8, r9 ; shift the data for the next loop
mov r9, r10
usat r6, #8, r6, asr #7 ; shift and clamp
usat r5, #8, r5, asr #7
strb r5, [r1], r2 ; the result is transposed back and stored
tst r7, #0xff
strb r6, [r1], r2
bne width_loop_2nd_4
subs r7, r7, #0x10000
add r0, r0, #16 ; update src for next loop
sub r1, lr, r7, lsr #16 ; update dst for next loop
bne height_loop_2nd_4
ldmia sp!, {r4 - r11, pc}
ENDP
;------------------------------------
; r0 unsigned char *src_ptr
; r1 unsigned char *output_ptr,

View File

@ -50,6 +50,15 @@ extern void vp8_filter_block2d_second_pass_armv6
const short *vp8_filter
);
extern void vp8_filter4_block2d_second_pass_armv6
(
short *src_ptr,
unsigned char *output_ptr,
unsigned int output_pitch,
unsigned int cnt,
const short *vp8_filter
);
extern void vp8_filter_block2d_first_pass_only_armv6
(
unsigned char *src_ptr,
@ -107,12 +116,16 @@ void vp8_sixtap_predict_armv6
{
// Vfilter is a 4 tap filter
if (yoffset & 0x1)
{
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
}
// Vfilter is 6 tap filter
else
{
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
}
}
}
@ -186,11 +199,15 @@ void vp8_sixtap_predict8x8_armv6
else
{
if (yoffset & 0x1)
{
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
}
else
{
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
}
}
}
@ -224,11 +241,15 @@ void vp8_sixtap_predict16x16_armv6
else
{
if (yoffset & 0x1)
{
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
}
else
{
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
}
}
}