diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm
index 8bc6d7735..03b5bccd7 100644
--- a/vp8/common/arm/armv6/filter_v6.asm
+++ b/vp8/common/arm/armv6/filter_v6.asm
@@ -11,6 +11,7 @@
 
     EXPORT |vp8_filter_block2d_first_pass_armv6|
     EXPORT |vp8_filter_block2d_second_pass_armv6|
+    EXPORT |vp8_filter4_block2d_second_pass_armv6|
     EXPORT |vp8_filter_block2d_first_pass_only_armv6|
     EXPORT |vp8_filter_block2d_second_pass_only_armv6|
 
@@ -192,6 +193,64 @@
 
     ENDP
 
+;---------------------------------
+; r0    short         *src_ptr,       source, read as 32-bit words holding two 16-bit samples each
+; r1    unsigned char *output_ptr,    destination, written one byte at a time
+; r2    unsigned int  output_pitch,   added to the destination pointer after every byte store
+; r3    unsigned int  cnt,            seeds both counters: outer loop = cnt, inner loop = cnt / 2
+; stack const short   *vp8_filter     coefficient table, read as three 32-bit words
+;---------------------------------
+|vp8_filter4_block2d_second_pass_armv6| PROC    ; four products per output; each inner iteration stores two output bytes
+    stmdb   sp!, {r4 - r11, lr}             ; 9 registers pushed, so the stack argument is at [sp, #36]
+
+    ldr     r11, [sp, #36]                  ; vp8_filter address
+    mov     r7, r3, lsl #16                 ; height is top part of counter
+
+    ldr     r4, [r11]                       ; load up packed filter coefficients
+    add     lr, r1, r3                      ; save final destination pointer (output_ptr + cnt)
+    ldr     r5, [r11, #4]
+    ldr     r6, [r11, #8]
+
+    pkhbt   r12, r5, r4                     ; pack the filter differently: r12 = r4 top half : r5 bottom half
+    pkhbt   r11, r6, r5                     ; r11 = r5 top half : r6 bottom half (r0-r3 of the table pointer no longer needed)
+    mov     r4, #0x40                       ; rounding factor (for smlad{x})
+
+|height_loop_2nd_4|
+    ldrd    r8, [r0, #-4]                   ; load the data: r8 = word at src - 4, r9 = word at src
+    orr     r7, r7, r3, lsr #1              ; loop counter: low bits = cnt / 2 inner iterations
+
+|width_loop_2nd_4|
+    ldr     r10, [r0, #4]!                  ; next source word; r0 is advanced by 4 with writeback
+    smladx  r6, r9, r12, r4                 ; apply filter: r6 = rounding + r9 x r12 (halves crossed)
+    pkhbt   r8, r9, r8                      ; r8 = old r8 top half : r9 bottom half
+    smlad   r5, r8, r12, r4                 ; r5 = rounding + r8 x r12 (halves straight)
+    pkhbt   r8, r10, r9                     ; r8 = r9 top half : r10 bottom half
+    smladx  r6, r10, r11, r6                ; r6 += r10 x r11 (halves crossed)
+    sub     r7, r7, #1                      ; one inner iteration consumed
+    smlad   r5, r8, r11, r5                 ; r5 += r8 x r11 (halves straight)
+
+    mov     r8, r9                          ; shift the data for the next loop
+    mov     r9, r10
+
+    usat    r6, #8, r6, asr #7              ; shift and clamp to 0..255
+    usat    r5, #8, r5, asr #7
+
+    strb    r5, [r1], r2                    ; the result is transposed back and stored
+    tst     r7, #0xff                       ; inner count lives in the low byte; zero means this pass is done
+    strb    r6, [r1], r2                    ; store does not alter the flags set by tst
+
+    bne     width_loop_2nd_4
+
+    subs    r7, r7, #0x10000                ; one outer iteration consumed; Z is set once r7 reaches zero
+    add     r0, r0, #16                     ; update src for next loop
+    sub     r1, lr, r7, lsr #16             ; update dst for next loop: final pointer - outer iterations left
+
+    bne     height_loop_2nd_4               ; still testing the flags from subs (add/sub above do not set them)
+
+    ldmia   sp!, {r4 - r11, pc}             ; restore saved registers and return by loading pc
+
+    ENDP
+
 ;------------------------------------
 ; r0    unsigned char *src_ptr
 ; r1    unsigned char *output_ptr,
diff --git a/vp8/common/arm/filter_arm.c b/vp8/common/arm/filter_arm.c
index 5ed4f8094..4bfa3ab34 100644
--- a/vp8/common/arm/filter_arm.c
+++ b/vp8/common/arm/filter_arm.c
@@ -50,6 +50,15 @@ extern void vp8_filter_block2d_second_pass_armv6
     const short *vp8_filter
 );
 
+extern void vp8_filter4_block2d_second_pass_armv6
+(
+    short *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int output_pitch,
+    unsigned int cnt,
+    const short *vp8_filter
+);
+
 extern void vp8_filter_block2d_first_pass_only_armv6
 (
     unsigned char *src_ptr,
@@ -107,12 +116,16 @@ void vp8_sixtap_predict_armv6
     {
         // Vfilter is a 4 tap filter
         if (yoffset & 0x1)
+        {
             vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 4, 7, HFilter);
+            vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+        }
         // Vfilter is 6 tap filter
         else
+        {
             vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 4, 9, HFilter);
-
-        vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+            vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, VFilter);
+        }
     }
 }
 
@@ -186,11 +199,15 @@ void vp8_sixtap_predict8x8_armv6
     else
     {
         if (yoffset & 0x1)
+        {
             vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 8, 11, HFilter);
+            vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+        }
         else
+        {
             vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, 13, HFilter);
-
-        vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+            vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, VFilter);
+        }
     }
 }
 
@@ -224,11 +241,15 @@ void vp8_sixtap_predict16x16_armv6
     else
     {
         if (yoffset & 0x1)
+        {
             vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, FData + 1, src_pixels_per_line, 16, 19, HFilter);
+            vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+        }
         else
+        {
             vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, 21, HFilter);
-
-        vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+            vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, VFilter);
+        }
     }
 }
 