Merge remote branch 'internal/upstream' into HEAD

This commit is contained in:
John Koleszar 2011-07-14 00:05:04 -04:00
commit 6901105e99
28 changed files with 1404 additions and 1785 deletions

View File

@ -54,9 +54,11 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_armv6;
rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_armv6;
rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_armv6;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_armv6;
rtcd->loopfilter.simple_mb_v =
vp8_loop_filter_simple_vertical_edge_armv6;
rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_armv6;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_armv6;
rtcd->loopfilter.simple_mb_h =
vp8_loop_filter_simple_horizontal_edge_armv6;
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_armv6;
rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;

View File

@ -53,14 +53,11 @@ count RN r5
;r0 unsigned char *src_ptr,
;r1 int src_pixel_step,
;r2 const char *flimit,
;r2 const char *blimit,
;r3 const char *limit,
;stack const char *thresh,
;stack int count
;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
@ -72,14 +69,18 @@ count RN r5
sub sp, sp, #16 ; create temp buffer
ldr r9, [src], pstep ; p3
ldr r4, [r2], #4 ; flimit
ldrb r4, [r2] ; blimit
ldr r10, [src], pstep ; p2
ldr r2, [r3], #4 ; limit
ldrb r2, [r3] ; limit
ldr r11, [src], pstep ; p1
uadd8 r4, r4, r4 ; flimit * 2
ldr r3, [r6], #4 ; thresh
orr r4, r4, r4, lsl #8
ldrb r3, [r6] ; thresh
orr r2, r2, r2, lsl #8
mov count, count, lsl #1 ; 4-in-parallel
uadd8 r4, r4, r2 ; flimit * 2 + limit
orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
|Hnext8|
; vp8_filter_mask() function
@ -275,14 +276,18 @@ count RN r5
sub sp, sp, #16 ; create temp buffer
ldr r9, [src], pstep ; p3
ldr r4, [r2], #4 ; flimit
ldrb r4, [r2] ; blimit
ldr r10, [src], pstep ; p2
ldr r2, [r3], #4 ; limit
ldrb r2, [r3] ; limit
ldr r11, [src], pstep ; p1
uadd8 r4, r4, r4 ; flimit * 2
ldr r3, [r6], #4 ; thresh
orr r4, r4, r4, lsl #8
ldrb r3, [r6] ; thresh
orr r2, r2, r2, lsl #8
mov count, count, lsl #1 ; 4-in-parallel
uadd8 r4, r4, r2 ; flimit * 2 + limit
orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
|MBHnext8|
@ -584,15 +589,19 @@ count RN r5
sub sp, sp, #16 ; create temp buffer
ldr r6, [src], pstep ; load source data
ldr r4, [r2], #4 ; flimit
ldrb r4, [r2] ; blimit
ldr r7, [src], pstep
ldr r2, [r3], #4 ; limit
ldrb r2, [r3] ; limit
ldr r8, [src], pstep
uadd8 r4, r4, r4 ; flimit * 2
ldr r3, [r12], #4 ; thresh
orr r4, r4, r4, lsl #8
ldrb r3, [r12] ; thresh
orr r2, r2, r2, lsl #8
ldr lr, [src], pstep
mov count, count, lsl #1 ; 4-in-parallel
uadd8 r4, r4, r2 ; flimit * 2 + limit
orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
|Vnext8|
@ -855,18 +864,22 @@ count RN r5
sub sp, sp, #16 ; create temp buffer
ldr r6, [src], pstep ; load source data
ldr r4, [r2], #4 ; flimit
ldrb r4, [r2] ; blimit
pld [src, #23]
ldr r7, [src], pstep
ldr r2, [r3], #4 ; limit
ldrb r2, [r3] ; limit
pld [src, #23]
ldr r8, [src], pstep
uadd8 r4, r4, r4 ; flimit * 2
ldr r3, [r12], #4 ; thresh
orr r4, r4, r4, lsl #8
ldrb r3, [r12] ; thresh
orr r2, r2, r2, lsl #8
pld [src, #23]
ldr lr, [src], pstep
mov count, count, lsl #1 ; 4-in-parallel
uadd8 r4, r4, r2 ; flimit * 2 + limit
orr r4, r4, r4, lsl #16
orr r3, r3, r3, lsl #8
orr r2, r2, r2, lsl #16
orr r3, r3, r3, lsl #16
|MBVnext8|
; vp8_filter_mask() function
@ -906,6 +919,7 @@ count RN r5
str lr, [sp, #8]
ldr lr, [src], pstep
TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12
ldr lr, [sp, #8] ; load back (f)limit accumulator
@ -954,6 +968,7 @@ count RN r5
beq mbvskip_filter ; skip filtering
;vp8_hevmask() function
;calculate high edge variance
@ -1121,6 +1136,7 @@ count RN r5
smlabb r8, r6, lr, r7
smlatb r6, r6, lr, r7
smlabb r9, r10, lr, r7
smlatb r10, r10, lr, r7
ssat r8, #8, r8, asr #7
ssat r6, #8, r6, asr #7

View File

@ -45,35 +45,28 @@
MEND
src RN r0
pstep RN r1
;r0 unsigned char *src_ptr,
;r1 int src_pixel_step,
;r2 const char *flimit,
;r3 const char *limit,
;stack const char *thresh,
;stack int count
; All 16 elements in flimit are equal. So, in the code, only one load is needed
; for flimit. Same applies to limit. thresh is not used in simple looopfilter
;r2 const char *blimit
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
ldr r12, [r3] ; limit
ldrb r12, [r2] ; blimit
ldr r3, [src, -pstep, lsl #1] ; p1
ldr r4, [src, -pstep] ; p0
ldr r5, [src] ; q0
ldr r6, [src, pstep] ; q1
ldr r7, [r2] ; flimit
orr r12, r12, r12, lsl #8 ; blimit
ldr r2, c0x80808080
ldr r9, [sp, #40] ; count for 8-in-parallel
uadd8 r7, r7, r7 ; flimit * 2
mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time
uadd8 r12, r7, r12 ; flimit * 2 + limit
orr r12, r12, r12, lsl #16 ; blimit
mov r9, #4 ; double the count. we're doing 4 at a time
mov lr, #0 ; need 0 in a couple places
|simple_hnext8|
@ -148,34 +141,32 @@ pstep RN r1
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
ldr r12, [r2] ; r12: flimit
ldrb r12, [r2] ; r12: blimit
ldr r2, c0x80808080
ldr r7, [r3] ; limit
orr r12, r12, r12, lsl #8
; load soure data to r7, r8, r9, r10
ldrh r3, [src, #-2]
pld [src, #23] ; preload for next block
ldrh r4, [src], pstep
uadd8 r12, r12, r12 ; flimit * 2
orr r12, r12, r12, lsl #16
ldrh r5, [src, #-2]
pld [src, #23]
ldrh r6, [src], pstep
uadd8 r12, r12, r7 ; flimit * 2 + limit
pkhbt r7, r3, r4, lsl #16
ldrh r3, [src, #-2]
pld [src, #23]
ldrh r4, [src], pstep
ldr r11, [sp, #40] ; count (r11) for 8-in-parallel
pkhbt r8, r5, r6, lsl #16
ldrh r5, [src, #-2]
pld [src, #23]
ldrh r6, [src], pstep
mov r11, r11, lsl #1 ; 4-in-parallel
mov r11, #4 ; double the count. we're doing 4 at a time
|simple_vnext8|
; vp8_simple_filter_mask() function

View File

@ -9,30 +9,34 @@
*/
#include "vpx_ports/config.h"
#include <math.h>
#include "vpx_config.h"
#include "vp8/common/loopfilter.h"
#include "vp8/common/onyxc_int.h"
#if HAVE_ARMV6
extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
#endif
extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_y_neon);
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_y_neon);
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_y_neon);
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_y_neon);
extern prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_neon);
extern prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_neon);
#if HAVE_ARMV7
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh);
typedef void loopfilter_uv_neon(unsigned char *u, int pitch,
unsigned char blimit, unsigned char limit, unsigned char thresh,
unsigned char *v);
extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_neon;
extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_neon;
extern loop_filter_uvfunction vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
#endif
#if HAVE_ARMV6
/*ARMV6 loopfilter functions*/
@ -40,96 +44,72 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon;
void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, blimit);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit);
}
#endif
@ -139,83 +119,58 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig
void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
unsigned char mblim = *lfi->mblim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
}
void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_horizontal_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
unsigned char mblim = *lfi->mblim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, mblim, lim, hev_thr);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
}
void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_vertical_edge_uv_neon(u_ptr, uv_stride, mblim, lim, hev_thr, v_ptr);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
unsigned char blim = *lfi->blim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, blim, lim, hev_thr);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, blim, lim, hev_thr);
vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, blim, lim, hev_thr);
if (u_ptr)
vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
}
void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_uv_neon(u_ptr + 4 * uv_stride, uv_stride, blim, lim, hev_thr, v_ptr + 4 * uv_stride);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
unsigned char blim = *lfi->blim;
unsigned char lim = *lfi->lim;
unsigned char hev_thr = *lfi->hev_thr;
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, blim, lim, hev_thr);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, blim, lim, hev_thr);
vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, blim, lim, hev_thr);
if (u_ptr)
vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
}
void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4);
}
#endif

View File

@ -12,15 +12,17 @@
#ifndef LOOPFILTER_ARM_H
#define LOOPFILTER_ARM_H
#include "vpx_config.h"
#if HAVE_ARMV6
extern prototype_loopfilter_block(vp8_loop_filter_mbv_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bv_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bh_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbvs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bvs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_mbhs_armv6);
extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
extern prototype_simple_loopfilter(vp8_loop_filter_bvs_armv6);
extern prototype_simple_loopfilter(vp8_loop_filter_bhs_armv6);
extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_armv6);
extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_armv6);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v
@ -36,28 +38,29 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_armv6);
#define vp8_lf_normal_b_h vp8_loop_filter_bh_armv6
#undef vp8_lf_simple_mb_v
#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_armv6
#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_armv6
#undef vp8_lf_simple_b_v
#define vp8_lf_simple_b_v vp8_loop_filter_bvs_armv6
#undef vp8_lf_simple_mb_h
#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_armv6
#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_armv6
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_armv6
#endif
#endif
#endif /* !CONFIG_RUNTIME_CPU_DETECT */
#endif /* HAVE_ARMV6 */
#if HAVE_ARMV7
extern prototype_loopfilter_block(vp8_loop_filter_mbv_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bv_neon);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bh_neon);
extern prototype_loopfilter_block(vp8_loop_filter_mbvs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bvs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_mbhs_neon);
extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
extern prototype_simple_loopfilter(vp8_loop_filter_mbvs_neon);
extern prototype_simple_loopfilter(vp8_loop_filter_bvs_neon);
extern prototype_simple_loopfilter(vp8_loop_filter_mbhs_neon);
extern prototype_simple_loopfilter(vp8_loop_filter_bhs_neon);
#if !CONFIG_RUNTIME_CPU_DETECT
#undef vp8_lf_normal_mb_v
@ -83,7 +86,8 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_neon);
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_neon
#endif
#endif
#endif /* !CONFIG_RUNTIME_CPU_DETECT */
#endif
#endif /* HAVE_ARMV7 */
#endif /* LOOPFILTER_ARM_H */

View File

@ -14,109 +14,97 @@
EXPORT |vp8_loop_filter_vertical_edge_y_neon|
EXPORT |vp8_loop_filter_vertical_edge_uv_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; flimit, limit, and thresh should be positive numbers.
; All 16 elements in these variables are equal.
; void vp8_loop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; int count)
; r0 unsigned char *src
; r1 int pitch
; r2 const signed char *flimit
; r3 const signed char *limit
; sp const signed char *thresh,
; sp+4 int count (unused)
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_loop_filter_horizontal_edge_y_neon| PROC
stmdb sp!, {lr}
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
ldr r12, [sp, #4] ; load thresh pointer
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines
ldr r3, [sp, #4] ; load thresh
add r12, r2, r1
add r1, r1, r1
vld1.u8 {q3}, [r2], r1 ; p3
vld1.u8 {q4}, [r2], r1 ; p2
vld1.u8 {q5}, [r2], r1 ; p1
vld1.u8 {q6}, [r2], r1 ; p0
vld1.u8 {q7}, [r2], r1 ; q0
vld1.u8 {q8}, [r2], r1 ; q1
vld1.u8 {q9}, [r2], r1 ; q2
vld1.u8 {q10}, [r2] ; q3
vld1.s8 {d4[], d5[]}, [r12] ; thresh
sub r0, r0, r1, lsl #1
vdup.u8 q2, r3 ; duplicate thresh
vld1.u8 {q3}, [r2@128], r1 ; p3
vld1.u8 {q4}, [r12@128], r1 ; p2
vld1.u8 {q5}, [r2@128], r1 ; p1
vld1.u8 {q6}, [r12@128], r1 ; p0
vld1.u8 {q7}, [r2@128], r1 ; q0
vld1.u8 {q8}, [r12@128], r1 ; q1
vld1.u8 {q9}, [r2@128] ; q2
vld1.u8 {q10}, [r12@128] ; q3
sub r2, r2, r1, lsl #1
sub r12, r12, r1, lsl #1
bl vp8_loop_filter_neon
vst1.u8 {q5}, [r0], r1 ; store op1
vst1.u8 {q6}, [r0], r1 ; store op0
vst1.u8 {q7}, [r0], r1 ; store oq0
vst1.u8 {q8}, [r0], r1 ; store oq1
vst1.u8 {q5}, [r2@128], r1 ; store op1
vst1.u8 {q6}, [r12@128], r1 ; store op0
vst1.u8 {q7}, [r2@128], r1 ; store oq0
vst1.u8 {q8}, [r12@128], r1 ; store oq1
ldmia sp!, {pc}
pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_y_neon|
; void vp8_loop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_horizontal_edge_uv_neon| PROC
stmdb sp!, {lr}
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
ldr r12, [sp, #4] ; load thresh
ldr r2, [sp, #8] ; load v ptr
vdup.u8 q2, r12 ; duplicate thresh
sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines
vld1.u8 {d6}, [r3], r1 ; p3
vld1.u8 {d8}, [r3], r1 ; p2
vld1.u8 {d10}, [r3], r1 ; p1
vld1.u8 {d12}, [r3], r1 ; p0
vld1.u8 {d14}, [r3], r1 ; q0
vld1.u8 {d16}, [r3], r1 ; q1
vld1.u8 {d18}, [r3], r1 ; q2
vld1.u8 {d20}, [r3] ; q3
ldr r3, [sp, #4] ; load thresh pointer
sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d7}, [r12], r1 ; p3
vld1.u8 {d9}, [r12], r1 ; p2
vld1.u8 {d11}, [r12], r1 ; p1
vld1.u8 {d13}, [r12], r1 ; p0
vld1.u8 {d15}, [r12], r1 ; q0
vld1.u8 {d17}, [r12], r1 ; q1
vld1.u8 {d19}, [r12], r1 ; q2
vld1.u8 {d21}, [r12] ; q3
vld1.s8 {d4[], d5[]}, [r3] ; thresh
vld1.u8 {d6}, [r3@64], r1 ; p3
vld1.u8 {d7}, [r12@64], r1 ; p3
vld1.u8 {d8}, [r3@64], r1 ; p2
vld1.u8 {d9}, [r12@64], r1 ; p2
vld1.u8 {d10}, [r3@64], r1 ; p1
vld1.u8 {d11}, [r12@64], r1 ; p1
vld1.u8 {d12}, [r3@64], r1 ; p0
vld1.u8 {d13}, [r12@64], r1 ; p0
vld1.u8 {d14}, [r3@64], r1 ; q0
vld1.u8 {d15}, [r12@64], r1 ; q0
vld1.u8 {d16}, [r3@64], r1 ; q1
vld1.u8 {d17}, [r12@64], r1 ; q1
vld1.u8 {d18}, [r3@64], r1 ; q2
vld1.u8 {d19}, [r12@64], r1 ; q2
vld1.u8 {d20}, [r3@64] ; q3
vld1.u8 {d21}, [r12@64] ; q3
bl vp8_loop_filter_neon
sub r0, r0, r1, lsl #1
sub r2, r2, r1, lsl #1
vst1.u8 {d10}, [r0], r1 ; store u op1
vst1.u8 {d11}, [r2], r1 ; store v op1
vst1.u8 {d12}, [r0], r1 ; store u op0
vst1.u8 {d13}, [r2], r1 ; store v op0
vst1.u8 {d14}, [r0], r1 ; store u oq0
vst1.u8 {d15}, [r2], r1 ; store v oq0
vst1.u8 {d16}, [r0] ; store u oq1
vst1.u8 {d17}, [r2] ; store v oq1
vst1.u8 {d10}, [r0@64], r1 ; store u op1
vst1.u8 {d11}, [r2@64], r1 ; store v op1
vst1.u8 {d12}, [r0@64], r1 ; store u op0
vst1.u8 {d13}, [r2@64], r1 ; store v op0
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
vst1.u8 {d15}, [r2@64], r1 ; store v oq0
vst1.u8 {d16}, [r0@64] ; store u oq1
vst1.u8 {d17}, [r2@64] ; store v oq1
ldmia sp!, {pc}
pop {pc}
ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon|
; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
@ -124,39 +112,38 @@
; const signed char *limit,
; const signed char *thresh,
; int count)
; r0 unsigned char *src,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 int count (unused)
; r0 unsigned char *src
; r1 int pitch
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_loop_filter_vertical_edge_y_neon| PROC
stmdb sp!, {lr}
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
sub r2, r0, #4 ; src ptr down by 4 columns
sub r0, r0, #2 ; dst ptr
ldr r12, [sp, #4] ; load thresh pointer
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
vdup.u8 q1, r3 ; duplicate limit
sub r2, r0, #4 ; src ptr down by 4 columns
add r1, r1, r1
ldr r3, [sp, #4] ; load thresh
add r12, r2, r1, asr #1
vld1.u8 {d6}, [r2], r1 ; load first 8-line src data
vld1.u8 {d8}, [r2], r1
vld1.u8 {d6}, [r2], r1
vld1.u8 {d8}, [r12], r1
vld1.u8 {d10}, [r2], r1
vld1.u8 {d12}, [r2], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d14}, [r2], r1
vld1.u8 {d16}, [r2], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d18}, [r2], r1
vld1.u8 {d20}, [r2], r1
vld1.s8 {d4[], d5[]}, [r12] ; thresh
vld1.u8 {d20}, [r12], r1
vld1.u8 {d7}, [r2], r1 ; load second 8-line src data
vld1.u8 {d9}, [r2], r1
vld1.u8 {d9}, [r12], r1
vld1.u8 {d11}, [r2], r1
vld1.u8 {d13}, [r2], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d15}, [r2], r1
vld1.u8 {d17}, [r2], r1
vld1.u8 {d19}, [r2], r1
vld1.u8 {d21}, [r2]
vld1.u8 {d17}, [r12], r1
vld1.u8 {d19}, [r2]
vld1.u8 {d21}, [r12]
;transpose to 8x16 matrix
vtrn.32 q3, q7
@ -164,6 +151,8 @@
vtrn.32 q5, q9
vtrn.32 q6, q10
vdup.u8 q2, r3 ; duplicate thresh
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
@ -178,28 +167,34 @@
vswp d12, d11
vswp d16, d13
sub r0, r0, #2 ; dst ptr
vswp d14, d12
vswp d16, d15
add r12, r0, r1, asr #1
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1
vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1
vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1
vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r0], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r0], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r0], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0], r1
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r0]
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1
ldmia sp!, {pc}
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1
vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1
vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1
vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1
vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1
vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1
vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12]
pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_y_neon|
; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch
@ -209,38 +204,36 @@
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_loop_filter_vertical_edge_uv_neon| PROC
stmdb sp!, {lr}
sub r12, r0, #4 ; move u pointer down by 4 columns
vld1.s8 {d0[], d1[]}, [r2] ; flimit
vld1.s8 {d2[], d3[]}, [r3] ; limit
push {lr}
vdup.u8 q0, r2 ; duplicate blimit
sub r12, r0, #4 ; move u pointer down by 4 columns
ldr r2, [sp, #8] ; load v ptr
vld1.u8 {d6}, [r12], r1 ;load u data
vld1.u8 {d8}, [r12], r1
vld1.u8 {d10}, [r12], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d14}, [r12], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d18}, [r12], r1
vld1.u8 {d20}, [r12]
vdup.u8 q1, r3 ; duplicate limit
sub r3, r2, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r12], r1 ;load u data
vld1.u8 {d7}, [r3], r1 ;load v data
vld1.u8 {d8}, [r12], r1
vld1.u8 {d9}, [r3], r1
vld1.u8 {d10}, [r12], r1
vld1.u8 {d11}, [r3], r1
vld1.u8 {d12}, [r12], r1
vld1.u8 {d13}, [r3], r1
vld1.u8 {d14}, [r12], r1
vld1.u8 {d15}, [r3], r1
vld1.u8 {d16}, [r12], r1
vld1.u8 {d17}, [r3], r1
vld1.u8 {d18}, [r12], r1
vld1.u8 {d19}, [r3], r1
vld1.u8 {d20}, [r12]
vld1.u8 {d21}, [r3]
ldr r12, [sp, #4] ; load thresh pointer
ldr r12, [sp, #4] ; load thresh
;transpose to 8x16 matrix
vtrn.32 q3, q7
@ -248,6 +241,8 @@
vtrn.32 q5, q9
vtrn.32 q6, q10
vdup.u8 q2, r12 ; duplicate thresh
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
@ -258,18 +253,16 @@
vtrn.8 q7, q8
vtrn.8 q9, q10
vld1.s8 {d4[], d5[]}, [r12] ; thresh
bl vp8_loop_filter_neon
sub r0, r0, #2
sub r2, r2, #2
vswp d12, d11
vswp d16, d13
vswp d14, d12
vswp d16, d15
sub r0, r0, #2
sub r2, r2, #2
;store op1, op0, oq0, oq1
vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1
@ -288,7 +281,7 @@
vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0]
vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2]
ldmia sp!, {pc}
pop {pc}
ENDP ; |vp8_loop_filter_vertical_edge_uv_neon|
; void vp8_loop_filter_neon();
@ -316,42 +309,44 @@
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
vabd.u8 q4, q10, q9 ; abs(q3 - q2)
vabd.u8 q9, q6, q7 ; abs(p0 - q0)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q3, q3, q4
vmax.u8 q15, q11, q12
vabd.u8 q9, q6, q7 ; abs(p0 - q0)
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
vmax.u8 q15, q15, q3
vadd.u8 q0, q0, q0 ; flimit * 2
vadd.u8 q0, q0, q1 ; flimit * 2 + limit
vcge.u8 q15, q1, q15
vmov.u8 q10, #0x80 ; 0x80
vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
vshr.u8 q2, q2, #1 ; a = a / 2
vqadd.u8 q9, q9, q2 ; a = b + a
vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
vmov.u8 q0, #0x80 ; 0x80
vcge.u8 q15, q1, q15
; vp8_filter() function
; convert to signed
veor q7, q7, q0 ; qs0
veor q6, q6, q0 ; ps0
veor q5, q5, q0 ; ps1
veor q8, q8, q0 ; qs1
veor q7, q7, q10 ; qs0
vshr.u8 q2, q2, #1 ; a = a / 2
veor q6, q6, q10 ; ps0
veor q5, q5, q10 ; ps1
vqadd.u8 q9, q9, q2 ; a = b + a
veor q8, q8, q10 ; qs1
vmov.u8 q10, #3 ; #3
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q11, d15, d13
vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1
vmovl.u8 q4, d20
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
@ -378,19 +373,20 @@
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1)
; outer tap adjustments: ++vp8_filter >> 1
vrshr.s8 q1, q1, #1
vbic q1, q1, q14 ; vp8_filter &= ~hev
vmov.u8 q0, #0x80 ; 0x80
vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter)
vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter)
veor q5, q13, q0 ; *op1 = u^0x80
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
veor q5, q13, q0 ; *op1 = u^0x80
veor q8, q12, q0 ; *oq1 = u^0x80
bx lr

View File

@ -9,99 +9,109 @@
;
EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
;EXPORT |vp8_loop_filter_simple_horizontal_edge_neon|
EXPORT |vp8_loop_filter_bhs_neon|
EXPORT |vp8_loop_filter_mbhs_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;Note: flimit, limit, and thresh shpuld be positive numbers. All 16 elements in flimit
;are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.
; r0 unsigned char *s,
; r1 int p, //pitch
; r2 const signed char *flimit,
; r3 const signed char *limit,
; stack(r4) const signed char *thresh (unused)
; //stack(r5) int count --unused
; r0 unsigned char *s, PRESERVE
; r1 int p, PRESERVE
; q1 limit, PRESERVE
|vp8_loop_filter_simple_horizontal_edge_neon| PROC
sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines
vld1.u8 {q5}, [r0], r1 ; p1
vld1.s8 {d2[], d3[]}, [r2] ; flimit
vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13
vld1.u8 {q6}, [r0], r1 ; p0
vmov.u8 q0, #0x80 ; 0x80
vld1.u8 {q7}, [r0], r1 ; q0
vmov.u8 q10, #0x03 ; 0x03
vld1.u8 {q8}, [r0] ; q1
sub r3, r0, r1, lsl #1 ; move src pointer down by 2 lines
vld1.u8 {q7}, [r0@128], r1 ; q0
vld1.u8 {q5}, [r3@128], r1 ; p0
vld1.u8 {q8}, [r0@128] ; q1
vld1.u8 {q6}, [r3@128] ; p1
;vp8_filter_mask() function
vabd.u8 q15, q6, q7 ; abs(p0 - q0)
vabd.u8 q14, q5, q8 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vmov.u8 q0, #0x80 ; 0x80
vmov.s16 q13, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
;vp8_filter() function
veor q7, q7, q0 ; qs0: q0 offset to convert to a signed value
veor q6, q6, q0 ; ps0: p0 offset to convert to a signed value
veor q5, q5, q0 ; ps1: p1 offset to convert to a signed value
veor q8, q8, q0 ; qs1: q1 offset to convert to a signed value
vadd.u8 q1, q1, q1 ; flimit * 2
vadd.u8 q1, q1, q13 ; flimit * 2 + limit
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
vcge.u8 q15, q1, q15 ; (abs(p0 - q0)*2 + abs(p1-q1)/2 > limit)*-1
;;;;;;;;;;
;vqsub.s8 q2, q7, q6 ; ( qs0 - ps0)
vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
vsubl.s8 q3, d15, d13
vqsub.s8 q4, q5, q8 ; q4: vp8_filter = vp8_signed_char_clamp(ps1-qs1)
;vmul.i8 q2, q2, q10 ; 3 * ( qs0 - ps0)
vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0)
vadd.s16 q12, q3, q3
vmul.s16 q2, q2, q13 ; 3 * ( qs0 - ps0)
vmul.s16 q3, q3, q13
vmov.u8 q10, #0x03 ; 0x03
vmov.u8 q9, #0x04 ; 0x04
vadd.s16 q2, q2, q11
vadd.s16 q3, q3, q12
vaddw.s8 q2, q2, d8 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q3, q3, d9
;vqadd.s8 q4, q4, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d8, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d9, q3
;;;;;;;;;;;;;
vand q4, q4, q15 ; vp8_filter &= mask
vand q14, q4, q15 ; vp8_filter &= mask
vqadd.s8 q2, q4, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q4, q4, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vqadd.s8 q2, q14, q10 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q3, q14, q9 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q4, q4, #3 ; Filter1 >>= 3
vshr.s8 q4, q3, #3 ; Filter1 >>= 3
sub r0, r0, r1, lsl #1
sub r0, r0, r1
;calculate output
vqadd.s8 q11, q6, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q7, q4 ; u = vp8_signed_char_clamp(qs0 - Filter1)
add r3, r0, r1
veor q6, q11, q0 ; *op0 = u^0x80
veor q7, q10, q0 ; *oq0 = u^0x80
vst1.u8 {q6}, [r0] ; store op0
vst1.u8 {q7}, [r3] ; store oq0
vst1.u8 {q6}, [r3@128] ; store op0
vst1.u8 {q7}, [r0@128] ; store oq0
bx lr
ENDP ; |vp8_loop_filter_simple_horizontal_edge_neon|
;-----------------
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_bhs_neon| PROC
push {r4, lr}
ldrb r3, [r2] ; load blim from mem
vdup.s8 q1, r3 ; duplicate blim
add r0, r0, r1, lsl #2 ; src = y_ptr + 4 * y_stride
bl vp8_loop_filter_simple_horizontal_edge_neon
; vp8_loop_filter_simple_horizontal_edge_neon preserves r0, r1 and q1
add r0, r0, r1, lsl #2 ; src = y_ptr + 8* y_stride
bl vp8_loop_filter_simple_horizontal_edge_neon
add r0, r0, r1, lsl #2 ; src = y_ptr + 12 * y_stride
pop {r4, lr}
b vp8_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp8_loop_filter_bhs_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_mbhs_neon| PROC
ldrb r3, [r2] ; load blim from mem
vdup.s8 q1, r3 ; duplicate mblim
b vp8_loop_filter_simple_horizontal_edge_neon
ENDP ;|vp8_loop_filter_bhs_neon|
END

View File

@ -9,59 +9,54 @@
;
EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
;EXPORT |vp8_loop_filter_simple_vertical_edge_neon|
EXPORT |vp8_loop_filter_bvs_neon|
EXPORT |vp8_loop_filter_mbvs_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
;Note: flimit, limit, and thresh should be positive numbers. All 16 elements in flimit
;are equal. So, in the code, only one load is needed
;for flimit. Same way applies to limit and thresh.
; r0 unsigned char *s,
; r1 int p, //pitch
; r2 const signed char *flimit,
; r3 const signed char *limit,
; stack(r4) const signed char *thresh (unused)
; //stack(r5) int count --unused
; r0 unsigned char *s, PRESERVE
; r1 int p, PRESERVE
; q1 limit, PRESERVE
|vp8_loop_filter_simple_vertical_edge_neon| PROC
sub r0, r0, #2 ; move src pointer down by 2 columns
add r12, r1, r1
add r3, r0, r1
vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r1
vld1.s8 {d2[], d3[]}, [r2] ; flimit
vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13
vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1
vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1
vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1
vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r0], r1
vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r1
vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1
vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12
vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12
vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12
vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12
vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12
vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12
vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12
vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1
vmov.u8 q0, #0x80 ; 0x80
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1
vmov.u8 q11, #0x03 ; 0x03
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1
vmov.u8 q12, #0x04 ; 0x04
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1
vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1
vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1
vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12
vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12
vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12
vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12
vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12
vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12
vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12
vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3]
vswp d7, d10
vswp d12, d9
;vswp q4, q5 ; p1:q3, p0:q5, q0:q4, q1:q6
;vp8_filter_mask() function
;vp8_hevmask() function
sub r0, r0, r1, lsl #4
vabd.u8 q15, q5, q4 ; abs(p0 - q0)
vabd.u8 q14, q3, q6 ; abs(p1 - q1)
vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2
vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2
vmov.u8 q0, #0x80 ; 0x80
vmov.s16 q11, #3
vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value
@ -69,80 +64,91 @@
veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value
veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value
vadd.u8 q1, q1, q1 ; flimit * 2
vadd.u8 q1, q1, q13 ; flimit * 2 + limit
vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1
;vp8_filter() function
;;;;;;;;;;
;vqsub.s8 q2, q5, q4 ; ( qs0 - ps0)
vsubl.s8 q2, d8, d10 ; ( qs0 - ps0)
vsubl.s8 q13, d9, d11
vqsub.s8 q1, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1)
;vmul.i8 q2, q2, q11 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vadd.s16 q10, q2, q2 ; 3 * ( qs0 - ps0)
vadd.s16 q14, q13, q13
vadd.s16 q2, q2, q10
vadd.s16 q13, q13, q14
vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0)
vmul.s16 q13, q13, q11
;vqadd.s8 q1, q1, q2
vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d3
vmov.u8 q11, #0x03 ; 0x03
vmov.u8 q12, #0x04 ; 0x04
vqmovn.s16 d2, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d3, q13
vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0)
vaddw.s8 q13, q13, d29
vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
vqmovn.s16 d29, q13
add r0, r0, #1
add r2, r0, r1
;;;;;;;;;;;
add r3, r0, r1
vand q1, q1, q15 ; vp8_filter &= mask
vand q14, q14, q15 ; vp8_filter &= mask
vqadd.s8 q2, q1, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q1, q1, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3)
vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4)
vshr.s8 q2, q2, #3 ; Filter2 >>= 3
vshr.s8 q1, q1, #3 ; Filter1 >>= 3
vshr.s8 q14, q3, #3 ; Filter1 >>= 3
;calculate output
vqsub.s8 q10, q4, q1 ; u = vp8_signed_char_clamp(qs0 - Filter1)
vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2)
vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1)
veor q7, q10, q0 ; *oq0 = u^0x80
veor q6, q11, q0 ; *op0 = u^0x80
add r3, r2, r1
veor q7, q10, q0 ; *oq0 = u^0x80
add r12, r1, r1
vswp d13, d14
add r12, r3, r1
;store op1, op0, oq0, oq1
vst2.8 {d12[0], d13[0]}, [r0]
vst2.8 {d12[1], d13[1]}, [r2]
vst2.8 {d12[2], d13[2]}, [r3]
vst2.8 {d12[3], d13[3]}, [r12], r1
add r0, r12, r1
vst2.8 {d12[4], d13[4]}, [r12]
vst2.8 {d12[5], d13[5]}, [r0], r1
add r2, r0, r1
vst2.8 {d12[6], d13[6]}, [r0]
vst2.8 {d12[7], d13[7]}, [r2], r1
add r3, r2, r1
vst2.8 {d14[0], d15[0]}, [r2]
vst2.8 {d14[1], d15[1]}, [r3], r1
add r12, r3, r1
vst2.8 {d14[2], d15[2]}, [r3]
vst2.8 {d14[3], d15[3]}, [r12], r1
add r0, r12, r1
vst2.8 {d14[4], d15[4]}, [r12]
vst2.8 {d14[5], d15[5]}, [r0], r1
add r2, r0, r1
vst2.8 {d14[6], d15[6]}, [r0]
vst2.8 {d14[7], d15[7]}, [r2]
vst2.8 {d12[0], d13[0]}, [r0], r12
vst2.8 {d12[1], d13[1]}, [r3], r12
vst2.8 {d12[2], d13[2]}, [r0], r12
vst2.8 {d12[3], d13[3]}, [r3], r12
vst2.8 {d12[4], d13[4]}, [r0], r12
vst2.8 {d12[5], d13[5]}, [r3], r12
vst2.8 {d12[6], d13[6]}, [r0], r12
vst2.8 {d12[7], d13[7]}, [r3], r12
vst2.8 {d14[0], d15[0]}, [r0], r12
vst2.8 {d14[1], d15[1]}, [r3], r12
vst2.8 {d14[2], d15[2]}, [r0], r12
vst2.8 {d14[3], d15[3]}, [r3], r12
vst2.8 {d14[4], d15[4]}, [r0], r12
vst2.8 {d14[5], d15[5]}, [r3], r12
vst2.8 {d14[6], d15[6]}, [r0], r12
vst2.8 {d14[7], d15[7]}, [r3]
bx lr
ENDP ; |vp8_loop_filter_simple_vertical_edge_neon|
;-----------------
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_bvs_neon| PROC
push {r4, lr}
ldrb r3, [r2] ; load blim from mem
mov r4, r0
add r0, r0, #4
vdup.s8 q1, r3 ; duplicate blim
bl vp8_loop_filter_simple_vertical_edge_neon
; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1
add r0, r4, #8
bl vp8_loop_filter_simple_vertical_edge_neon
add r0, r4, #12
pop {r4, lr}
b vp8_loop_filter_simple_vertical_edge_neon
ENDP ;|vp8_loop_filter_bvs_neon|
; r0 unsigned char *y
; r1 int ystride
; r2 const unsigned char *blimit
|vp8_loop_filter_mbvs_neon| PROC
ldrb r3, [r2] ; load mblim from mem
vdup.s8 q1, r3 ; duplicate mblim
b vp8_loop_filter_simple_vertical_edge_neon
ENDP ;|vp8_loop_filter_bvs_neon|
END

View File

@ -14,155 +14,143 @@
EXPORT |vp8_mbloop_filter_vertical_edge_y_neon|
EXPORT |vp8_mbloop_filter_vertical_edge_uv_neon|
ARM
REQUIRE8
PRESERVE8
AREA ||.text||, CODE, READONLY, ALIGN=2
; flimit, limit, and thresh should be positive numbers.
; All 16 elements in these variables are equal.
; void vp8_mbloop_filter_horizontal_edge_y_neon(unsigned char *src, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; int count)
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh)
; r0 unsigned char *src,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 int count (unused)
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_mbloop_filter_horizontal_edge_y_neon| PROC
stmdb sp!, {lr}
sub r0, r0, r1, lsl #2 ; move src pointer down by 4 lines
ldr r12, [sp, #4] ; load thresh pointer
push {lr}
add r1, r1, r1 ; double stride
ldr r12, [sp, #4] ; load thresh
sub r0, r0, r1, lsl #1 ; move src pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
add r12, r0, r1, lsr #1 ; move src pointer up by 1 line
vld1.u8 {q3}, [r0], r1 ; p3
vld1.s8 {d2[], d3[]}, [r3] ; limit
vld1.u8 {q4}, [r0], r1 ; p2
vld1.s8 {d4[], d5[]}, [r12] ; thresh
vld1.u8 {q5}, [r0], r1 ; p1
vld1.u8 {q6}, [r0], r1 ; p0
vld1.u8 {q7}, [r0], r1 ; q0
vld1.u8 {q8}, [r0], r1 ; q1
vld1.u8 {q9}, [r0], r1 ; q2
vld1.u8 {q10}, [r0], r1 ; q3
vld1.u8 {q3}, [r0@128], r1 ; p3
vld1.u8 {q4}, [r12@128], r1 ; p2
vld1.u8 {q5}, [r0@128], r1 ; p1
vld1.u8 {q6}, [r12@128], r1 ; p0
vld1.u8 {q7}, [r0@128], r1 ; q0
vld1.u8 {q8}, [r12@128], r1 ; q1
vld1.u8 {q9}, [r0@128], r1 ; q2
vld1.u8 {q10}, [r12@128], r1 ; q3
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
add r0, r0, r1
add r2, r0, r1
add r3, r2, r1
sub r12, r12, r1, lsl #2
add r0, r12, r1, lsr #1
vst1.u8 {q4}, [r0] ; store op2
vst1.u8 {q5}, [r2] ; store op1
vst1.u8 {q6}, [r3], r1 ; store op0
add r12, r3, r1
vst1.u8 {q7}, [r3] ; store oq0
vst1.u8 {q8}, [r12], r1 ; store oq1
vst1.u8 {q9}, [r12] ; store oq2
vst1.u8 {q4}, [r12@128],r1 ; store op2
vst1.u8 {q5}, [r0@128],r1 ; store op1
vst1.u8 {q6}, [r12@128], r1 ; store op0
vst1.u8 {q7}, [r0@128],r1 ; store oq0
vst1.u8 {q8}, [r12@128] ; store oq1
vst1.u8 {q9}, [r0@128] ; store oq2
ldmia sp!, {pc}
pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_y_neon|
; void vp8_mbloop_filter_horizontal_edge_uv_neon(unsigned char *u, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
stmdb sp!, {lr}
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
vld1.s8 {d2[], d3[]}, [r3] ; limit
ldr r3, [sp, #8] ; load v ptr
ldr r12, [sp, #4] ; load thresh pointer
sub r3, r3, r1, lsl #2 ; move v pointer down by 4 lines
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, r1, lsl #2 ; move u pointer down by 4 lines
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
sub r12, r12, r1, lsl #2 ; move v pointer down by 4 lines
vld1.u8 {d6}, [r0], r1 ; p3
vld1.u8 {d7}, [r3], r1 ; p3
vld1.u8 {d8}, [r0], r1 ; p2
vld1.u8 {d9}, [r3], r1 ; p2
vld1.u8 {d10}, [r0], r1 ; p1
vld1.u8 {d11}, [r3], r1 ; p1
vld1.u8 {d12}, [r0], r1 ; p0
vld1.u8 {d13}, [r3], r1 ; p0
vld1.u8 {d14}, [r0], r1 ; q0
vld1.u8 {d15}, [r3], r1 ; q0
vld1.u8 {d16}, [r0], r1 ; q1
vld1.u8 {d17}, [r3], r1 ; q1
vld1.u8 {d18}, [r0], r1 ; q2
vld1.u8 {d19}, [r3], r1 ; q2
vld1.u8 {d20}, [r0], r1 ; q3
vld1.u8 {d21}, [r3], r1 ; q3
vld1.s8 {d4[], d5[]}, [r12] ; thresh
vld1.u8 {d6}, [r0@64], r1 ; p3
vld1.u8 {d7}, [r12@64], r1 ; p3
vld1.u8 {d8}, [r0@64], r1 ; p2
vld1.u8 {d9}, [r12@64], r1 ; p2
vld1.u8 {d10}, [r0@64], r1 ; p1
vld1.u8 {d11}, [r12@64], r1 ; p1
vld1.u8 {d12}, [r0@64], r1 ; p0
vld1.u8 {d13}, [r12@64], r1 ; p0
vld1.u8 {d14}, [r0@64], r1 ; q0
vld1.u8 {d15}, [r12@64], r1 ; q0
vld1.u8 {d16}, [r0@64], r1 ; q1
vld1.u8 {d17}, [r12@64], r1 ; q1
vld1.u8 {d18}, [r0@64], r1 ; q2
vld1.u8 {d19}, [r12@64], r1 ; q2
vld1.u8 {d20}, [r0@64], r1 ; q3
vld1.u8 {d21}, [r12@64], r1 ; q3
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
sub r3, r3, r1, lsl #3
sub r12, r12, r1, lsl #3
add r0, r0, r1
add r3, r3, r1
add r12, r12, r1
vst1.u8 {d8}, [r0], r1 ; store u op2
vst1.u8 {d9}, [r3], r1 ; store v op2
vst1.u8 {d10}, [r0], r1 ; store u op1
vst1.u8 {d11}, [r3], r1 ; store v op1
vst1.u8 {d12}, [r0], r1 ; store u op0
vst1.u8 {d13}, [r3], r1 ; store v op0
vst1.u8 {d14}, [r0], r1 ; store u oq0
vst1.u8 {d15}, [r3], r1 ; store v oq0
vst1.u8 {d16}, [r0], r1 ; store u oq1
vst1.u8 {d17}, [r3], r1 ; store v oq1
vst1.u8 {d18}, [r0], r1 ; store u oq2
vst1.u8 {d19}, [r3], r1 ; store v oq2
vst1.u8 {d8}, [r0@64], r1 ; store u op2
vst1.u8 {d9}, [r12@64], r1 ; store v op2
vst1.u8 {d10}, [r0@64], r1 ; store u op1
vst1.u8 {d11}, [r12@64], r1 ; store v op1
vst1.u8 {d12}, [r0@64], r1 ; store u op0
vst1.u8 {d13}, [r12@64], r1 ; store v op0
vst1.u8 {d14}, [r0@64], r1 ; store u oq0
vst1.u8 {d15}, [r12@64], r1 ; store v oq0
vst1.u8 {d16}, [r0@64], r1 ; store u oq1
vst1.u8 {d17}, [r12@64], r1 ; store v oq1
vst1.u8 {d18}, [r0@64], r1 ; store u oq2
vst1.u8 {d19}, [r12@64], r1 ; store v oq2
ldmia sp!, {pc}
pop {pc}
ENDP ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
; void vp8_mbloop_filter_vertical_edge_y_neon(unsigned char *src, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; int count)
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh)
; r0 unsigned char *src,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 int count (unused)
; r2 unsigned char blimit
; r3 unsigned char limit
; sp unsigned char thresh,
|vp8_mbloop_filter_vertical_edge_y_neon| PROC
stmdb sp!, {lr}
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, #4 ; move src pointer down by 4 columns
vdup.s8 q2, r12 ; thresh
add r12, r0, r1, lsl #3 ; move src pointer down by 8 lines
vld1.u8 {d6}, [r0], r1 ; load first 8-line src data
ldr r12, [sp, #4] ; load thresh pointer
vld1.u8 {d7}, [r12], r1 ; load second 8-line src data
vld1.u8 {d8}, [r0], r1
sub sp, sp, #32
vld1.u8 {d9}, [r12], r1
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r12], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r12], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r12], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d7}, [r0], r1 ; load second 8-line src data
vld1.u8 {d9}, [r0], r1
vld1.u8 {d11}, [r0], r1
vld1.u8 {d13}, [r0], r1
vld1.u8 {d15}, [r0], r1
vld1.u8 {d17}, [r0], r1
vld1.u8 {d19}, [r0], r1
vld1.u8 {d21}, [r0], r1
vld1.u8 {d21}, [r12], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
@ -180,133 +168,11 @@
vtrn.8 q7, q8
vtrn.8 q9, q10
vld1.s8 {d4[], d5[]}, [r12] ; thresh
vld1.s8 {d2[], d3[]}, [r3] ; limit
mov r12, sp
vst1.u8 {q3}, [r12]!
vst1.u8 {q10}, [r12]!
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #4
add r2, r0, r1
add r3, r2, r1
vld1.u8 {q3}, [sp]!
vld1.u8 {q10}, [sp]!
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
add r12, r3, r1
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0]
vst1.8 {d8}, [r2]
vst1.8 {d10}, [r3]
vst1.8 {d12}, [r12], r1
add r0, r12, r1
vst1.8 {d14}, [r12]
vst1.8 {d16}, [r0], r1
add r2, r0, r1
vst1.8 {d18}, [r0]
vst1.8 {d20}, [r2], r1
add r3, r2, r1
vst1.8 {d7}, [r2]
vst1.8 {d9}, [r3], r1
add r12, r3, r1
vst1.8 {d11}, [r3]
vst1.8 {d13}, [r12], r1
add r0, r12, r1
vst1.8 {d15}, [r12]
vst1.8 {d17}, [r0], r1
add r2, r0, r1
vst1.8 {d19}, [r0]
vst1.8 {d21}, [r2]
ldmia sp!, {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
; const signed char *flimit,
; const signed char *limit,
; const signed char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
stmdb sp!, {lr}
sub r0, r0, #4 ; move src pointer down by 4 columns
vld1.s8 {d2[], d3[]}, [r3] ; limit
ldr r3, [sp, #8] ; load v ptr
ldr r12, [sp, #4] ; load thresh pointer
sub r3, r3, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r0], r1 ;load u data
vld1.u8 {d7}, [r3], r1 ;load v data
vld1.u8 {d8}, [r0], r1
vld1.u8 {d9}, [r3], r1
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r3], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r3], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r3], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r3], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r3], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r3], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
sub sp, sp, #32
vld1.s8 {d4[], d5[]}, [r12] ; thresh
mov r12, sp
vst1.u8 {q3}, [r12]!
vst1.u8 {q10}, [r12]!
bl vp8_mbloop_filter_neon
sub r0, r0, r1, lsl #3
sub r3, r3, r1, lsl #3
vld1.u8 {q3}, [sp]!
vld1.u8 {q10}, [sp]!
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #3
;transpose to 16x8 matrix
vtrn.32 q3, q7
@ -326,23 +192,118 @@
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r3], r1
vst1.8 {d7}, [r12], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d9}, [r3], r1
vst1.8 {d9}, [r12], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d11}, [r3], r1
vst1.8 {d11}, [r12], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r3], r1
vst1.8 {d13}, [r12], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d15}, [r3], r1
vst1.8 {d15}, [r12], r1
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r3], r1
vst1.8 {d17}, [r12], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d19}, [r3], r1
vst1.8 {d20}, [r0], r1
vst1.8 {d21}, [r3], r1
vst1.8 {d19}, [r12], r1
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
ldmia sp!, {pc}
pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_y_neon|
; void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
; const unsigned char *blimit,
; const unsigned char *limit,
; const unsigned char *thresh,
; unsigned char *v)
; r0 unsigned char *u,
; r1 int pitch,
; r2 const signed char *flimit,
; r3 const signed char *limit,
; sp const signed char *thresh,
; sp+4 unsigned char *v
|vp8_mbloop_filter_vertical_edge_uv_neon| PROC
push {lr}
ldr r12, [sp, #4] ; load thresh
sub r0, r0, #4 ; move u pointer down by 4 columns
vdup.u8 q2, r12 ; thresh
ldr r12, [sp, #8] ; load v ptr
sub r12, r12, #4 ; move v pointer down by 4 columns
vld1.u8 {d6}, [r0], r1 ;load u data
vld1.u8 {d7}, [r12], r1 ;load v data
vld1.u8 {d8}, [r0], r1
vld1.u8 {d9}, [r12], r1
vld1.u8 {d10}, [r0], r1
vld1.u8 {d11}, [r12], r1
vld1.u8 {d12}, [r0], r1
vld1.u8 {d13}, [r12], r1
vld1.u8 {d14}, [r0], r1
vld1.u8 {d15}, [r12], r1
vld1.u8 {d16}, [r0], r1
vld1.u8 {d17}, [r12], r1
vld1.u8 {d18}, [r0], r1
vld1.u8 {d19}, [r12], r1
vld1.u8 {d20}, [r0], r1
vld1.u8 {d21}, [r12], r1
;transpose to 8x16 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
sub r0, r0, r1, lsl #3
bl vp8_mbloop_filter_neon
sub r12, r12, r1, lsl #3
;transpose to 16x8 matrix
vtrn.32 q3, q7
vtrn.32 q4, q8
vtrn.32 q5, q9
vtrn.32 q6, q10
vtrn.16 q3, q5
vtrn.16 q4, q6
vtrn.16 q7, q9
vtrn.16 q8, q10
vtrn.8 q3, q4
vtrn.8 q5, q6
vtrn.8 q7, q8
vtrn.8 q9, q10
;store op2, op1, op0, oq0, oq1, oq2
vst1.8 {d6}, [r0], r1
vst1.8 {d7}, [r12], r1
vst1.8 {d8}, [r0], r1
vst1.8 {d9}, [r12], r1
vst1.8 {d10}, [r0], r1
vst1.8 {d11}, [r12], r1
vst1.8 {d12}, [r0], r1
vst1.8 {d13}, [r12], r1
vst1.8 {d14}, [r0], r1
vst1.8 {d15}, [r12], r1
vst1.8 {d16}, [r0], r1
vst1.8 {d17}, [r12], r1
vst1.8 {d18}, [r0], r1
vst1.8 {d19}, [r12], r1
vst1.8 {d20}, [r0]
vst1.8 {d21}, [r12]
pop {pc}
ENDP ; |vp8_mbloop_filter_vertical_edge_uv_neon|
; void vp8_mbloop_filter_neon()
@ -350,26 +311,19 @@
; functions do the necessary load, transpose (if necessary), preserve (if
; necessary) and store.
; TODO:
; The vertical filter writes p3/q3 back out because two 4 element writes are
; much simpler than ordering and writing two 3 element sets (or three 2 elements
; sets, or whichever other combinations are possible).
; If we can preserve q3 and q10, the vertical filter will be able to avoid
; storing those values on the stack and reading them back after the filter.
; r0,r1 PRESERVE
; r2 flimit
; r3 PRESERVE
; q1 limit
; r2 mblimit
; r3 limit
; q2 thresh
; q3 p3
; q3 p3 PRESERVE
; q4 p2
; q5 p1
; q6 p0
; q7 q0
; q8 q1
; q9 q2
; q10 q3
; q10 q3 PRESERVE
|vp8_mbloop_filter_neon| PROC
@ -378,12 +332,12 @@
vabd.u8 q12, q4, q5 ; abs(p2 - p1)
vabd.u8 q13, q5, q6 ; abs(p1 - p0)
vabd.u8 q14, q8, q7 ; abs(q1 - q0)
vabd.u8 q3, q9, q8 ; abs(q2 - q1)
vabd.u8 q1, q9, q8 ; abs(q2 - q1)
vabd.u8 q0, q10, q9 ; abs(q3 - q2)
vmax.u8 q11, q11, q12
vmax.u8 q12, q13, q14
vmax.u8 q3, q3, q0
vmax.u8 q1, q1, q0
vmax.u8 q15, q11, q12
vabd.u8 q12, q6, q7 ; abs(p0 - q0)
@ -391,44 +345,46 @@
; vp8_hevmask
vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh) * -1
vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh) * -1
vmax.u8 q15, q15, q3
vmax.u8 q15, q15, q1
vld1.s8 {d4[], d5[]}, [r2] ; flimit
vdup.u8 q1, r3 ; limit
vdup.u8 q2, r2 ; mblimit
vmov.u8 q0, #0x80 ; 0x80
vadd.u8 q2, q2, q2 ; flimit * 2
vadd.u8 q2, q2, q1 ; flimit * 2 + limit
vcge.u8 q15, q1, q15
vabd.u8 q1, q5, q8 ; a = abs(p1 - q1)
vqadd.u8 q12, q12, q12 ; b = abs(p0 - q0) * 2
vshr.u8 q1, q1, #1 ; a = a / 2
vqadd.u8 q12, q12, q1 ; a = b + a
vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
vmov.u16 q11, #3 ; #3
; vp8_filter
; convert to signed
veor q7, q7, q0 ; qs0
vshr.u8 q1, q1, #1 ; a = a / 2
veor q6, q6, q0 ; ps0
veor q5, q5, q0 ; ps1
vqadd.u8 q12, q12, q1 ; a = b + a
veor q8, q8, q0 ; qs1
veor q4, q4, q0 ; ps2
veor q9, q9, q0 ; qs2
vorr q14, q13, q14 ; vp8_hevmask
vcge.u8 q12, q2, q12 ; (a > flimit * 2 + limit) * -1
vsubl.s8 q2, d14, d12 ; qs0 - ps0
vsubl.s8 q13, d15, d13
vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1)
vadd.s16 q10, q2, q2 ; 3 * (qs0 - ps0)
vadd.s16 q11, q13, q13
vmul.i16 q2, q2, q11 ; 3 * ( qs0 - ps0)
vand q15, q15, q12 ; vp8_filter_mask
vadd.s16 q2, q2, q10
vadd.s16 q13, q13, q11
vmul.i16 q13, q13, q11
vmov.u8 q12, #3 ; #3
@ -447,23 +403,19 @@
vand q13, q1, q14 ; Filter2 &= hev
vmov.u8 d7, #9 ; #9
vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4)
vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3)
vmov.u8 d6, #18 ; #18
vmov q0, q15
vshr.s8 q2, q2, #3 ; Filter1 >>= 3
vshr.s8 q13, q13, #3 ; Filter2 >>= 3
vmov q10, q15
vmov q11, q15
vmov q12, q15
vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1)
vmov.u8 d5, #27 ; #27
vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2)
vbic q1, q1, q14 ; vp8_filter &= ~hev
@ -471,35 +423,43 @@
; roughly 1/7th difference across boundary
; roughly 2/7th difference across boundary
; roughly 3/7th difference across boundary
vmov q11, q15
vmov.u8 d5, #9 ; #9
vmov.u8 d4, #18 ; #18
vmov q13, q15
vmov q14, q15
vmlal.s8 q10, d2, d7 ; Filter2 * 9
vmlal.s8 q11, d3, d7
vmlal.s8 q12, d2, d6 ; Filter2 * 18
vmlal.s8 q13, d3, d6
vmlal.s8 q14, d2, d5 ; Filter2 * 27
vmlal.s8 q0, d2, d5 ; 63 + Filter2 * 9
vmlal.s8 q11, d3, d5
vmov.u8 d5, #27 ; #27
vmlal.s8 q12, d2, d4 ; 63 + Filter2 * 18
vmlal.s8 q13, d3, d4
vmlal.s8 q14, d2, d5 ; 63 + Filter2 * 27
vmlal.s8 q15, d3, d5
vqshrn.s16 d20, q10, #7 ; u = clamp((63 + Filter2 * 9)>>7)
vqshrn.s16 d21, q11, #7
vqshrn.s16 d0, q0, #7 ; u = clamp((63 + Filter2 * 9)>>7)
vqshrn.s16 d1, q11, #7
vqshrn.s16 d24, q12, #7 ; u = clamp((63 + Filter2 * 18)>>7)
vqshrn.s16 d25, q13, #7
vqshrn.s16 d28, q14, #7 ; u = clamp((63 + Filter2 * 27)>>7)
vqshrn.s16 d29, q15, #7
vqsub.s8 q11, q9, q10 ; s = clamp(qs2 - u)
vqadd.s8 q10, q4, q10 ; s = clamp(ps2 + u)
vmov.u8 q1, #0x80 ; 0x80
vqsub.s8 q11, q9, q0 ; s = clamp(qs2 - u)
vqadd.s8 q0, q4, q0 ; s = clamp(ps2 + u)
vqsub.s8 q13, q8, q12 ; s = clamp(qs1 - u)
vqadd.s8 q12, q5, q12 ; s = clamp(ps1 + u)
vqsub.s8 q15, q7, q14 ; s = clamp(qs0 - u)
vqadd.s8 q14, q6, q14 ; s = clamp(ps0 + u)
veor q9, q11, q0 ; *oq2 = s^0x80
veor q4, q10, q0 ; *op2 = s^0x80
veor q8, q13, q0 ; *oq1 = s^0x80
veor q5, q12, q0 ; *op2 = s^0x80
veor q7, q15, q0 ; *oq0 = s^0x80
veor q6, q14, q0 ; *op0 = s^0x80
veor q9, q11, q1 ; *oq2 = s^0x80
veor q4, q0, q1 ; *op2 = s^0x80
veor q8, q13, q1 ; *oq1 = s^0x80
veor q5, q12, q1 ; *op2 = s^0x80
veor q7, q15, q1 ; *oq0 = s^0x80
veor q6, q14, q1 ; *op0 = s^0x80
bx lr
ENDP ; |vp8_mbloop_filter_neon|

View File

@ -108,9 +108,9 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_c;
rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_c;
rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_c;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_c;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_c;
rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_c;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_c;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_c;
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_c;
#if CONFIG_POSTPROC || (CONFIG_VP8_ENCODER && CONFIG_INTERNAL_STATS)

View File

@ -9,152 +9,149 @@
*/
#include "vpx_ports/config.h"
#include "vpx_config.h"
#include "loopfilter.h"
#include "onyxc_int.h"
#include "vpx_mem/vpx_mem.h"
typedef unsigned char uc;
prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
/* Horizontal MB filtering */
void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
{
vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Vertical MB Filtering */
void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
{
vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
/* Horizontal B Filtering */
void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
{
vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, blimit);
}
/* Vertical B Filtering */
void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi)
{
vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride,
const unsigned char *blimit)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, blimit);
}
void vp8_init_loop_filter(VP8_COMMON *cm)
static void lf_init_lut(loop_filter_info_n *lfi)
{
loop_filter_info *lfi = cm->lf_info;
LOOPFILTERTYPE lft = cm->filter_type;
int sharpness_lvl = cm->sharpness_level;
int frame_type = cm->frame_type;
int i, j;
int filt_lvl;
int block_inside_limit = 0;
int HEVThresh;
/* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
for (filt_lvl = 0; filt_lvl <= MAX_LOOP_FILTER; filt_lvl++)
{
int filt_lvl = i;
if (frame_type == KEY_FRAME)
if (filt_lvl >= 40)
{
if (filt_lvl >= 40)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 2;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 3;
}
else if (filt_lvl >= 20)
{
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 2;
}
else if (filt_lvl >= 15)
{
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 1;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 1;
}
else
{
if (filt_lvl >= 40)
HEVThresh = 3;
else if (filt_lvl >= 20)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
lfi->hev_thr_lut[KEY_FRAME][filt_lvl] = 0;
lfi->hev_thr_lut[INTER_FRAME][filt_lvl] = 0;
}
}
lfi->mode_lf_lut[DC_PRED] = 1;
lfi->mode_lf_lut[V_PRED] = 1;
lfi->mode_lf_lut[H_PRED] = 1;
lfi->mode_lf_lut[TM_PRED] = 1;
lfi->mode_lf_lut[B_PRED] = 0;
lfi->mode_lf_lut[ZEROMV] = 1;
lfi->mode_lf_lut[NEARESTMV] = 2;
lfi->mode_lf_lut[NEARMV] = 2;
lfi->mode_lf_lut[NEWMV] = 2;
lfi->mode_lf_lut[SPLITMV] = 3;
}
void vp8_loop_filter_update_sharpness(loop_filter_info_n *lfi,
int sharpness_lvl)
{
int i;
/* For each possible value for the loop filter fill out limits */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
{
int filt_lvl = i;
int block_inside_limit = 0;
/* Set loop filter paramaeters that control sharpness. */
block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
@ -169,119 +166,120 @@ void vp8_init_loop_filter(VP8_COMMON *cm)
if (block_inside_limit < 1)
block_inside_limit = 1;
for (j = 0; j < 16; j++)
{
lfi[i].lim[j] = block_inside_limit;
lfi[i].mbflim[j] = filt_lvl + 2;
lfi[i].flim[j] = filt_lvl;
lfi[i].thr[j] = HEVThresh;
}
}
/* Set up the function pointers depending on the type of loop filtering selected */
if (lft == NORMAL_LOOPFILTER)
{
cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v);
cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v);
cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h);
cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h);
}
else
{
cm->lf_mbv = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v);
cm->lf_bv = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v);
cm->lf_mbh = LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h);
cm->lf_bh = LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h);
vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
SIMD_WIDTH);
vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
SIMD_WIDTH);
}
}
/* Put vp8_init_loop_filter() in vp8dx_create_decompressor(). Only call vp8_frame_init_loop_filter() while decoding
* each frame. Check last_frame_type to skip the function most of times.
*/
void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type)
void vp8_loop_filter_init(VP8_COMMON *cm)
{
int HEVThresh;
int i, j;
loop_filter_info_n *lfi = &cm->lf_info;
int i;
/* For each possible value for the loop filter fill out a "loop_filter_info" entry. */
for (i = 0; i <= MAX_LOOP_FILTER; i++)
/* init limits for given sharpness*/
vp8_loop_filter_update_sharpness(lfi, cm->sharpness_level);
cm->last_sharpness_level = cm->sharpness_level;
/* init LUT for lvl and hev thr picking */
lf_init_lut(lfi);
/* init hev threshold const vectors */
for(i = 0; i < 4 ; i++)
{
int filt_lvl = i;
if (frame_type == KEY_FRAME)
{
if (filt_lvl >= 40)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
}
else
{
if (filt_lvl >= 40)
HEVThresh = 3;
else if (filt_lvl >= 20)
HEVThresh = 2;
else if (filt_lvl >= 15)
HEVThresh = 1;
else
HEVThresh = 0;
}
for (j = 0; j < 16; j++)
{
/*lfi[i].lim[j] = block_inside_limit;
lfi[i].mbflim[j] = filt_lvl+2;*/
/*lfi[i].flim[j] = filt_lvl;*/
lfi[i].thr[j] = HEVThresh;
}
vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
}
}
int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level)
void vp8_loop_filter_frame_init(VP8_COMMON *cm,
MACROBLOCKD *mbd,
int default_filt_lvl,
int sharpness_lvl)
{
MB_MODE_INFO *mbmi = &mbd->mode_info_context->mbmi;
int seg, /* segment number */
ref, /* index in ref_lf_deltas */
mode; /* index in mode_lf_deltas */
if (mbd->mode_ref_lf_delta_enabled)
loop_filter_info_n *lfi = &cm->lf_info;
/* update limits if sharpness has changed */
if(cm->last_sharpness_level != sharpness_lvl)
{
vp8_loop_filter_update_sharpness(lfi, sharpness_lvl);
cm->last_sharpness_level = sharpness_lvl;
}
for(seg = 0; seg < MAX_MB_SEGMENTS; seg++)
{
int lvl_seg = default_filt_lvl;
int lvl_ref, lvl_mode;
/* Note the baseline filter values for each segment */
if (mbd->segmentation_enabled)
{
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
{
lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
}
else /* Delta Value */
{
lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg];
lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63: lvl_seg) : 0;
}
}
if (!mbd->mode_ref_lf_delta_enabled)
{
/* we could get rid of this if we assume that deltas are set to
* zero when not in use; encoder always uses deltas
*/
vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4 );
continue;
}
lvl_ref = lvl_seg;
/* INTRA_FRAME */
ref = INTRA_FRAME;
/* Apply delta for reference frame */
filter_level += mbd->ref_lf_deltas[mbmi->ref_frame];
lvl_ref += mbd->ref_lf_deltas[ref];
/* Apply delta for mode */
if (mbmi->ref_frame == INTRA_FRAME)
/* Apply delta for Intra modes */
mode = 0; /* B_PRED */
/* Only the split mode BPRED has a further special case */
lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
lfi->lvl[seg][ref][mode] = lvl_mode;
mode = 1; /* all the rest of Intra modes */
lvl_mode = (lvl_ref > 0) ? (lvl_ref > 63 ? 63 : lvl_ref) : 0; /* clamp */
lfi->lvl[seg][ref][mode] = lvl_mode;
/* LAST, GOLDEN, ALT */
for(ref = 1; ref < MAX_REF_FRAMES; ref++)
{
/* Only the split mode BPRED has a further special case */
if (mbmi->mode == B_PRED)
filter_level += mbd->mode_lf_deltas[0];
int lvl_ref = lvl_seg;
/* Apply delta for reference frame */
lvl_ref += mbd->ref_lf_deltas[ref];
/* Apply delta for Inter modes */
for (mode = 1; mode < 4; mode++)
{
lvl_mode = lvl_ref + mbd->mode_lf_deltas[mode];
lvl_mode = (lvl_mode > 0) ? (lvl_mode > 63 ? 63 : lvl_mode) : 0; /* clamp */
lfi->lvl[seg][ref][mode] = lvl_mode;
}
}
else
{
/* Zero motion mode */
if (mbmi->mode == ZEROMV)
filter_level += mbd->mode_lf_deltas[1];
/* Split MB motion mode */
else if (mbmi->mode == SPLITMV)
filter_level += mbd->mode_lf_deltas[3];
/* All other inter motion modes (Nearest, Near, New) */
else
filter_level += mbd->mode_lf_deltas[2];
}
/* Range check */
if (filter_level > MAX_LOOP_FILTER)
filter_level = MAX_LOOP_FILTER;
else if (filter_level < 0)
filter_level = 0;
}
return filter_level;
}
void vp8_loop_filter_frame
(
VP8_COMMON *cm,
@ -290,49 +288,23 @@ void vp8_loop_filter_frame
)
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
loop_filter_info *lfi = cm->lf_info;
loop_filter_info_n *lfi_n = &cm->lf_info;
loop_filter_info lfi;
FRAME_TYPE frame_type = cm->frame_type;
int mb_row;
int mb_col;
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
int i;
unsigned char *y_ptr, *u_ptr, *v_ptr;
mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */
/* Note the baseline filter values for each segment */
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
/* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
else
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
baseline_filter_level[i] = default_filt_lvl;
}
/* Point at base of Mb MODE_INFO list */
const MODE_INFO *mode_info_context = cm->mi;
/* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl, cm->sharpness_level);
/* Set up the buffer pointers */
y_ptr = post->y_buffer;
@ -344,51 +316,79 @@ void vp8_loop_filter_frame
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
mbd->mode_info_context->mbmi.mode != SPLITMV &&
mbd->mode_info_context->mbmi.mb_skip_coeff);
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
filter_level = baseline_filter_level[Segment];
const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
/* Distance of Mb to the various image edges.
* These specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment
*/
filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if (filter_level)
{
if (mb_col > 0)
cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
if (cm->filter_type == NORMAL_LOOPFILTER)
{
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (!skip_lf)
cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
/* don't apply across umv border */
if (mb_row > 0)
cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
if (!skip_lf)
cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
}
else
{
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
}
}
y_ptr += 16;
u_ptr += 8;
v_ptr += 8;
mbd->mode_info_context++; /* step to next MB */
mode_info_context++; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
u_ptr += post->uv_stride * 8 - post->uv_width;
v_ptr += post->uv_stride * 8 - post->uv_width;
mbd->mode_info_context++; /* Skip border mb */
mode_info_context++; /* Skip border mb */
}
}
void vp8_loop_filter_frame_yonly
(
VP8_COMMON *cm,
@ -399,49 +399,28 @@ void vp8_loop_filter_frame_yonly
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
int i;
unsigned char *y_ptr;
int mb_row;
int mb_col;
loop_filter_info *lfi = cm->lf_info;
int baseline_filter_level[MAX_MB_SEGMENTS];
loop_filter_info_n *lfi_n = &cm->lf_info;
loop_filter_info lfi;
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
FRAME_TYPE frame_type = cm->frame_type;
(void) sharpness_lvl;
/* Point at base of Mb MODE_INFO list */
const MODE_INFO *mode_info_context = cm->mi;
/*MODE_INFO * this_mb_mode_info = cm->mi;*/ /* Point at base of Mb MODE_INFO list */
mbd->mode_info_context = cm->mi; /* Point at base of Mb MODE_INFO list */
sharpness_lvl = cm->sharpness_level;
/* Note the baseline filter values for each segment */
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
/* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
else
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
baseline_filter_level[i] = default_filt_lvl;
}
#if 0
if(default_filt_lvl == 0) /* no filter applied */
return;
#endif
/* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
vp8_loop_filter_frame_init( cm, mbd, default_filt_lvl, sharpness_lvl);
/* Set up the buffer pointers */
y_ptr = post->y_buffer;
@ -451,44 +430,75 @@ void vp8_loop_filter_frame_yonly
{
for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
{
int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
mbd->mode_info_context->mbmi.mode != SPLITMV &&
mbd->mode_info_context->mbmi.mb_skip_coeff);
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
filter_level = baseline_filter_level[Segment];
const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
const int seg = mode_info_context->mbmi.segment_id;
const int ref_frame = mode_info_context->mbmi.ref_frame;
/* Apply any context driven MB level adjustment */
filter_level = vp8_adjust_mb_lf_value(mbd, filter_level);
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if (filter_level)
{
if (mb_col > 0)
cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
if (cm->filter_type == NORMAL_LOOPFILTER)
{
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (!skip_lf)
cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
/* don't apply across umv border */
if (mb_row > 0)
cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (!skip_lf)
cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
}
else
{
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
}
}
y_ptr += 16;
mbd->mode_info_context ++; /* step to next MB */
mode_info_context ++; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
mbd->mode_info_context ++; /* Skip border mb */
mode_info_context ++; /* Skip border mb */
}
}
void vp8_loop_filter_partial_frame
(
VP8_COMMON *cm,
@ -500,25 +510,32 @@ void vp8_loop_filter_partial_frame
{
YV12_BUFFER_CONFIG *post = cm->frame_to_show;
int i;
unsigned char *y_ptr;
int mb_row;
int mb_col;
/*int mb_rows = post->y_height >> 4;*/
int mb_cols = post->y_width >> 4;
int linestocopy;
int linestocopy, i;
loop_filter_info_n *lfi_n = &cm->lf_info;
loop_filter_info lfi;
loop_filter_info *lfi = cm->lf_info;
int baseline_filter_level[MAX_MB_SEGMENTS];
int filter_level;
int alt_flt_enabled = mbd->segmentation_enabled;
FRAME_TYPE frame_type = cm->frame_type;
(void) sharpness_lvl;
const MODE_INFO *mode_info_context;
/*MODE_INFO * this_mb_mode_info = cm->mi + (post->y_height>>5) * (mb_cols + 1);*/ /* Point at base of Mb MODE_INFO list */
mbd->mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); /* Point at base of Mb MODE_INFO list */
int lvl_seg[MAX_MB_SEGMENTS];
sharpness_lvl = cm->sharpness_level;
#if 0
if(default_filt_lvl == 0) /* no filter applied */
return;
#endif
mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1);
linestocopy = (post->y_height >> (4 + Fraction));
@ -531,29 +548,24 @@ void vp8_loop_filter_partial_frame
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
/* Abs value */
{ /* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
{
lvl_seg[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
}
/* Delta Value */
else
{
baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
lvl_seg[i] = default_filt_lvl
+ mbd->segment_feature_data[MB_LVL_ALT_LF][i];
lvl_seg[i] = (lvl_seg[i] > 0) ?
((lvl_seg[i] > 63) ? 63: lvl_seg[i]) : 0;
}
}
}
else
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
baseline_filter_level[i] = default_filt_lvl;
}
lvl_seg[0] = default_filt_lvl;
/* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
/* Set up the buffer pointers */
y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride;
@ -563,32 +575,64 @@ void vp8_loop_filter_partial_frame
{
for (mb_col = 0; mb_col < mb_cols; mb_col++)
{
int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED &&
mbd->mode_info_context->mbmi.mode != SPLITMV &&
mbd->mode_info_context->mbmi.mb_skip_coeff);
int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
mode_info_context->mbmi.mode != SPLITMV &&
mode_info_context->mbmi.mb_skip_coeff);
filter_level = baseline_filter_level[Segment];
if (alt_flt_enabled)
filter_level = lvl_seg[mode_info_context->mbmi.segment_id];
else
filter_level = lvl_seg[0];
if (filter_level)
{
if (mb_col > 0)
cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
if (cm->filter_type == NORMAL_LOOPFILTER)
{
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (!skip_lf)
cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_v)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_v)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (!skip_lf)
cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]);
LF_INVOKE(&cm->rtcd.loopfilter, normal_mb_h)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, normal_b_h)
(y_ptr, 0, 0, post->y_stride, 0, &lfi);
}
else
{
if (mb_col > 0)
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_v)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_v)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
LF_INVOKE(&cm->rtcd.loopfilter, simple_mb_h)
(y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&cm->rtcd.loopfilter, simple_b_h)
(y_ptr, post->y_stride, lfi_n->blim[filter_level]);
}
}
y_ptr += 16;
mbd->mode_info_context += 1; /* step to next MB */
mode_info_context += 1; /* step to next MB */
}
y_ptr += post->y_stride * 16 - post->y_width;
mbd->mode_info_context += 1; /* Skip border mb */
mode_info_context += 1; /* Skip border mb */
}
}

View File

@ -13,6 +13,7 @@
#define loopfilter_h
#include "vpx_ports/mem.h"
#include "vpx_config.h"
#define MAX_LOOP_FILTER 63
@ -22,27 +23,46 @@ typedef enum
SIMPLE_LOOPFILTER = 1
} LOOPFILTERTYPE;
/* FRK
* Need to align this structure so when it is declared and
#if ARCH_ARM
#define SIMD_WIDTH 1
#else
#define SIMD_WIDTH 16
#endif
/* Need to align this structure so when it is declared and
* passed it can be loaded into vector registers.
*/
typedef struct
{
DECLARE_ALIGNED(16, signed char, lim[16]);
DECLARE_ALIGNED(16, signed char, flim[16]);
DECLARE_ALIGNED(16, signed char, thr[16]);
DECLARE_ALIGNED(16, signed char, mbflim[16]);
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]);
DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, hev_thr[4][SIMD_WIDTH]);
unsigned char lvl[4][4][4];
unsigned char hev_thr_lut[2][MAX_LOOP_FILTER + 1];
unsigned char mode_lf_lut[10];
} loop_filter_info_n;
typedef struct
{
const unsigned char * mblim;
const unsigned char * blim;
const unsigned char * lim;
const unsigned char * hev_thr;
} loop_filter_info;
#define prototype_loopfilter(sym) \
void sym(unsigned char *src, int pitch, const signed char *flimit,\
const signed char *limit, const signed char *thresh, int count)
void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
const unsigned char *limit, const unsigned char *thresh, int count)
#define prototype_loopfilter_block(sym) \
void sym(unsigned char *y, unsigned char *u, unsigned char *v,\
void sym(unsigned char *y, unsigned char *u, unsigned char *v, \
int ystride, int uv_stride, loop_filter_info *lfi)
#define prototype_simple_loopfilter(sym) \
void sym(unsigned char *y, int ystride, const unsigned char *blimit)
#if ARCH_X86 || ARCH_X86_64
#include "x86/loopfilter_x86.h"
#endif
@ -71,38 +91,39 @@ extern prototype_loopfilter_block(vp8_lf_normal_mb_h);
#endif
extern prototype_loopfilter_block(vp8_lf_normal_b_h);
#ifndef vp8_lf_simple_mb_v
#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_c
#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_c
#endif
extern prototype_loopfilter_block(vp8_lf_simple_mb_v);
extern prototype_simple_loopfilter(vp8_lf_simple_mb_v);
#ifndef vp8_lf_simple_b_v
#define vp8_lf_simple_b_v vp8_loop_filter_bvs_c
#endif
extern prototype_loopfilter_block(vp8_lf_simple_b_v);
extern prototype_simple_loopfilter(vp8_lf_simple_b_v);
#ifndef vp8_lf_simple_mb_h
#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_c
#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_c
#endif
extern prototype_loopfilter_block(vp8_lf_simple_mb_h);
extern prototype_simple_loopfilter(vp8_lf_simple_mb_h);
#ifndef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_c
#endif
extern prototype_loopfilter_block(vp8_lf_simple_b_h);
extern prototype_simple_loopfilter(vp8_lf_simple_b_h);
typedef prototype_loopfilter_block((*vp8_lf_block_fn_t));
typedef prototype_simple_loopfilter((*vp8_slf_block_fn_t));
typedef struct
{
vp8_lf_block_fn_t normal_mb_v;
vp8_lf_block_fn_t normal_b_v;
vp8_lf_block_fn_t normal_mb_h;
vp8_lf_block_fn_t normal_b_h;
vp8_lf_block_fn_t simple_mb_v;
vp8_lf_block_fn_t simple_b_v;
vp8_lf_block_fn_t simple_mb_h;
vp8_lf_block_fn_t simple_b_h;
vp8_slf_block_fn_t simple_mb_v;
vp8_slf_block_fn_t simple_b_v;
vp8_slf_block_fn_t simple_mb_h;
vp8_slf_block_fn_t simple_b_h;
} vp8_loopfilter_rtcd_vtable_t;
#if CONFIG_RUNTIME_CPU_DETECT
@ -115,9 +136,9 @@ typedef void loop_filter_uvfunction
(
unsigned char *u, /* source pointer */
int p, /* pitch */
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
unsigned char *v
);

View File

@ -24,8 +24,9 @@ static __inline signed char vp8_signed_char_clamp(int t)
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_filter_mask(signed char limit, signed char flimit,
uc p3, uc p2, uc p1, uc p0, uc q0, uc q1, uc q2, uc q3)
static __inline signed char vp8_filter_mask(uc limit, uc blimit,
uc p3, uc p2, uc p1, uc p0,
uc q0, uc q1, uc q2, uc q3)
{
signed char mask = 0;
mask |= (abs(p3 - p2) > limit) * -1;
@ -34,13 +35,13 @@ static __inline signed char vp8_filter_mask(signed char limit, signed char flimi
mask |= (abs(q1 - q0) > limit) * -1;
mask |= (abs(q2 - q1) > limit) * -1;
mask |= (abs(q3 - q2) > limit) * -1;
mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit) * -1;
mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
mask = ~mask;
return mask;
}
/* is there high variance internal edge ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0, uc q1)
static __inline signed char vp8_hevmask(uc thresh, uc p1, uc p0, uc q0, uc q1)
{
signed char hev = 0;
hev |= (abs(p1 - p0) > thresh) * -1;
@ -48,7 +49,8 @@ static __inline signed char vp8_hevmask(signed char thresh, uc p1, uc p0, uc q0,
return hev;
}
static __inline void vp8_filter(signed char mask, signed char hev, uc *op1, uc *op0, uc *oq0, uc *oq1)
static __inline void vp8_filter(signed char mask, uc hev, uc *op1,
uc *op0, uc *oq0, uc *oq1)
{
signed char ps0, qs0;
@ -98,9 +100,9 @@ void vp8_loop_filter_horizontal_edge_c
(
unsigned char *s,
int p, /* pitch */
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count
)
{
@ -113,11 +115,11 @@ void vp8_loop_filter_horizontal_edge_c
*/
do
{
mask = vp8_filter_mask(limit[i], flimit[i],
mask = vp8_filter_mask(limit[0], blimit[0],
s[-4*p], s[-3*p], s[-2*p], s[-1*p],
s[0*p], s[1*p], s[2*p], s[3*p]);
hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
@ -130,9 +132,9 @@ void vp8_loop_filter_vertical_edge_c
(
unsigned char *s,
int p,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count
)
{
@ -145,10 +147,10 @@ void vp8_loop_filter_vertical_edge_c
*/
do
{
mask = vp8_filter_mask(limit[i], flimit[i],
mask = vp8_filter_mask(limit[0], blimit[0],
s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
vp8_filter(mask, hev, s - 2, s - 1, s, s + 1);
@ -157,7 +159,7 @@ void vp8_loop_filter_vertical_edge_c
while (++i < count * 8);
}
static __inline void vp8_mbfilter(signed char mask, signed char hev,
static __inline void vp8_mbfilter(signed char mask, uc hev,
uc *op2, uc *op1, uc *op0, uc *oq0, uc *oq1, uc *oq2)
{
signed char s, u;
@ -216,9 +218,9 @@ void vp8_mbloop_filter_horizontal_edge_c
(
unsigned char *s,
int p,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count
)
{
@ -232,11 +234,11 @@ void vp8_mbloop_filter_horizontal_edge_c
do
{
mask = vp8_filter_mask(limit[i], flimit[i],
mask = vp8_filter_mask(limit[0], blimit[0],
s[-4*p], s[-3*p], s[-2*p], s[-1*p],
s[0*p], s[1*p], s[2*p], s[3*p]);
hev = vp8_hevmask(thresh[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
hev = vp8_hevmask(thresh[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_mbfilter(mask, hev, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p, s + 2 * p);
@ -251,9 +253,9 @@ void vp8_mbloop_filter_vertical_edge_c
(
unsigned char *s,
int p,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count
)
{
@ -264,10 +266,10 @@ void vp8_mbloop_filter_vertical_edge_c
do
{
mask = vp8_filter_mask(limit[i], flimit[i],
mask = vp8_filter_mask(limit[0], blimit[0],
s[-4], s[-3], s[-2], s[-1], s[0], s[1], s[2], s[3]);
hev = vp8_hevmask(thresh[i], s[-2], s[-1], s[0], s[1]);
hev = vp8_hevmask(thresh[0], s[-2], s[-1], s[0], s[1]);
vp8_mbfilter(mask, hev, s - 3, s - 2, s - 1, s, s + 1, s + 2);
@ -278,13 +280,13 @@ void vp8_mbloop_filter_vertical_edge_c
}
/* should we apply any filter at all ( 11111111 yes, 00000000 no) */
static __inline signed char vp8_simple_filter_mask(signed char limit, signed char flimit, uc p1, uc p0, uc q0, uc q1)
static __inline signed char vp8_simple_filter_mask(uc blimit, uc p1, uc p0, uc q0, uc q1)
{
/* Why does this cause problems for win32?
* error C2143: syntax error : missing ';' before 'type'
* (void) limit;
*/
signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit) * -1;
signed char mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit) * -1;
return mask;
}
@ -317,47 +319,37 @@ void vp8_loop_filter_simple_horizontal_edge_c
(
unsigned char *s,
int p,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count
const unsigned char *blimit
)
{
signed char mask = 0;
int i = 0;
(void) thresh;
do
{
/*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1*p],s[0*p]);*/
mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2*p], s[-1*p], s[0*p], s[1*p]);
mask = vp8_simple_filter_mask(blimit[0], s[-2*p], s[-1*p], s[0*p], s[1*p]);
vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
++s;
}
while (++i < count * 8);
while (++i < 16);
}
void vp8_loop_filter_simple_vertical_edge_c
(
unsigned char *s,
int p,
const signed char *flimit,
const signed char *limit,
const signed char *thresh,
int count
const unsigned char *blimit
)
{
signed char mask = 0;
int i = 0;
(void) thresh;
do
{
/*mask = vp8_simple_filter_mask( limit[i], flimit[i],s[-1],s[0]);*/
mask = vp8_simple_filter_mask(limit[i], flimit[i], s[-2], s[-1], s[0], s[1]);
mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
vp8_simple_filter(mask, s - 2, s - 1, s, s + 1);
s += p;
}
while (++i < count * 8);
while (++i < 16);
}

View File

@ -83,6 +83,7 @@ typedef struct VP8_COMMON_RTCD
} VP8_COMMON_RTCD;
typedef struct VP8Common
{
struct vpx_internal_error_info error;
@ -107,7 +108,8 @@ typedef struct VP8Common
YV12_BUFFER_CONFIG post_proc_buffer;
YV12_BUFFER_CONFIG temp_scale_frame;
FRAME_TYPE last_frame_type; /* Save last frame's frame type for loopfilter init checking and motion search. */
FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */
FRAME_TYPE frame_type;
int show_frame;
@ -149,11 +151,9 @@ typedef struct VP8Common
INTERPOLATIONFILTERTYPE mcomp_filter_type;
LOOPFILTERTYPE last_filter_type;
LOOPFILTERTYPE filter_type;
loop_filter_info lf_info[MAX_LOOP_FILTER+1];
prototype_loopfilter_block((*lf_mbv));
prototype_loopfilter_block((*lf_mbh));
prototype_loopfilter_block((*lf_bv));
prototype_loopfilter_block((*lf_bh));
loop_filter_info_n lf_info;
int filter_level;
int last_sharpness_level;
int sharpness_level;
@ -206,10 +206,9 @@ typedef struct VP8Common
struct postproc_state postproc_state;
} VP8_COMMON;
int vp8_adjust_mb_lf_value(MACROBLOCKD *mbd, int filter_level);
void vp8_init_loop_filter(VP8_COMMON *cm);
void vp8_frame_init_loop_filter(loop_filter_info *lfi, int frame_type);
extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val);
void vp8_loop_filter_init(VP8_COMMON *cm);
void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd,
int default_filt_lvl, int sharpness_lvl);
void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val);
#endif

View File

@ -16,7 +16,7 @@
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
@ -122,12 +122,10 @@ next8_h:
paddusb mm5, mm5 ; abs(p0-q0)*2
paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
mov rdx, arg(2) ;flimit ; get flimit
movq mm2, [rdx] ; flimit mm2
paddb mm2, mm2 ; flimit*2 (less than 255)
paddb mm7, mm2 ; flimit * 2 + limit (less than 255)
mov rdx, arg(2) ;blimit ; get blimit
movq mm7, [rdx] ; blimit
psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
por mm1, mm5
pxor mm5, mm5
pcmpeqb mm1, mm5 ; mask mm1
@ -230,7 +228,7 @@ next8_h:
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
@ -406,9 +404,9 @@ next8_v:
pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw mm5, 1 ; abs(p1-q1)/2
mov rdx, arg(2) ;flimit ;
mov rdx, arg(2) ;blimit ;
movq mm2, [rdx] ;flimit mm2
movq mm4, [rdx] ;blimit
movq mm1, mm3 ; mm1=mm3=p0
movq mm7, mm6 ; mm7=mm6=q0
@ -419,10 +417,7 @@ next8_v:
paddusb mm1, mm1 ; abs(q0-p0)*2
paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
paddb mm2, mm2 ; flimit*2 (less than 255)
paddb mm4, mm2 ; flimit * 2 + limit (less than 255)
psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
por mm1, mm0; ; mask
pxor mm0, mm0
@ -603,7 +598,7 @@ next8_v:
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
@ -719,17 +714,15 @@ next8_mbh:
paddusb mm5, mm5 ; abs(p0-q0)*2
paddusb mm5, mm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
mov rdx, arg(2) ;flimit ; get flimit
movq mm2, [rdx] ; flimit mm2
paddb mm2, mm2 ; flimit*2 (less than 255)
paddb mm7, mm2 ; flimit * 2 + limit (less than 255)
mov rdx, arg(2) ;blimit ; get blimit
movq mm7, [rdx] ; blimit
psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
psubusb mm5, mm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
por mm1, mm5
pxor mm5, mm5
pcmpeqb mm1, mm5 ; mask mm1
; mm1 = mask, mm0=q0, mm7 = flimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
; mm1 = mask, mm0=q0, mm7 = blimit, t0 = abs(q0-q1) t1 = abs(p1-p0)
; mm6 = p0,
; calculate high edge variance
@ -922,7 +915,7 @@ next8_mbh:
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
@ -1108,9 +1101,9 @@ next8_mbv:
pand mm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
psrlw mm5, 1 ; abs(p1-q1)/2
mov rdx, arg(2) ;flimit ;
mov rdx, arg(2) ;blimit ;
movq mm2, [rdx] ;flimit mm2
movq mm4, [rdx] ;blimit
movq mm1, mm3 ; mm1=mm3=p0
movq mm7, mm6 ; mm7=mm6=q0
@ -1121,10 +1114,7 @@ next8_mbv:
paddusb mm1, mm1 ; abs(q0-p0)*2
paddusb mm1, mm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
paddb mm2, mm2 ; flimit*2 (less than 255)
paddb mm4, mm2 ; flimit * 2 + limit (less than 255)
psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
psubusb mm1, mm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
por mm1, mm0; ; mask
pxor mm0, mm0
@ -1392,16 +1382,13 @@ next8_mbv:
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *limit,
; const char *thresh,
; int count
; const char *blimit
;)
global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
sym(vp8_loop_filter_simple_horizontal_edge_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SHADOW_ARGS_TO_STACK 3
GET_GOT rbx
push rsi
push rdi
@ -1410,14 +1397,10 @@ sym(vp8_loop_filter_simple_horizontal_edge_mmx):
mov rsi, arg(0) ;src_ptr
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
movsxd rcx, dword ptr arg(5) ;count
mov rcx, 2 ; count
nexts8_h:
mov rdx, arg(3) ;limit
movq mm7, [rdx]
mov rdx, arg(2) ;flimit ; get flimit
mov rdx, arg(2) ;blimit ; get blimit
movq mm3, [rdx] ;
paddb mm3, mm3 ; flimit*2 (less than 255)
paddb mm3, mm7 ; flimit * 2 + limit (less than 255)
mov rdi, rsi ; rdi points to row +1 for indirect addressing
add rdi, rax
@ -1445,7 +1428,7 @@ nexts8_h:
paddusb mm5, mm5 ; abs(p0-q0)*2
paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor mm3, mm3
pcmpeqb mm5, mm3
@ -1515,16 +1498,13 @@ nexts8_h:
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *limit,
; const char *thresh,
; int count
; const char *blimit
;)
global sym(vp8_loop_filter_simple_vertical_edge_mmx)
sym(vp8_loop_filter_simple_vertical_edge_mmx):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SHADOW_ARGS_TO_STACK 3
GET_GOT rbx
push rsi
push rdi
@ -1539,7 +1519,7 @@ sym(vp8_loop_filter_simple_vertical_edge_mmx):
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
lea rsi, [rsi + rax*4- 2]; ;
movsxd rcx, dword ptr arg(5) ;count
mov rcx, 2 ; count
nexts8_v:
lea rdi, [rsi + rax];
@ -1602,14 +1582,10 @@ nexts8_v:
paddusb mm5, mm5 ; abs(p0-q0)*2
paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
mov rdx, arg(2) ;flimit ; get flimit
mov rdx, arg(2) ;blimit ; get blimit
movq mm7, [rdx]
mov rdx, arg(3) ; get limit
movq mm6, [rdx]
paddb mm7, mm7 ; flimit*2 (less than 255)
paddb mm7, mm6 ; flimit * 2 + limit (less than 255)
psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor mm7, mm7
pcmpeqb mm5, mm7 ; mm5 = mask

View File

@ -110,7 +110,7 @@
psubusb xmm6, xmm5 ; p1-=p0
por xmm6, xmm4 ; abs(p1 - p0)
mov rdx, arg(2) ; get flimit
mov rdx, arg(2) ; get blimit
movdqa t1, xmm6 ; save to t1
@ -123,7 +123,7 @@
psubusb xmm1, xmm7
por xmm2, xmm3 ; abs(p1-q1)
movdqa xmm4, XMMWORD PTR [rdx] ; flimit
movdqa xmm7, XMMWORD PTR [rdx] ; blimit
movdqa xmm3, xmm0 ; q0
pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
@ -134,13 +134,11 @@
psrlw xmm2, 1 ; abs(p1-q1)/2
psubusb xmm5, xmm3 ; p0-=q0
paddb xmm4, xmm4 ; flimit*2 (less than 255)
psubusb xmm3, xmm6 ; q0-=p0
por xmm5, xmm3 ; abs(p0 - q0)
paddusb xmm5, xmm5 ; abs(p0-q0)*2
paddb xmm7, xmm4 ; flimit * 2 + limit (less than 255)
movdqa xmm4, t0 ; hev get abs (q1 - q0)
@ -150,7 +148,7 @@
movdqa xmm2, XMMWORD PTR [rdx] ; hev
psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
psubusb xmm4, xmm2 ; hev
psubusb xmm3, xmm2 ; hev
@ -278,7 +276,7 @@
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
@ -328,7 +326,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2):
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
@ -574,7 +572,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
@ -624,7 +622,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2):
;(
; unsigned char *u,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; unsigned char *v
@ -904,7 +902,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
movdqa xmm4, XMMWORD PTR [rdx]; limit
pmaxub xmm0, xmm7
mov rdx, arg(2) ; flimit
mov rdx, arg(2) ; blimit
psubusb xmm0, xmm4
movdqa xmm5, xmm2 ; q1
@ -921,12 +919,11 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
psrlw xmm5, 1 ; abs(p1-q1)/2
psubusb xmm6, xmm3 ; q0-p0
movdqa xmm2, XMMWORD PTR [rdx]; flimit
movdqa xmm4, XMMWORD PTR [rdx]; blimit
mov rdx, arg(4) ; get thresh
por xmm1, xmm6 ; abs(q0-p0)
paddb xmm2, xmm2 ; flimit*2 (less than 255)
movdqa xmm6, t0 ; get abs (q1 - q0)
@ -939,10 +936,9 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh
paddb xmm4, xmm2 ; flimit * 2 + limit (less than 255)
psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
por xmm1, xmm0 ; mask
@ -1014,7 +1010,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
@ -1081,7 +1077,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
;(
; unsigned char *u,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; unsigned char *v
@ -1239,7 +1235,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; int count
@ -1308,7 +1304,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2):
;(
; unsigned char *u,
; int src_pixel_step,
; const char *flimit,
; const char *blimit,
; const char *limit,
; const char *thresh,
; unsigned char *v
@ -1376,16 +1372,13 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *limit,
; const char *thresh,
; int count
; const char *blimit,
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SHADOW_ARGS_TO_STACK 3
SAVE_XMM 7
GET_GOT rbx
push rsi
@ -1394,13 +1387,8 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
mov rsi, arg(0) ;src_ptr
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
mov rdx, arg(2) ;flimit ; get flimit
mov rdx, arg(2) ;blimit
movdqa xmm3, XMMWORD PTR [rdx]
mov rdx, arg(3) ;limit
movdqa xmm7, XMMWORD PTR [rdx]
paddb xmm3, xmm3 ; flimit*2 (less than 255)
paddb xmm3, xmm7 ; flimit * 2 + limit (less than 255)
mov rdi, rsi ; rdi points to row +1 for indirect addressing
add rdi, rax
@ -1428,7 +1416,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
paddusb xmm5, xmm5 ; abs(p0-q0)*2
paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor xmm3, xmm3
pcmpeqb xmm5, xmm3
@ -1493,16 +1481,13 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
;(
; unsigned char *src_ptr,
; int src_pixel_step,
; const char *flimit,
; const char *limit,
; const char *thresh,
; int count
; const char *blimit,
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2)
sym(vp8_loop_filter_simple_vertical_edge_sse2):
push rbp ; save old base pointer value.
mov rbp, rsp ; set new base pointer value.
SHADOW_ARGS_TO_STACK 6
SHADOW_ARGS_TO_STACK 3
SAVE_XMM 7
GET_GOT rbx ; save callee-saved reg
push rsi
@ -1607,14 +1592,10 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
paddusb xmm5, xmm5 ; abs(p0-q0)*2
paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
mov rdx, arg(2) ;flimit
mov rdx, arg(2) ;blimit
movdqa xmm7, XMMWORD PTR [rdx]
mov rdx, arg(3) ; get limit
movdqa xmm6, XMMWORD PTR [rdx]
paddb xmm7, xmm7 ; flimit*2 (less than 255)
paddb xmm7, xmm6 ; flimit * 2 + limit (less than 255)
psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
pxor xmm7, xmm7
pcmpeqb xmm5, xmm7 ; mm5 = mask

View File

@ -9,30 +9,18 @@
*/
#include "vpx_ports/config.h"
#include "vpx_config.h"
#include "vp8/common/loopfilter.h"
prototype_loopfilter(vp8_loop_filter_horizontal_edge_c);
prototype_loopfilter(vp8_loop_filter_vertical_edge_c);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_c);
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_c);
prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c);
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_mmx);
prototype_loopfilter(vp8_loop_filter_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_mmx);
prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2);
prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2);
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2);
prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2);
prototype_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
prototype_loopfilter(vp8_fast_loop_filter_vertical_edges_sse2);
extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
@ -44,23 +32,13 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2;
void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
vp8_mbloop_filter_horizontal_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_horizontal_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
@ -68,23 +46,13 @@ void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign
void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
vp8_mbloop_filter_vertical_edge_mmx(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, 1);
}
void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_vertical_edge_mmx(v_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 1);
}
@ -92,27 +60,23 @@ void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsign
void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_horizontal_edge_mmx(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_horizontal_edge_mmx(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, blimit);
}
@ -120,27 +84,23 @@ void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_vertical_edge_mmx(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
if (v_ptr)
vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, 1);
vp8_loop_filter_vertical_edge_mmx(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
}
#endif
@ -150,20 +110,10 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
}
void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
}
@ -171,20 +121,10 @@ void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig
void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr, v_ptr);
}
void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2);
vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
}
@ -192,24 +132,20 @@ void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsig
void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4 * uv_stride);
vp8_loop_filter_horizontal_edge_uv_sse2(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4 * uv_stride);
}
void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, blimit);
vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, blimit);
}
@ -217,36 +153,20 @@ void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
if (u_ptr)
vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr, v_ptr + 4);
vp8_loop_filter_vertical_edge_uv_sse2(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, lfi->hev_thr, v_ptr + 4);
}
void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit)
{
(void) u_ptr;
(void) v_ptr;
(void) uv_stride;
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
}
#endif
#if 0
void vp8_fast_loop_filter_vertical_edges_sse(unsigned char *y_ptr,
int y_stride,
loop_filter_info *lfi)
{
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
}
#endif

View File

@ -24,10 +24,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_mmx);
extern prototype_loopfilter_block(vp8_loop_filter_bv_mmx);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_mmx);
extern prototype_loopfilter_block(vp8_loop_filter_bh_mmx);
extern prototype_loopfilter_block(vp8_loop_filter_mbvs_mmx);
extern prototype_loopfilter_block(vp8_loop_filter_bvs_mmx);
extern prototype_loopfilter_block(vp8_loop_filter_mbhs_mmx);
extern prototype_loopfilter_block(vp8_loop_filter_bhs_mmx);
extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
extern prototype_simple_loopfilter(vp8_loop_filter_bvs_mmx);
extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_mmx);
extern prototype_simple_loopfilter(vp8_loop_filter_bhs_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
@ -44,13 +44,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_mmx);
#define vp8_lf_normal_b_h vp8_loop_filter_bh_mmx
#undef vp8_lf_simple_mb_v
#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_mmx
#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_mmx
#undef vp8_lf_simple_b_v
#define vp8_lf_simple_b_v vp8_loop_filter_bvs_mmx
#undef vp8_lf_simple_mb_h
#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_mmx
#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_mmx
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_mmx
@ -63,10 +63,10 @@ extern prototype_loopfilter_block(vp8_loop_filter_mbv_sse2);
extern prototype_loopfilter_block(vp8_loop_filter_bv_sse2);
extern prototype_loopfilter_block(vp8_loop_filter_mbh_sse2);
extern prototype_loopfilter_block(vp8_loop_filter_bh_sse2);
extern prototype_loopfilter_block(vp8_loop_filter_mbvs_sse2);
extern prototype_loopfilter_block(vp8_loop_filter_bvs_sse2);
extern prototype_loopfilter_block(vp8_loop_filter_mbhs_sse2);
extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2);
extern prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_sse2);
extern prototype_simple_loopfilter(vp8_loop_filter_bvs_sse2);
extern prototype_simple_loopfilter(vp8_loop_filter_simple_horizontal_edge_sse2);
extern prototype_simple_loopfilter(vp8_loop_filter_bhs_sse2);
#if !CONFIG_RUNTIME_CPU_DETECT
@ -83,13 +83,13 @@ extern prototype_loopfilter_block(vp8_loop_filter_bhs_sse2);
#define vp8_lf_normal_b_h vp8_loop_filter_bh_sse2
#undef vp8_lf_simple_mb_v
#define vp8_lf_simple_mb_v vp8_loop_filter_mbvs_sse2
#define vp8_lf_simple_mb_v vp8_loop_filter_simple_vertical_edge_sse2
#undef vp8_lf_simple_b_v
#define vp8_lf_simple_b_v vp8_loop_filter_bvs_sse2
#undef vp8_lf_simple_mb_h
#define vp8_lf_simple_mb_h vp8_loop_filter_mbhs_sse2
#define vp8_lf_simple_mb_h vp8_loop_filter_simple_horizontal_edge_sse2
#undef vp8_lf_simple_b_h
#define vp8_lf_simple_b_h vp8_loop_filter_bhs_sse2

View File

@ -9,7 +9,7 @@
*/
#include "vpx_ports/config.h"
#include "vpx_config.h"
#include "vpx_ports/x86.h"
#include "vp8/common/g_common.h"
#include "vp8/common/subpixel.h"
@ -63,9 +63,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_mmx;
rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_mmx;
rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_mmx;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_mmx;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_mmx;
rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_mmx;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_mmx;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_mmx;
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_mmx;
#if CONFIG_POSTPROC
@ -101,9 +101,9 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
rtcd->loopfilter.normal_b_v = vp8_loop_filter_bv_sse2;
rtcd->loopfilter.normal_mb_h = vp8_loop_filter_mbh_sse2;
rtcd->loopfilter.normal_b_h = vp8_loop_filter_bh_sse2;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_mbvs_sse2;
rtcd->loopfilter.simple_mb_v = vp8_loop_filter_simple_vertical_edge_sse2;
rtcd->loopfilter.simple_b_v = vp8_loop_filter_bvs_sse2;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_mbhs_sse2;
rtcd->loopfilter.simple_mb_h = vp8_loop_filter_simple_horizontal_edge_sse2;
rtcd->loopfilter.simple_b_h = vp8_loop_filter_bhs_sse2;
#if CONFIG_POSTPROC

View File

@ -180,11 +180,11 @@ static MB_PREDICTION_MODE read_mv_ref(vp8_reader *bc, const vp8_prob *p)
return (MB_PREDICTION_MODE)i;
}
static MB_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p)
static B_PREDICTION_MODE sub_mv_ref(vp8_reader *bc, const vp8_prob *p)
{
const int i = vp8_treed_read(bc, vp8_sub_mv_ref_tree, p);
return (MB_PREDICTION_MODE)i;
return (B_PREDICTION_MODE)i;
}
#ifdef VPX_MODE_COUNT
@ -376,7 +376,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
abovemv.as_int = above_block_mv(mi, k, mis);
mv_contz = vp8_mv_cont(&leftmv, &abovemv);
switch ((B_PREDICTION_MODE) sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) /*pc->fc.sub_mv_ref_prob))*/
switch (sub_mv_ref(bc, vp8_sub_mv_ref_prob2 [mv_contz])) /*pc->fc.sub_mv_ref_prob))*/
{
case NEW4X4:
read_mv(bc, &blockmv.as_mv, (const MV_CONTEXT *) mvc);

View File

@ -94,7 +94,7 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf)
{
VP8_COMMON *cm = &pbi->common;
vp8_init_loop_filter(cm);
vp8_loop_filter_init(cm);
cm->last_frame_type = KEY_FRAME;
cm->last_filter_type = cm->filter_type;
cm->last_sharpness_level = cm->sharpness_level;

View File

@ -274,9 +274,7 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data)
int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
int filter_level;
loop_filter_info *lfi = pc->lf_info;
int alt_flt_enabled = xd->segmentation_enabled;
int Segment;
loop_filter_info_n *lfi_n = &pc->lf_info;
pbi->mb_row_di[ithread].mb_row = mb_row;
pbi->mb_row_di[ithread].mbd.current_bc = &pbi->mbc[mb_row%num_part];
@ -362,7 +360,16 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data)
if (pbi->common.filter_level)
{
int skip_lf;
int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
xd->mode_info_context->mbmi.mode != SPLITMV &&
xd->mode_info_context->mbmi.mb_skip_coeff);
const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
const int seg = xd->mode_info_context->mbmi.segment_id;
const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
if( mb_row != pc->mb_rows-1 )
{
/* Save decoded MB last row data for next-row decoding */
@ -388,35 +395,57 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data)
}
}
/* update loopfilter info */
Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
xd->mode_info_context->mbmi.mode != SPLITMV &&
xd->mode_info_context->mbmi.mb_skip_coeff);
filter_level = pbi->mt_baseline_filter_level[Segment];
/* Distance of Mb to the various image edges.
* These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment
*/
filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
/* loopfilter on this macroblock. */
if (filter_level)
{
if (mb_col > 0)
pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
if(pc->filter_type == NORMAL_LOOPFILTER)
{
loop_filter_info lfi;
FRAME_TYPE frame_type = pc->frame_type;
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (!skip_lf)
pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
if (mb_col > 0)
LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_v)
(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
/* don't apply across umv border */
if (mb_row > 0)
pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
if (!skip_lf)
LF_INVOKE(&pc->rtcd.loopfilter, normal_b_v)
(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
if (!skip_lf)
pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_h)
(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
if (!skip_lf)
LF_INVOKE(&pc->rtcd.loopfilter, normal_b_h)
(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
}
else
{
if (mb_col > 0)
LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_v)
(xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&pc->rtcd.loopfilter, simple_b_v)
(xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_h)
(xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&pc->rtcd.loopfilter, simple_b_h)
(xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
}
}
}
recon_yoffset += 16;
@ -681,53 +710,6 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi)
}
}
static void lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
{
VP8_COMMON *cm = &pbi->common;
MACROBLOCKD *mbd = &pbi->mb;
/*YV12_BUFFER_CONFIG *post = &cm->new_frame;*/ /*frame_to_show;*/
loop_filter_info *lfi = cm->lf_info;
FRAME_TYPE frame_type = cm->frame_type;
/*int mb_row;
int mb_col;
int baseline_filter_level[MAX_MB_SEGMENTS];*/
int alt_flt_enabled = mbd->segmentation_enabled;
int i;
/*unsigned char *y_ptr, *u_ptr, *v_ptr;*/
/* Note the baseline filter values for each segment */
if (alt_flt_enabled)
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
{
/* Abs value */
if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
pbi->mt_baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
/* Delta Value */
else
{
pbi->mt_baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
pbi->mt_baseline_filter_level[i] = (pbi->mt_baseline_filter_level[i] >= 0) ? ((pbi->mt_baseline_filter_level[i] <= MAX_LOOP_FILTER) ? pbi->mt_baseline_filter_level[i] : MAX_LOOP_FILTER) : 0; /* Clamp to valid range */
}
}
}
else
{
for (i = 0; i < MAX_MB_SEGMENTS; i++)
pbi->mt_baseline_filter_level[i] = default_filt_lvl;
}
/* Initialize the loop filter for this frame. */
if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
vp8_init_loop_filter(cm);
else if (frame_type != cm->last_frame_type)
vp8_frame_init_loop_filter(lfi, frame_type);
}
void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
{
int mb_row;
@ -738,12 +720,10 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
volatile int *last_row_current_mb_col = NULL;
int nsync = pbi->sync_range;
int filter_level;
loop_filter_info *lfi = pc->lf_info;
int alt_flt_enabled = xd->segmentation_enabled;
int Segment;
int filter_level = pc->filter_level;
loop_filter_info_n *lfi_n = &pc->lf_info;
if(pbi->common.filter_level)
if (filter_level)
{
/* Set above_row buffer to 127 for decoding first MB row */
vpx_memset(pbi->mt_yabove_row[0] + VP8BORDERINPIXELS-1, 127, pc->yv12_fb[pc->lst_fb_idx].y_width + 5);
@ -764,7 +744,9 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
vpx_memset(pbi->mt_uleft_col[i], (unsigned char)129, 8);
vpx_memset(pbi->mt_vleft_col[i], (unsigned char)129, 8);
}
lpf_init(pbi, pc->filter_level);
/* Initialize the loop filter for this frame. */
vp8_loop_filter_frame_init(pc, &pbi->mb, filter_level, pc->sharpness_level);
}
setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
@ -774,7 +756,6 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
{
xd->current_bc = &pbi->mbc[mb_row%num_part];
/* vp8_decode_mb_row(pbi, pc, mb_row, xd); */
@ -875,7 +856,16 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
if (pbi->common.filter_level)
{
int skip_lf;
int skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
xd->mode_info_context->mbmi.mode != SPLITMV &&
xd->mode_info_context->mbmi.mb_skip_coeff);
const int mode_index = lfi_n->mode_lf_lut[xd->mode_info_context->mbmi.mode];
const int seg = xd->mode_info_context->mbmi.segment_id;
const int ref_frame = xd->mode_info_context->mbmi.ref_frame;
filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
/* Save decoded MB last row data for next-row decoding */
if(mb_row != pc->mb_rows-1)
{
@ -901,36 +891,58 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
}
}
/* update loopfilter info */
Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0;
skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED &&
xd->mode_info_context->mbmi.mode != SPLITMV &&
xd->mode_info_context->mbmi.mb_skip_coeff);
filter_level = pbi->mt_baseline_filter_level[Segment];
/* Distance of Mb to the various image edges.
* These are specified to 8th pel as they are always compared to values that are in 1/8th pel units
* Apply any context driven MB level adjustment
*/
filter_level = vp8_adjust_mb_lf_value(xd, filter_level);
/* loopfilter on this macroblock. */
if (filter_level)
{
if (mb_col > 0)
pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
if(pc->filter_type == NORMAL_LOOPFILTER)
{
loop_filter_info lfi;
FRAME_TYPE frame_type = pc->frame_type;
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
lfi.blim = lfi_n->blim[filter_level];
lfi.lim = lfi_n->lim[filter_level];
lfi.hev_thr = lfi_n->hev_thr[hev_index];
if (!skip_lf)
pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
if (mb_col > 0)
LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_v)
(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
/* don't apply across umv border */
if (mb_row > 0)
pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
if (!skip_lf)
LF_INVOKE(&pc->rtcd.loopfilter, normal_b_v)
(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
if (!skip_lf)
pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&pc->rtcd.loopfilter, normal_mb_h)
(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
if (!skip_lf)
LF_INVOKE(&pc->rtcd.loopfilter, normal_b_h)
(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi);
}
else
{
if (mb_col > 0)
LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_v)
(xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&pc->rtcd.loopfilter, simple_b_v)
(xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
/* don't apply across umv border */
if (mb_row > 0)
LF_INVOKE(&pc->rtcd.loopfilter, simple_mb_h)
(xd->dst.y_buffer, recon_y_stride, lfi_n->mblim[filter_level]);
if (!skip_lf)
LF_INVOKE(&pc->rtcd.loopfilter, simple_b_h)
(xd->dst.y_buffer, recon_y_stride, lfi_n->blim[filter_level]);
}
}
}
}
recon_yoffset += 16;
recon_uvoffset += 8;

View File

@ -2170,7 +2170,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
//when needed. This will avoid unnecessary calls of vp8cx_init_quantizer() for every frame.
vp8cx_init_quantizer(cpi);
{
vp8_init_loop_filter(cm);
vp8_loop_filter_init(cm);
cm->last_frame_type = KEY_FRAME;
cm->last_filter_type = cm->filter_type;
cm->last_sharpness_level = cm->sharpness_level;

View File

@ -262,10 +262,19 @@ static void vp8_temporal_filter_iterate_c
for (mb_row = 0; mb_row < mb_rows; mb_row++)
{
#if ALT_REF_MC_ENABLED
// Reduced search extent by 3 for 6-tap filter & smaller UMV border
cpi->mb.mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 19));
// Source frames are extended to 16 pixels. This is different than
// L/A/G reference frames that have a border of 32 (VP8BORDERINPIXELS)
// A 6 tap filter is used for motion search. This requires 2 pixels
// before and 3 pixels after. So the largest Y mv on a border would
// then be 16 - 3. The UV blocks are half the size of the Y and
// therefore only extended by 8. The largest mv that a UV block
// can support is 8 - 3. A UV mv is half of a Y mv.
// (16 - 3) >> 1 == 6 which is greater than 8 - 3.
// To keep the mv in play for both Y and UV planes the max that it
// can be on a border is therefore 16 - 5.
cpi->mb.mv_row_min = -((mb_row * 16) + (16 - 5));
cpi->mb.mv_row_max = ((cpi->common.mb_rows - 1 - mb_row) * 16)
+ (VP8BORDERINPIXELS - 19);
+ (16 - 5);
#endif
for (mb_col = 0; mb_col < mb_cols; mb_col++)
@ -277,10 +286,9 @@ static void vp8_temporal_filter_iterate_c
vpx_memset(count, 0, 384*sizeof(unsigned short));
#if ALT_REF_MC_ENABLED
// Reduced search extent by 3 for 6-tap filter & smaller UMV border
cpi->mb.mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 19));
cpi->mb.mv_col_min = -((mb_col * 16) + (16 - 5));
cpi->mb.mv_col_max = ((cpi->common.mb_cols - 1 - mb_col) * 16)
+ (VP8BORDERINPIXELS - 19);
+ (16 - 5);
#endif
for (frame = 0; frame < frame_count; frame++)

View File

@ -1,30 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef __VPX_MEM_NDS_H__
#define __VPX_MEM_NDS_H__
#if defined(__cplusplus)
extern "C" {
#endif
#include <nitro.h>
#include <nitro/os.h>
void *vpx_mem_nds_alloc(osarena_id id, osheap_handle handle, size_t size, size_t align);
void vpx_mem_nds_free(osarena_id id, osheap_handle handle, void *mem);
int vpx_nds_alloc_heap(osarena_id id, u32 size);
#if defined(__cplusplus)
}
#endif
#endif /*__VPX_MEM_NDS_H__*/

View File

@ -36,9 +36,6 @@
# include <winbase.h>
#elif defined(VXWORKS)
# include <sem_lib.h>
#elif defined(NDS_NITRO)
# include <nitro.h>
# include <nitro/os.h>
#endif
#include <stdio.h>
@ -112,8 +109,6 @@ struct memory_tracker
HANDLE mutex;
#elif defined(VXWORKS)
SEM_ID mutex;
#elif defined(NDS_NITRO)
OSMutex mutex;
#elif defined(NO_MUTEX)
#else
#error "No mutex type defined for this platform!"
@ -193,9 +188,6 @@ int vpx_memory_tracker_init(int padding_size, int pad_value)
memtrack.mutex = sem_bcreate(SEM_Q_FIFO, /*SEM_Q_FIFO non-priority based mutex*/
SEM_FULL); /*SEM_FULL initial state is unlocked*/
ret = !memtrack.mutex;
#elif defined(NDS_NITRO)
os_init_mutex(&memtrack.mutex);
ret = 0;
#elif defined(NO_MUTEX)
ret = 0;
#endif
@ -251,9 +243,7 @@ void vpx_memory_tracker_destroy()
if (!g_logging.type && g_logging.file && g_logging.file != stderr)
{
#if !defined(NDS_NITRO)
fclose(g_logging.file);
#endif
g_logging.file = NULL;
}
@ -368,15 +358,12 @@ int vpx_memory_tracker_set_log_type(int type, char *option)
g_logging.file = stderr;
ret = 0;
}
#if !defined(NDS_NITRO)
else
{
if ((g_logging.file = fopen((char *)option, "w")))
ret = 0;
}
#endif
break;
#if defined(WIN32) && !defined(_WIN32_WCE)
case 1:
@ -506,12 +493,6 @@ static void memory_tracker_dump()
p->addr, i, p->size,
p->file, p->line);
#ifdef NDS_NITRO
if (!(i % 20)) os_sleep(500);
#endif
p = p->next;
++i;
}
@ -719,9 +700,6 @@ static int memory_tracker_lock_mutex()
ret = WaitForSingleObject(memtrack.mutex, INFINITE);
#elif defined(VXWORKS)
ret = sem_take(memtrack.mutex, WAIT_FOREVER);
#elif defined(NDS_NITRO)
os_lock_mutex(&memtrack.mutex);
ret = 0;
#endif
if (ret)
@ -754,9 +732,6 @@ static int memory_tracker_unlock_mutex()
ret = !ReleaseMutex(memtrack.mutex);
#elif defined(VXWORKS)
ret = sem_give(memtrack.mutex);
#elif defined(NDS_NITRO)
os_unlock_mutex(&memtrack.mutex);
ret = 0;
#endif
if (ret)

View File

@ -1,221 +0,0 @@
/*
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
/****************************************************************************
*
* Module Title : yv12extend.c
*
* Description :
*
***************************************************************************/
/****************************************************************************
* Header Files
****************************************************************************/
#include "vpx_scale/yv12config.h"
#include "vpx_mem/vpx_mem.h"
#include <nitro.h>
#include <nitro/mi.h>
#include <nitro/itcm_begin.h>
//---- DMA Number
#define DMA_NO 3
/****************************************************************************
* Exports
****************************************************************************/
/****************************************************************************
*
****************************************************************************/
void
vp8_yv12_extend_frame_borders(YV12_BUFFER_CONFIG *ybf)
{
int i;
unsigned char *src_ptr1, *src_ptr2;
unsigned char *dest_ptr1, *dest_ptr2;
unsigned int Border;
int plane_stride;
int plane_height;
int plane_width;
/***********/
/* Y Plane */
/***********/
Border = ybf->border;
plane_stride = ybf->y_stride;
plane_height = ybf->y_height;
plane_width = ybf->y_width;
// copy the left and right most columns out
src_ptr1 = ybf->y_buffer;
src_ptr2 = src_ptr1 + plane_width - 1;
dest_ptr1 = src_ptr1 - Border;
dest_ptr2 = src_ptr2 + 1;
for (i = 0; i < plane_height; i++)
{
mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border);
mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border);
src_ptr1 += plane_stride;
src_ptr2 += plane_stride;
dest_ptr1 += plane_stride;
dest_ptr2 += plane_stride;
}
// Now copy the top and bottom source lines into each line of the respective borders
src_ptr1 = ybf->y_buffer - Border;
src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
dest_ptr1 = src_ptr1 - (Border * plane_stride);
dest_ptr2 = src_ptr2 + plane_stride;
for (i = 0; i < (int)Border; i++)
{
mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride);
mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride);
dest_ptr1 += plane_stride;
dest_ptr2 += plane_stride;
}
plane_stride /= 2;
plane_height /= 2;
plane_width /= 2;
Border /= 2;
/***********/
/* U Plane */
/***********/
// copy the left and right most columns out
src_ptr1 = ybf->u_buffer;
src_ptr2 = src_ptr1 + plane_width - 1;
dest_ptr1 = src_ptr1 - Border;
dest_ptr2 = src_ptr2 + 1;
for (i = 0; i < plane_height; i++)
{
mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border);
mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border);
src_ptr1 += plane_stride;
src_ptr2 += plane_stride;
dest_ptr1 += plane_stride;
dest_ptr2 += plane_stride;
}
// Now copy the top and bottom source lines into each line of the respective borders
src_ptr1 = ybf->u_buffer - Border;
src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
dest_ptr1 = src_ptr1 - (Border * plane_stride);
dest_ptr2 = src_ptr2 + plane_stride;
for (i = 0; i < (int)(Border); i++)
{
mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride);
mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride);
dest_ptr1 += plane_stride;
dest_ptr2 += plane_stride;
}
/***********/
/* V Plane */
/***********/
// copy the left and right most columns out
src_ptr1 = ybf->v_buffer;
src_ptr2 = src_ptr1 + plane_width - 1;
dest_ptr1 = src_ptr1 - Border;
dest_ptr2 = src_ptr2 + 1;
for (i = 0; i < plane_height; i++)
{
mi_cpu_fill8(dest_ptr1, src_ptr1[0], Border);
mi_cpu_fill8(dest_ptr2, src_ptr2[0], Border);
src_ptr1 += plane_stride;
src_ptr2 += plane_stride;
dest_ptr1 += plane_stride;
dest_ptr2 += plane_stride;
}
// Now copy the top and bottom source lines into each line of the respective borders
src_ptr1 = ybf->v_buffer - Border;
src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride;
dest_ptr1 = src_ptr1 - (Border * plane_stride);
dest_ptr2 = src_ptr2 + plane_stride;
for (i = 0; i < (int)(Border); i++)
{
mi_cpu_copy_fast(src_ptr1, dest_ptr1, plane_stride);
mi_cpu_copy_fast(src_ptr2, dest_ptr2, plane_stride);
dest_ptr1 += plane_stride;
dest_ptr2 += plane_stride;
}
}
/****************************************************************************
*
* ROUTINE : vp8_yv12_copy_frame
*
* INPUTS :
*
* OUTPUTS : None.
*
* RETURNS : void
*
* FUNCTION : Copies the source image into the destination image and
* updates the destination's UMV borders.
*
* SPECIAL NOTES : The frames are assumed to be identical in size.
*
****************************************************************************/
void
vp8_yv12_copy_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc)
{
int yplane_size = (src_ybc->y_height + 2 * src_ybc->border) * (src_ybc->y_stride);
int mem_size = (yplane_size * 3 / 2) + (src_ybc->y_stride * 2);
mi_cpu_copy_fast(src_ybc->buffer_alloc, dst_ybc->buffer_alloc, mem_size);
/* unsigned char *src_y, *dst_y;
unsigned char *src_u, *dst_u;
unsigned char *src_v, *dst_v;
int yheight, uv_height;
int ystride, uv_stride;
int border;
int yoffset, uvoffset;
border = src_ybc->border;
yheight = src_ybc->y_height;
uv_height = src_ybc->uv_height;
ystride = src_ybc->y_stride;
uv_stride = src_ybc->uv_stride;
yoffset = border * (ystride + 1);
uvoffset = border/2 * (uv_stride + 1);
src_y = src_ybc->y_buffer - yoffset;
dst_y = dst_ybc->y_buffer - yoffset;
src_u = src_ybc->u_buffer - uvoffset;
dst_u = dst_ybc->u_buffer - uvoffset;
src_v = src_ybc->v_buffer - uvoffset;
dst_v = dst_ybc->v_buffer - uvoffset;
mi_cpu_copy_fast (src_y, dst_y, ystride * (yheight + 2 * border));
mi_cpu_copy_fast (src_u, dst_u, uv_stride * (uv_height + border));
mi_cpu_copy_fast (src_v, dst_v, uv_stride * (uv_height + border));
*/
}
#include <nitro/itcm_end.h>

View File

@ -24,9 +24,12 @@ vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf)
{
if (ybf)
{
duck_free(ybf->buffer_alloc);
vpx_free(ybf->buffer_alloc);
ybf->buffer_alloc = 0;
/* buffer_alloc isn't accessed by most functions. Rather y_buffer,
u_buffer and v_buffer point to buffer_alloc and are used. Clear out
all of this so that a freed pointer isn't inadvertently used */
vpx_memset (ybf, 0, sizeof (YV12_BUFFER_CONFIG));
}
else
{
@ -44,38 +47,37 @@ vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int
{
/*NOTE:*/
int yplane_size = (height + 2 * border) * (width + 2 * border);
int uvplane_size = ((1 + height) / 2 + border) * ((1 + width) / 2 + border);
if (ybf)
{
int uv_width = width >> 1;
int uv_height = height >> 1;
int yplane_size = (height + 2 * border) * (width + 2 * border);
int uvplane_size = (uv_height + border) * (uv_width + border);
vp8_yv12_de_alloc_frame_buffer(ybf);
/* only support allocating buffers that have
a height and width that are multiples of 16 */
if ((width & 0xf) | (height & 0xf))
return -3;
ybf->y_width = width;
ybf->y_height = height;
ybf->y_stride = width + 2 * border;
ybf->uv_width = (1 + width) / 2;
ybf->uv_height = (1 + height) / 2;
ybf->uv_stride = ybf->uv_width + border;
ybf->uv_width = uv_width;
ybf->uv_height = uv_height;
ybf->uv_stride = uv_width + border;
ybf->border = border;
ybf->frame_size = yplane_size + 2 * uvplane_size;
/* Added 2 extra lines to framebuffer so that copy12x12 doesn't fail
* when we have a large motion vector in V on the last v block.
* Note : We never use these pixels anyway so this doesn't hurt.
*/
ybf->buffer_alloc = (unsigned char *) duck_memalign(32, ybf->frame_size + (ybf->y_stride * 2) + 32, 0);
ybf->buffer_alloc = (unsigned char *) vpx_memalign(32, ybf->frame_size);
if (ybf->buffer_alloc == NULL)
return -1;
ybf->y_buffer = ybf->buffer_alloc + (border * ybf->y_stride) + border;
if (yplane_size & 0xf)
yplane_size += 16 - (yplane_size & 0xf);
ybf->u_buffer = ybf->buffer_alloc + yplane_size + (border / 2 * ybf->uv_stride) + border / 2;
ybf->v_buffer = ybf->buffer_alloc + yplane_size + uvplane_size + (border / 2 * ybf->uv_stride) + border / 2;