Extends temporal filtering to work for 422 data

This is needed for profiles 1 and 2.

Change-Id: I5dd7644c2932d055ab89e050d4be7d4117cd1028
Author: Deb Mukherjee
Date:   2014-05-20 10:48:54 -07:00
Commit: a185bc3350
Parent: 3946cdfdd4

3 changed files with 52 additions and 46 deletions
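The change removes the 4:2:0-only assumption from the temporal filter: block dimensions are passed as an independent width and height instead of a single square block_size, and the chroma block size is derived from both subsampling factors. A minimal standalone sketch of that arithmetic follows; only the 16 >> subsampling shift and the mb_uv_width / mb_uv_height names come from the diff below, while the subsampling values listed are illustrative:

    #include <stdio.h>

    /* Chroma block dimensions for one 16x16 luma macroblock, mirroring the
     * arithmetic added in temporal_filter_iterate_c:
     *   mb_uv_height = 16 >> subsampling_y;
     *   mb_uv_width  = 16 >> subsampling_x;
     * (illustrative format table; not taken from the libvpx sources) */
    int main(void) {
      const struct { const char *name; int ss_x, ss_y; } fmt[] = {
        { "4:2:0", 1, 1 },   /* 8x8 chroma block per macroblock   */
        { "4:2:2", 1, 0 },   /* 8x16 chroma block per macroblock  */
        { "4:4:4", 0, 0 },   /* 16x16 chroma block per macroblock */
      };
      for (int i = 0; i < 3; i++) {
        const int mb_uv_width  = 16 >> fmt[i].ss_x;
        const int mb_uv_height = 16 >> fmt[i].ss_y;
        printf("%s: %d x %d\n", fmt[i].name, mb_uv_width, mb_uv_height);
      }
      return 0;
    }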

vp9/common/vp9_rtcd_defs.pl

@@ -772,7 +772,7 @@ $vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4;
 add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
 specialize qw/vp9_full_range_search/;

-add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
 specialize qw/vp9_temporal_filter_apply sse2/;

 }
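In C terms, the prototype registered above changes as follows (restated from the add_proto line for readability; the old signature is shown only as a comment):

    #include <stdint.h>

    /* Old: a single block_size, so only square (16x16 luma / 8x8 chroma) blocks:
     *   void vp9_temporal_filter_apply(uint8_t *frame1, unsigned int stride,
     *                                  uint8_t *frame2, unsigned int block_size,
     *                                  int strength, int filter_weight,
     *                                  unsigned int *accumulator, uint16_t *count);
     */

    /* New: independent width and height, so the 8x16 chroma blocks of 4:2:2
     * material can be filtered as well. */
    void vp9_temporal_filter_apply(uint8_t *frame1, unsigned int stride,
                                   uint8_t *frame2, unsigned int block_width,
                                   unsigned int block_height, int strength,
                                   int filter_weight, unsigned int *accumulator,
                                   uint16_t *count);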

vp9/encoder/vp9_temporal_filter.c

@@ -34,7 +34,8 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                                             uint8_t *u_mb_ptr,
                                             uint8_t *v_mb_ptr,
                                             int stride,
-                                            int uv_block_size,
+                                            int uv_block_width,
+                                            int uv_block_height,
                                             int mv_row,
                                             int mv_col,
                                             uint8_t *pred,
@@ -47,7 +48,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
   enum mv_precision mv_precision_uv;
   int uv_stride;
-  if (uv_block_size == 8) {
+  if (uv_block_width == 8) {
     uv_stride = (stride + 1) >> 1;
     mv_precision_uv = MV_PRECISION_Q4;
   } else {
@@ -64,18 +65,18 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
                             kernel, MV_PRECISION_Q3, x, y);

   vp9_build_inter_predictor(u_mb_ptr, uv_stride,
-                            &pred[256], uv_block_size,
+                            &pred[256], uv_block_width,
                             &mv,
                             scale,
-                            uv_block_size, uv_block_size,
+                            uv_block_width, uv_block_height,
                             which_mv,
                             kernel, mv_precision_uv, x, y);

   vp9_build_inter_predictor(v_mb_ptr, uv_stride,
-                            &pred[512], uv_block_size,
+                            &pred[512], uv_block_width,
                             &mv,
                             scale,
-                            uv_block_size, uv_block_size,
+                            uv_block_width, uv_block_height,
                             which_mv,
                             kernel, mv_precision_uv, x, y);
 }
@@ -91,7 +92,8 @@ void vp9_temporal_filter_init() {
 void vp9_temporal_filter_apply_c(uint8_t *frame1,
                                  unsigned int stride,
                                  uint8_t *frame2,
-                                 unsigned int block_size,
+                                 unsigned int block_width,
+                                 unsigned int block_height,
                                  int strength,
                                  int filter_weight,
                                  unsigned int *accumulator,
@@ -101,8 +103,8 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
   int byte = 0;
   const int rounding = strength > 0 ? 1 << (strength - 1) : 0;

-  for (i = 0, k = 0; i < block_size; i++) {
-    for (j = 0; j < block_size; j++, k++) {
+  for (i = 0, k = 0; i < block_height; i++) {
+    for (j = 0; j < block_width; j++, k++) {
       int src_byte = frame1[byte];
       int pixel_value = *frame2++;

@@ -127,7 +129,7 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
       byte++;
     }
-    byte += stride - block_size;
+    byte += stride - block_width;
   }
 }
@@ -204,14 +206,12 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
   uint8_t *dst1, *dst2;
   DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3);
   const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
+  const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;

   // Save input state
   uint8_t* input_buffer[MAX_MB_PLANE];
   int i;
-  // TODO(aconverse): Add 4:2:2 support
-  assert(mbd->plane[1].subsampling_x == mbd->plane[1].subsampling_y);

   for (i = 0; i < MAX_MB_PLANE; i++)
     input_buffer[i] = mbd->plane[i].pre[0].buf;
@@ -275,7 +275,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
               cpi->frames[frame]->u_buffer + mb_uv_offset,
               cpi->frames[frame]->v_buffer + mb_uv_offset,
               cpi->frames[frame]->y_stride,
-              mb_uv_height,
+              mb_uv_width, mb_uv_height,
               mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
               mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
               predictor, scale,
@@ -283,16 +283,17 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
           // Apply the filter (YUV)
           vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
-                                    predictor, 16, strength, filter_weight,
+                                    predictor, 16, 16,
+                                    strength, filter_weight,
                                     accumulator, count);
           vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 256, mb_uv_height, strength,
+                                    predictor + 256,
+                                    mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 256,
                                     count + 256);
           vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                    predictor + 512, mb_uv_height, strength,
+                                    predictor + 512,
+                                    mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 512,
                                     count + 512);
         }
@@ -321,7 +322,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
       stride = cpi->alt_ref_buffer.uv_stride;
       byte = mb_uv_offset;
       for (i = 0, k = 256; i < mb_uv_height; i++) {
-        for (j = 0; j < mb_uv_height; j++, k++) {
+        for (j = 0; j < mb_uv_width; j++, k++) {
           int m = k + 256;

           // U
@@ -339,13 +340,13 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
           // move to next pixel
           byte++;
         }
-        byte += stride - mb_uv_height;
+        byte += stride - mb_uv_width;
       }
       mb_y_offset += 16;
-      mb_uv_offset += mb_uv_height;
+      mb_uv_offset += mb_uv_width;
     }
     mb_y_offset += 16 * (f->y_stride - mb_cols);
-    mb_uv_offset += mb_uv_height * (f->uv_stride - mb_cols);
+    mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
   }

   // Restore input state
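The trickiest part of the C change is the offset bookkeeping at the end of temporal_filter_iterate_c: the chroma offset now advances by mb_uv_width per macroblock and by mb_uv_height * uv_stride - mb_uv_width * mb_cols at the end of a macroblock row. A small self-checking sketch of that arithmetic (the frame dimensions and stride below are made up for illustration):

    #include <assert.h>

    /* Chroma offset bookkeeping as in temporal_filter_iterate_c after this
     * change: advance by mb_uv_width per macroblock, then by
     * mb_uv_height * uv_stride - mb_uv_width * mb_cols at the end of a
     * macroblock row, which lands on the first chroma sample of the next
     * macroblock row.  Frame size and stride are illustrative. */
    int main(void) {
      const int mb_cols = 45, mb_rows = 3;
      const int mb_uv_width = 8, mb_uv_height = 16;      /* 4:2:2 chroma block */
      const int uv_stride = mb_cols * mb_uv_width + 32;  /* padded stride */
      int mb_uv_offset = 0;

      for (int mb_row = 0; mb_row < mb_rows; mb_row++) {
        for (int mb_col = 0; mb_col < mb_cols; mb_col++) {
          /* Offset of the current macroblock's chroma block. */
          assert(mb_uv_offset ==
                 mb_row * mb_uv_height * uv_stride + mb_col * mb_uv_width);
          mb_uv_offset += mb_uv_width;                   /* next macroblock */
        }
        mb_uv_offset += mb_uv_height * uv_stride - mb_uv_width * mb_cols;
      }
      return 0;
    }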

vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm

@@ -15,41 +15,45 @@
 ; (unsigned char *frame1,       | 0
 ;  unsigned int stride,         | 1
 ;  unsigned char *frame2,       | 2
-;  unsigned int block_size,     | 3
-;  int strength,                | 4
-;  int filter_weight,           | 5
-;  unsigned int *accumulator,   | 6
-;  unsigned short *count)       | 7
+;  unsigned int block_width,    | 3
+;  unsigned int block_height,   | 4
+;  int strength,                | 5
+;  int filter_weight,           | 6
+;  unsigned int *accumulator,   | 7
+;  unsigned short *count)       | 8
 global sym(vp9_temporal_filter_apply_sse2) PRIVATE
 sym(vp9_temporal_filter_apply_sse2):

     push rbp
     mov rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
+    SHADOW_ARGS_TO_STACK 9
     SAVE_XMM 7
     GET_GOT rbx
     push rsi
     push rdi
     ALIGN_STACK 16, rax
-    %define block_size 0
-    %define strength 16
-    %define filter_weight 32
-    %define rounding_bit 48
-    %define rbp_backup 64
-    %define stack_size 80
+    %define block_width 0
+    %define block_height 16
+    %define strength 32
+    %define filter_weight 48
+    %define rounding_bit 64
+    %define rbp_backup 80
+    %define stack_size 96
     sub rsp, stack_size
     mov [rsp + rbp_backup], rbp
     ; end prolog

     mov rdx, arg(3)
-    mov [rsp + block_size], rdx
-    movd xmm6, arg(4)
+    mov [rsp + block_width], rdx
+    mov rdx, arg(4)
+    mov [rsp + block_height], rdx
+    movd xmm6, arg(5)
     movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

     ; calculate the rounding bit outside the loop
     ; 0x8000 >> (16 - strength)
     mov rdx, 16
-    sub rdx, arg(4) ; 16 - strength
+    sub rdx, arg(5) ; 16 - strength
     movq xmm4, rdx ; can't use rdx w/ shift
     movdqa xmm5, [GLOBAL(_const_top_bit)]
     psrlw xmm5, xmm4
@@ -57,11 +61,11 @@ sym(vp9_temporal_filter_apply_sse2):
     mov rsi, arg(0) ; src/frame1
     mov rdx, arg(2) ; predictor frame
-    mov rdi, arg(6) ; accumulator
-    mov rax, arg(7) ; count
+    mov rdi, arg(7) ; accumulator
+    mov rax, arg(8) ; count

     ; dup the filter weight and store for later
-    movd xmm0, arg(5) ; filter_weight
+    movd xmm0, arg(6) ; filter_weight
     pshuflw xmm0, xmm0, 0
     punpcklwd xmm0, xmm0
     movdqa [rsp + filter_weight], xmm0
@@ -69,10 +73,11 @@ sym(vp9_temporal_filter_apply_sse2):
     mov rbp, arg(1) ; stride
     pxor xmm7, xmm7 ; zero for extraction

-    lea rcx, [rdx + 16*16*1]
-    cmp dword ptr [rsp + block_size], 8
+    mov rcx, [rsp + block_width]
+    imul rcx, [rsp + block_height]
+    add rcx, rdx
+    cmp dword ptr [rsp + block_width], 8
     jne .temporal_filter_apply_load_16
-    lea rcx, [rdx + 8*8*1]

 .temporal_filter_apply_load_8:
     movq xmm0, [rsi] ; first row
@@ -178,7 +183,7 @@ sym(vp9_temporal_filter_apply_sse2):
     cmp rdx, rcx
     je .temporal_filter_apply_epilog

     pxor xmm7, xmm7 ; zero for extraction
-    cmp dword ptr [rsp + block_size], 16
+    cmp dword ptr [rsp + block_width], 16
     je .temporal_filter_apply_load_16
     jmp .temporal_filter_apply_load_8
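The assembly changes are mostly a renumbering of the incoming arguments (everything after block_width shifts by one) plus one real logic change: the end pointer for the predictor walk is no longer hard-coded to frame2 + 16*16 (or frame2 + 8*8 for the 8-wide path) but computed from the two dimensions. Roughly the C equivalent of the new prologue lines (a sketch; names follow the %defines above):

    #include <stddef.h>
    #include <stdint.h>

    /* C restatement of:
     *     mov   rcx, [rsp + block_width]
     *     imul  rcx, [rsp + block_height]
     *     add   rcx, rdx                    ; rdx holds frame2, the predictor
     * The loop exits once the predictor pointer reaches this address, so a
     * non-square 8x16 chroma block is walked correctly. */
    const uint8_t *predictor_end(const uint8_t *frame2,
                                 unsigned int block_width,
                                 unsigned int block_height) {
      return frame2 + (size_t)block_width * block_height;
    }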