Extends temporal filtering to work for 4:2:2 data
This is needed for profiles 1 and 2.
Change-Id: I5dd7644c2932d055ab89e050d4be7d4117cd1028
This commit is contained in:
parent
3946cdfdd4
commit
a185bc3350
@ -772,7 +772,7 @@ $vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4;
|
|||||||
add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
|
add_proto qw/int vp9_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
|
||||||
specialize qw/vp9_full_range_search/;
|
specialize qw/vp9_full_range_search/;
|
||||||
|
|
||||||
add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
|
add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
|
||||||
specialize qw/vp9_temporal_filter_apply sse2/;
|
specialize qw/vp9_temporal_filter_apply sse2/;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -34,7 +34,8 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
|
|||||||
uint8_t *u_mb_ptr,
|
uint8_t *u_mb_ptr,
|
||||||
uint8_t *v_mb_ptr,
|
uint8_t *v_mb_ptr,
|
||||||
int stride,
|
int stride,
|
||||||
int uv_block_size,
|
int uv_block_width,
|
||||||
|
int uv_block_height,
|
||||||
int mv_row,
|
int mv_row,
|
||||||
int mv_col,
|
int mv_col,
|
||||||
uint8_t *pred,
|
uint8_t *pred,
|
||||||
@ -47,7 +48,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
|
|||||||
|
|
||||||
enum mv_precision mv_precision_uv;
|
enum mv_precision mv_precision_uv;
|
||||||
int uv_stride;
|
int uv_stride;
|
||||||
if (uv_block_size == 8) {
|
if (uv_block_width == 8) {
|
||||||
uv_stride = (stride + 1) >> 1;
|
uv_stride = (stride + 1) >> 1;
|
||||||
mv_precision_uv = MV_PRECISION_Q4;
|
mv_precision_uv = MV_PRECISION_Q4;
|
||||||
} else {
|
} else {
|
||||||
@ -64,18 +65,18 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd,
|
|||||||
kernel, MV_PRECISION_Q3, x, y);
|
kernel, MV_PRECISION_Q3, x, y);
|
||||||
|
|
||||||
vp9_build_inter_predictor(u_mb_ptr, uv_stride,
|
vp9_build_inter_predictor(u_mb_ptr, uv_stride,
|
||||||
&pred[256], uv_block_size,
|
&pred[256], uv_block_width,
|
||||||
&mv,
|
&mv,
|
||||||
scale,
|
scale,
|
||||||
uv_block_size, uv_block_size,
|
uv_block_width, uv_block_height,
|
||||||
which_mv,
|
which_mv,
|
||||||
kernel, mv_precision_uv, x, y);
|
kernel, mv_precision_uv, x, y);
|
||||||
|
|
||||||
vp9_build_inter_predictor(v_mb_ptr, uv_stride,
|
vp9_build_inter_predictor(v_mb_ptr, uv_stride,
|
||||||
&pred[512], uv_block_size,
|
&pred[512], uv_block_width,
|
||||||
&mv,
|
&mv,
|
||||||
scale,
|
scale,
|
||||||
uv_block_size, uv_block_size,
|
uv_block_width, uv_block_height,
|
||||||
which_mv,
|
which_mv,
|
||||||
kernel, mv_precision_uv, x, y);
|
kernel, mv_precision_uv, x, y);
|
||||||
}
|
}
|
||||||
@ -91,7 +92,8 @@ void vp9_temporal_filter_init() {
|
|||||||
void vp9_temporal_filter_apply_c(uint8_t *frame1,
|
void vp9_temporal_filter_apply_c(uint8_t *frame1,
|
||||||
unsigned int stride,
|
unsigned int stride,
|
||||||
uint8_t *frame2,
|
uint8_t *frame2,
|
||||||
unsigned int block_size,
|
unsigned int block_width,
|
||||||
|
unsigned int block_height,
|
||||||
int strength,
|
int strength,
|
||||||
int filter_weight,
|
int filter_weight,
|
||||||
unsigned int *accumulator,
|
unsigned int *accumulator,
|
||||||
@ -101,8 +103,8 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
|
|||||||
int byte = 0;
|
int byte = 0;
|
||||||
const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
|
const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
|
||||||
|
|
||||||
for (i = 0, k = 0; i < block_size; i++) {
|
for (i = 0, k = 0; i < block_height; i++) {
|
||||||
for (j = 0; j < block_size; j++, k++) {
|
for (j = 0; j < block_width; j++, k++) {
|
||||||
int src_byte = frame1[byte];
|
int src_byte = frame1[byte];
|
||||||
int pixel_value = *frame2++;
|
int pixel_value = *frame2++;
|
||||||
|
|
||||||
@ -127,7 +129,7 @@ void vp9_temporal_filter_apply_c(uint8_t *frame1,
|
|||||||
byte++;
|
byte++;
|
||||||
}
|
}
|
||||||
|
|
||||||
byte += stride - block_size;
|
byte += stride - block_width;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -204,14 +206,12 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
|
|||||||
uint8_t *dst1, *dst2;
|
uint8_t *dst1, *dst2;
|
||||||
DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3);
|
DECLARE_ALIGNED_ARRAY(16, uint8_t, predictor, 16 * 16 * 3);
|
||||||
const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
|
const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
|
||||||
|
const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
|
||||||
|
|
||||||
// Save input state
|
// Save input state
|
||||||
uint8_t* input_buffer[MAX_MB_PLANE];
|
uint8_t* input_buffer[MAX_MB_PLANE];
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
// TODO(aconverse): Add 4:2:2 support
|
|
||||||
assert(mbd->plane[1].subsampling_x == mbd->plane[1].subsampling_y);
|
|
||||||
|
|
||||||
for (i = 0; i < MAX_MB_PLANE; i++)
|
for (i = 0; i < MAX_MB_PLANE; i++)
|
||||||
input_buffer[i] = mbd->plane[i].pre[0].buf;
|
input_buffer[i] = mbd->plane[i].pre[0].buf;
|
||||||
|
|
||||||
@ -275,7 +275,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
|
|||||||
cpi->frames[frame]->u_buffer + mb_uv_offset,
|
cpi->frames[frame]->u_buffer + mb_uv_offset,
|
||||||
cpi->frames[frame]->v_buffer + mb_uv_offset,
|
cpi->frames[frame]->v_buffer + mb_uv_offset,
|
||||||
cpi->frames[frame]->y_stride,
|
cpi->frames[frame]->y_stride,
|
||||||
mb_uv_height,
|
mb_uv_width, mb_uv_height,
|
||||||
mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
|
mbd->mi[0]->bmi[0].as_mv[0].as_mv.row,
|
||||||
mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
|
mbd->mi[0]->bmi[0].as_mv[0].as_mv.col,
|
||||||
predictor, scale,
|
predictor, scale,
|
||||||
@ -283,16 +283,17 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
|
|||||||
|
|
||||||
// Apply the filter (YUV)
|
// Apply the filter (YUV)
|
||||||
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
|
vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
|
||||||
predictor, 16, strength, filter_weight,
|
predictor, 16, 16,
|
||||||
|
strength, filter_weight,
|
||||||
accumulator, count);
|
accumulator, count);
|
||||||
|
|
||||||
vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
|
vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
|
||||||
predictor + 256, mb_uv_height, strength,
|
predictor + 256,
|
||||||
|
mb_uv_width, mb_uv_height, strength,
|
||||||
filter_weight, accumulator + 256,
|
filter_weight, accumulator + 256,
|
||||||
count + 256);
|
count + 256);
|
||||||
|
|
||||||
vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
|
vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
|
||||||
predictor + 512, mb_uv_height, strength,
|
predictor + 512,
|
||||||
|
mb_uv_width, mb_uv_height, strength,
|
||||||
filter_weight, accumulator + 512,
|
filter_weight, accumulator + 512,
|
||||||
count + 512);
|
count + 512);
|
||||||
}
|
}
|
||||||
@ -321,7 +322,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
|
|||||||
stride = cpi->alt_ref_buffer.uv_stride;
|
stride = cpi->alt_ref_buffer.uv_stride;
|
||||||
byte = mb_uv_offset;
|
byte = mb_uv_offset;
|
||||||
for (i = 0, k = 256; i < mb_uv_height; i++) {
|
for (i = 0, k = 256; i < mb_uv_height; i++) {
|
||||||
for (j = 0; j < mb_uv_height; j++, k++) {
|
for (j = 0; j < mb_uv_width; j++, k++) {
|
||||||
int m = k + 256;
|
int m = k + 256;
|
||||||
|
|
||||||
// U
|
// U
|
||||||
@ -339,13 +340,13 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi,
|
|||||||
// move to next pixel
|
// move to next pixel
|
||||||
byte++;
|
byte++;
|
||||||
}
|
}
|
||||||
byte += stride - mb_uv_height;
|
byte += stride - mb_uv_width;
|
||||||
}
|
}
|
||||||
mb_y_offset += 16;
|
mb_y_offset += 16;
|
||||||
mb_uv_offset += mb_uv_height;
|
mb_uv_offset += mb_uv_width;
|
||||||
}
|
}
|
||||||
mb_y_offset += 16 * (f->y_stride - mb_cols);
|
mb_y_offset += 16 * (f->y_stride - mb_cols);
|
||||||
mb_uv_offset += mb_uv_height * (f->uv_stride - mb_cols);
|
mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Restore input state
|
// Restore input state
|
||||||
|
@ -15,41 +15,45 @@
|
|||||||
; (unsigned char *frame1, | 0
|
; (unsigned char *frame1, | 0
|
||||||
; unsigned int stride, | 1
|
; unsigned int stride, | 1
|
||||||
; unsigned char *frame2, | 2
|
; unsigned char *frame2, | 2
|
||||||
; unsigned int block_size, | 3
|
; unsigned int block_width, | 3
|
||||||
; int strength, | 4
|
; unsigned int block_height, | 4
|
||||||
; int filter_weight, | 5
|
; int strength, | 5
|
||||||
; unsigned int *accumulator, | 6
|
; int filter_weight, | 6
|
||||||
; unsigned short *count) | 7
|
; unsigned int *accumulator, | 7
|
||||||
|
; unsigned short *count) | 8
|
||||||
global sym(vp9_temporal_filter_apply_sse2) PRIVATE
|
global sym(vp9_temporal_filter_apply_sse2) PRIVATE
|
||||||
sym(vp9_temporal_filter_apply_sse2):
|
sym(vp9_temporal_filter_apply_sse2):
|
||||||
|
|
||||||
push rbp
|
push rbp
|
||||||
mov rbp, rsp
|
mov rbp, rsp
|
||||||
SHADOW_ARGS_TO_STACK 8
|
SHADOW_ARGS_TO_STACK 9
|
||||||
SAVE_XMM 7
|
SAVE_XMM 7
|
||||||
GET_GOT rbx
|
GET_GOT rbx
|
||||||
push rsi
|
push rsi
|
||||||
push rdi
|
push rdi
|
||||||
ALIGN_STACK 16, rax
|
ALIGN_STACK 16, rax
|
||||||
%define block_size 0
|
%define block_width 0
|
||||||
%define strength 16
|
%define block_height 16
|
||||||
%define filter_weight 32
|
%define strength 32
|
||||||
%define rounding_bit 48
|
%define filter_weight 48
|
||||||
%define rbp_backup 64
|
%define rounding_bit 64
|
||||||
%define stack_size 80
|
%define rbp_backup 80
|
||||||
|
%define stack_size 96
|
||||||
sub rsp, stack_size
|
sub rsp, stack_size
|
||||||
mov [rsp + rbp_backup], rbp
|
mov [rsp + rbp_backup], rbp
|
||||||
; end prolog
|
; end prolog
|
||||||
|
|
||||||
mov rdx, arg(3)
|
mov rdx, arg(3)
|
||||||
mov [rsp + block_size], rdx
|
mov [rsp + block_width], rdx
|
||||||
movd xmm6, arg(4)
|
mov rdx, arg(4)
|
||||||
|
mov [rsp + block_height], rdx
|
||||||
|
movd xmm6, arg(5)
|
||||||
movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
|
movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
|
||||||
|
|
||||||
; calculate the rounding bit outside the loop
|
; calculate the rounding bit outside the loop
|
||||||
; 0x8000 >> (16 - strength)
|
; 0x8000 >> (16 - strength)
|
||||||
mov rdx, 16
|
mov rdx, 16
|
||||||
sub rdx, arg(4) ; 16 - strength
|
sub rdx, arg(5) ; 16 - strength
|
||||||
movq xmm4, rdx ; can't use rdx w/ shift
|
movq xmm4, rdx ; can't use rdx w/ shift
|
||||||
movdqa xmm5, [GLOBAL(_const_top_bit)]
|
movdqa xmm5, [GLOBAL(_const_top_bit)]
|
||||||
psrlw xmm5, xmm4
|
psrlw xmm5, xmm4
|
||||||
@ -57,11 +61,11 @@ sym(vp9_temporal_filter_apply_sse2):
|
|||||||
|
|
||||||
mov rsi, arg(0) ; src/frame1
|
mov rsi, arg(0) ; src/frame1
|
||||||
mov rdx, arg(2) ; predictor frame
|
mov rdx, arg(2) ; predictor frame
|
||||||
mov rdi, arg(6) ; accumulator
|
mov rdi, arg(7) ; accumulator
|
||||||
mov rax, arg(7) ; count
|
mov rax, arg(8) ; count
|
||||||
|
|
||||||
; dup the filter weight and store for later
|
; dup the filter weight and store for later
|
||||||
movd xmm0, arg(5) ; filter_weight
|
movd xmm0, arg(6) ; filter_weight
|
||||||
pshuflw xmm0, xmm0, 0
|
pshuflw xmm0, xmm0, 0
|
||||||
punpcklwd xmm0, xmm0
|
punpcklwd xmm0, xmm0
|
||||||
movdqa [rsp + filter_weight], xmm0
|
movdqa [rsp + filter_weight], xmm0
|
||||||
@ -69,10 +73,11 @@ sym(vp9_temporal_filter_apply_sse2):
|
|||||||
mov rbp, arg(1) ; stride
|
mov rbp, arg(1) ; stride
|
||||||
pxor xmm7, xmm7 ; zero for extraction
|
pxor xmm7, xmm7 ; zero for extraction
|
||||||
|
|
||||||
lea rcx, [rdx + 16*16*1]
|
mov rcx, [rsp + block_width]
|
||||||
cmp dword ptr [rsp + block_size], 8
|
imul rcx, [rsp + block_height]
|
||||||
|
add rcx, rdx
|
||||||
|
cmp dword ptr [rsp + block_width], 8
|
||||||
jne .temporal_filter_apply_load_16
|
jne .temporal_filter_apply_load_16
|
||||||
lea rcx, [rdx + 8*8*1]
|
|
||||||
|
|
||||||
.temporal_filter_apply_load_8:
|
.temporal_filter_apply_load_8:
|
||||||
movq xmm0, [rsi] ; first row
|
movq xmm0, [rsi] ; first row
|
||||||
@ -178,7 +183,7 @@ sym(vp9_temporal_filter_apply_sse2):
|
|||||||
cmp rdx, rcx
|
cmp rdx, rcx
|
||||||
je .temporal_filter_apply_epilog
|
je .temporal_filter_apply_epilog
|
||||||
pxor xmm7, xmm7 ; zero for extraction
|
pxor xmm7, xmm7 ; zero for extraction
|
||||||
cmp dword ptr [rsp + block_size], 16
|
cmp dword ptr [rsp + block_width], 16
|
||||||
je .temporal_filter_apply_load_16
|
je .temporal_filter_apply_load_16
|
||||||
jmp .temporal_filter_apply_load_8
|
jmp .temporal_filter_apply_load_8
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user