Bug fix: ssse3 version of subpixel did not match C code

A 16-bit overflow condition occurs when using the EIGHTTAP_SMOOTH filters
(vp9_sub_pel_filters_8lp). Changed the order of the adds to fix this problem.
Also added ssse3 support for 4x4 subpixel filtering.

Change-Id: I475eaadae920794c2de5e01e9735c059a856518e
This commit is contained in:
Scott LaVarnway 2013-02-09 15:15:14 -08:00
parent 6dfc95fe63
commit eda30b410e
2 changed files with 265 additions and 13 deletions

View File

@ -65,6 +65,20 @@ void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
unsigned int output_height,
const short *filter);
/* 8-tap vertical filter for a 4-pixel-wide column (SSSE3 assembly routine).
 * src_ptr must point at the first of the 8 source rows that contribute to the
 * first output row; the callers in this file pass a pointer 3 rows above the
 * first output row. output_height rows of 4 bytes each are written to
 * output_ptr, advancing by out_pitch per row. filter holds 8 16-bit taps. */
void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
/* 8-tap horizontal filter for a 4-pixel-wide column (SSSE3 assembly routine).
 * For every row the routine reads source bytes starting at src_ptr - 3, then
 * writes 4 bytes to output_ptr. src_ptr advances by src_pitch and output_ptr
 * by out_pitch per row, for output_height rows. filter holds 8 16-bit taps. */
void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
unsigned int out_pitch,
unsigned int output_height,
const short *filter);
void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int16_t *filter_x, int x_step_q4,
@ -87,6 +101,14 @@ void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
dst += 8;
w -= 8;
}
while (w >= 4) {
vp9_filter_block1d4_h8_ssse3(src, src_stride,
dst, dst_stride,
h, filter_x);
src += 4;
dst += 4;
w -= 4;
}
}
if (w) {
vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
@ -117,6 +139,14 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
dst += 8;
w -= 8;
}
while (w >= 4) {
vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
dst, dst_stride,
h, filter_y);
src += 4;
dst += 4;
w -= 4;
}
}
if (w) {
vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
@ -156,6 +186,15 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
h, filter_y);
return;
}
if (w == 4) {
vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
fdata2, 16,
h + 7, filter_x);
vp9_filter_block1d4_v8_ssse3(fdata2, 16,
dst, dst_stride,
h, filter_y);
return;
}
}
vp9_convolve8_c(src, src_stride, dst, dst_stride,
filter_x, x_step_q4, filter_y, y_step_q4,

View File

@ -21,6 +21,124 @@
;
;*************************************************************************************/
;void vp9_filter_block1d8_v8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pitch,
; unsigned char *output_ptr,
; unsigned int out_pitch,
; unsigned int output_height,
; short *filter
;)
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
push rsi
push rdi
push rbx
; end prolog
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
movd xmm5, rcx
packsswb xmm4, xmm4
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
pshuflw xmm2, xmm4, 10101010b ;k4_k5
pshuflw xmm3, xmm4, 11111111b ;k6_k7
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
movdqa k0k1, xmm0
movdqa k2k3, xmm1
pshufd xmm5, xmm5, 0
movdqa k4k5, xmm2
movdqa k6k7, xmm3
movdqa krd, xmm5
movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
%if ABI_IS_32BIT=0
movsxd r8, DWORD PTR arg(3) ;out_pitch
%endif
mov rax, rsi
movsxd rcx, DWORD PTR arg(4) ;output_height
add rax, rdx
lea rbx, [rdx + rdx*4]
add rbx, rdx ;pitch * 6
.vp9_filter_block1d4_v8_ssse3_loop:
movd xmm0, [rsi] ;A
movd xmm1, [rsi + rdx] ;B
movd xmm2, [rsi + rdx * 2] ;C
movd xmm3, [rax + rdx * 2] ;D
movd xmm4, [rsi + rdx * 4] ;E
movd xmm5, [rax + rdx * 4] ;F
punpcklbw xmm0, xmm1 ;A B
punpcklbw xmm2, xmm3 ;C D
punpcklbw xmm4, xmm5 ;E F
movd xmm6, [rsi + rbx] ;G
movd xmm7, [rax + rbx] ;H
pmaddubsw xmm0, k0k1
pmaddubsw xmm2, k2k3
punpcklbw xmm6, xmm7 ;G H
pmaddubsw xmm4, k4k5
pmaddubsw xmm6, k6k7
paddsw xmm0, xmm2
paddsw xmm0, krd
paddsw xmm4, xmm6
paddsw xmm0, xmm4
psraw xmm0, 7
packuswb xmm0, xmm0
add rsi, rdx
add rax, rdx
movd [rdi], xmm0
%if ABI_IS_32BIT
add rdi, DWORD PTR arg(3) ;out_pitch
%else
add rdi, r8
%endif
dec rcx
jnz .vp9_filter_block1d4_v8_ssse3_loop
add rsp, 16*5
pop rsp
pop rbx
; begin epilog
pop rdi
pop rsi
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1d8_v8_ssse3
;(
; unsigned char *src_ptr,
@ -289,6 +407,110 @@ sym(vp9_filter_block1d16_v8_ssse3):
pop rbp
ret
;void vp9_filter_block1d4_h8_ssse3
;(
; unsigned char *src_ptr,
; unsigned int src_pixels_per_line,
; unsigned char *output_ptr,
; unsigned int output_pitch,
; unsigned int output_height,
; short *filter
;)
;
; Horizontal 8-tap filter producing 4 output pixels per row for output_height
; rows. Per row, 16 source bytes covering src_ptr[-3 .. 12] are loaded,
; rearranged with four pshufb shuffle tables, multiplied by the tap pairs,
; summed with saturating 16-bit adds plus a rounding constant of 64, shifted
; right by 7 and stored as 4 unsigned-saturated bytes at output_ptr.
;
; Requirements visible from the code:
;  - bytes src_ptr - 3 through src_ptr + 12 must be readable on every row.
;  - filter is read with movdqa, so it must be 16-byte aligned.
;  - the taps are narrowed to signed bytes with saturation (packsswb).
;  - the row loop tests its counter at the bottom, so output_height must be >= 1.
;
; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK, GLOBAL(), arg() and the
; shuf_t* tables are defined outside this view; confirm details against them.
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
; presumably sets up rbx for the GLOBAL() table references below in
; position-independent builds - confirm against the macro definition.
GET_GOT rbx
push rsi
push rdi
; end prolog
; Reserve five 16-byte stack slots: four broadcast tap pairs plus the rounding
; constant. They are accessed with movdqa and as pmaddubsw/paddsw memory
; operands, so they rely on the alignment established by ALIGN_STACK.
ALIGN_STACK 16, rax
sub rsp, 16*5
%define k0k1 [rsp + 16*0]
%define k2k3 [rsp + 16*1]
%define k4k5 [rsp + 16*2]
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
; 0x0400040 == 0x00400040: the value 64 in each of the two low 16-bit words,
; used as the rounding term for the >> 7 below.
mov rcx, 0x0400040
movdqa xmm4, [rdx] ;load filters
movd xmm5, rcx
; Narrow the eight 16-bit taps to signed bytes: the low words of xmm4 become
; the byte pairs (k0,k1) (k2,k3) (k4,k5) (k6,k7).
packsswb xmm4, xmm4
; Broadcast one byte pair into the four low words of each register ...
pshuflw xmm0, xmm4, 0b ;k0_k1
pshuflw xmm1, xmm4, 01010101b ;k2_k3
pshuflw xmm2, xmm4, 10101010b ;k4_k5
pshuflw xmm3, xmm4, 11111111b ;k6_k7
; ... and copy the low quadword to the high quadword so all eight words match.
punpcklqdq xmm0, xmm0
punpcklqdq xmm1, xmm1
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
movdqa k0k1, xmm0
movdqa k2k3, xmm1
; Broadcast the rounding dword: every 16-bit word of xmm5 is now 64.
pshufd xmm5, xmm5, 0
movdqa k4k5, xmm2
movdqa k6k7, xmm3
movdqa krd, xmm5
; From here on: rax = source stride, rdx = output stride, rcx = rows remaining.
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
.filter_block1d4_h8_rowloop_ssse3:
; Two 8-byte loads joined into one register holding src[-3 .. 12].
movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
punpcklqdq xmm0, xmm3
; Each tap pair gets its own copy of the 16 source bytes, shuffled by the
; matching table (presumably into adjacent-pixel byte pairs - tables are not
; visible here), then multiplied: unsigned pixels times signed taps, adjacent
; products added with signed saturation into 16-bit lanes.
movdqa xmm1, xmm0
pshufb xmm0, [GLOBAL(shuf_t0t1)]
pmaddubsw xmm0, k0k1
movdqa xmm2, xmm1
pshufb xmm1, [GLOBAL(shuf_t2t3)]
pmaddubsw xmm1, k2k3
movdqa xmm4, xmm2
pshufb xmm2, [GLOBAL(shuf_t4t5)]
pmaddubsw xmm2, k4k5
pshufb xmm4, [GLOBAL(shuf_t6t7)]
pmaddubsw xmm4, k6k7
; Accumulate as k0k1 + k2k3, then + k6k7, then + k4k5, then + rounding.
; paddsw saturates, so this order is significant when an intermediate sum
; would exceed the signed 16-bit range; do not reorder without re-checking
; the result against the C reference.
paddsw xmm0, xmm1
paddsw xmm0, xmm4
paddsw xmm0, xmm2
paddsw xmm0, krd
; Scale back by 2^7 and clamp to unsigned 8-bit.
psraw xmm0, 7
packuswb xmm0, xmm0
; Advance to the next source row, store 4 output pixels, advance the output.
; lea is used for the pointer updates, which leaves the flags untouched.
lea rsi, [rsi + rax]
movd [rdi], xmm0
lea rdi, [rdi + rdx]
dec rcx
jnz .filter_block1d4_h8_rowloop_ssse3
; Release the five local slots, then undo ALIGN_STACK (presumably it pushed the
; pre-alignment stack pointer, which pop rsp restores - confirm against macro).
add rsp, 16*5
pop rsp
; begin epilog
pop rdi
pop rsi
RESTORE_GOT
RESTORE_XMM
UNSHADOW_ARGS
pop rbp
ret
;void vp9_filter_block1d8_h8_ssse3
;(
; unsigned char *src_ptr,
@ -340,7 +562,7 @@ sym(vp9_filter_block1d8_h8_ssse3):
pshufd xmm5, xmm5, 0
movdqa k4k5, xmm2
movdqa k6k7, xmm3
; movdqa krd, xmm5
movdqa krd, xmm5
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
@ -349,10 +571,7 @@ sym(vp9_filter_block1d8_h8_ssse3):
.filter_block1d8_h8_rowloop_ssse3:
movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11
movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
;note: if we create a k0_k7 filter, we can save a pshufb
; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
punpcklqdq xmm0, xmm3
movdqa xmm1, xmm0
@ -371,9 +590,9 @@ sym(vp9_filter_block1d8_h8_ssse3):
pmaddubsw xmm4, k6k7
paddsw xmm0, xmm1
paddsw xmm0, xmm2
paddsw xmm0, xmm5
paddsw xmm0, xmm4
paddsw xmm0, xmm2
paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
@ -456,10 +675,7 @@ sym(vp9_filter_block1d16_h8_ssse3):
.filter_block1d16_h8_rowloop_ssse3:
movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11
movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
;note: if we create a k0_k7 filter, we can save a pshufb
; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
punpcklqdq xmm0, xmm3
movdqa xmm1, xmm0
@ -486,10 +702,7 @@ sym(vp9_filter_block1d16_h8_ssse3):
movq xmm3, [rsi + 5]
; movq xmm7, [rsi + 12]
movq xmm7, [rsi + 13]
;note: same as above
; punpcklbw xmm3, xmm7
punpcklqdq xmm3, xmm7
movdqa xmm1, xmm3
@ -508,9 +721,9 @@ sym(vp9_filter_block1d16_h8_ssse3):
pmaddubsw xmm4, k6k7
paddsw xmm3, xmm1
paddsw xmm3, xmm4
paddsw xmm3, xmm2
paddsw xmm3, krd
paddsw xmm3, xmm4
psraw xmm3, 7
packuswb xmm3, xmm3
punpcklqdq xmm0, xmm3