d63 intra prediction ssse3 using bytes

Byte version of Ronald's d63 ssse3 optimizations
(commit: c5a1c8cf3541cf3665fee981b36d22c9fbd4191e)

Change-Id: Ifd3e6d454a2246085f23eabb38518a930321e807
This commit is contained in:
Scott LaVarnway 2013-09-25 16:16:44 -04:00
parent 682c27239f
commit 208658490c
2 changed files with 156 additions and 6 deletions

View File

@ -53,7 +53,7 @@ prototype void vp9_d45_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const ui
specialize vp9_d45_predictor_4x4 $ssse3_x86inc
prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d63_predictor_4x4
specialize vp9_d63_predictor_4x4 $ssse3_x86inc
prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_h_predictor_4x4 $ssse3_x86inc
@ -92,7 +92,7 @@ prototype void vp9_d45_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const ui
specialize vp9_d45_predictor_8x8 $ssse3_x86inc
prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d63_predictor_8x8
specialize vp9_d63_predictor_8x8 $ssse3_x86inc
prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_h_predictor_8x8 $ssse3_x86inc
@ -131,7 +131,7 @@ prototype void vp9_d45_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_d45_predictor_16x16 $ssse3_x86inc
prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d63_predictor_16x16
specialize vp9_d63_predictor_16x16 $ssse3_x86inc
prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_h_predictor_16x16 $ssse3_x86inc
@ -170,7 +170,7 @@ prototype void vp9_d45_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_d45_predictor_32x32 $ssse3_x86inc
prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d63_predictor_32x32
specialize vp9_d63_predictor_32x32 $ssse3_x86inc
prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_h_predictor_32x32 $ssse3_x86inc

View File

@ -17,8 +17,8 @@ pw_2: times 8 dw 2
pb_7m1: times 8 db 7, -1
pb_15: times 16 db 15
sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7
sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7
sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
@ -305,3 +305,153 @@ cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset
RESTORE_GOT
RET
; ------------------------------------------
; X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 x, y, z, result
;
; Per-byte 3-tap filter, result = (x + 2*y + z + 2) >> 2, done entirely in
; 8-bit lanes (no widening to words).
;
; trick from pascal
; (x+2y+z+2)>>2 can be calculated as:
; result = avg(x,z)            ; pavgb rounds up: (x + z + 1) >> 1
; result -= xor(x,z) & 1       ; removes that round-up -> (x + z) >> 1
; result = avg(result,y)       ; (((x + z) >> 1) + y + 1) >> 1
;
; Operands (all vector registers):
;   %1 = x       read only
;   %2 = y       read only
;   %3 = z       CLOBBERED: holds (x ^ z) & pb_1 on exit
;   %4 = result  written; must not be the same register as %1, %2 or %3,
;                because it is written before those are read for the
;                last time
;
; Reads the pb_1 constant through GLOBAL(), so the caller must have run
; GET_GOT first. pb_1 is defined outside this view; presumably one byte
; of value 1 per lane -- confirm against the constant table.
; ------------------------------------------
%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
; result = avg(x, z), rounded up
pavgb %4, %1, %3
; z = x ^ z  (bit 0 is set exactly when x + z is odd, i.e. when pavgb rounded up)
pxor %3, %1
; keep only that rounding bit
pand %3, [GLOBAL(pb_1)]
; result = (x + z) >> 1, rounded down
psubb %4, %3
; result = avg(result, y), rounded up
pavgb %4, %2
%endmacro
INIT_XMM ssse3
;------------------------------------------------------------------------
; d63 intra predictor, 4x4 block, byte-lane implementation.
;
; Args used: dst, stride, above (the cglobal line declares 3 arguments;
; goffset is a scratch register handed to GET_GOT).
;
; With a[] = the 8 bytes loaded from above, last byte replicated:
;   m3 = avg(a[i], a[i+1])                        -> rows 0 and 2
;   m4 = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2      -> rows 1 and 3
; Rows 2 and 3 are rows 0 and 1 advanced by one byte.
; Only the low 4 bytes of each register are stored per row.
;------------------------------------------------------------------------
cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq                m3, [aboveq]                    ; a[0..7]
  pshufb              m2, m3, [GLOBAL(sh_b12345677)]  ; a[i+1]
  pshufb              m1, m3, [GLOBAL(sh_b23456777)]  ; a[i+2]

  ; m4 = 3-tap result (m1 is clobbered); must run before m3 is overwritten
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4
  pavgb               m3, m2                          ; m3 = 2-tap result

  ; rows 0 and 1
  movd    [dstq        ], m3
  movd    [dstq+strideq], m4

  ; advance both filtered rows by one pixel and step dst two rows down
  psrldq              m4, 1
  psrldq              m3, 1
  lea               dstq, [dstq+strideq*2]

  ; rows 2 and 3
  movd    [dstq        ], m3
  movd    [dstq+strideq], m4

  RESTORE_GOT
  RET
INIT_XMM ssse3
;------------------------------------------------------------------------
; d63 intra predictor, 8x8 block, byte-lane implementation.
;
; Args used: dst, stride, above. Once above has been loaded its register
; is renamed to stride3 (= 3 * stride) via DEFINE_ARGS.
;
; With a[] = the 8 bytes loaded from above, widened to 16 lanes by
; replicating a[7]:
;   m3 = avg(a[i], a[i+1])                        -> even rows
;   m4 = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2      -> odd rows
; Each following pair of rows is the previous pair advanced by one byte.
; Only the low 8 bytes of each register are stored per row.
;------------------------------------------------------------------------
cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset
  GET_GOT     goffsetq

  movq                m3, [aboveq]                    ; last use of aboveq
  DEFINE_ARGS dst, stride, stride3

  pshufb              m0, m3, [GLOBAL(sh_b0123456777777777)]  ; a[i]
  pshufb              m2, m3, [GLOBAL(sh_b1234567777777777)]  ; a[i+1]
  ; mask defined outside this view; by its name it selects a[i+2]
  pshufb              m1, m3, [GLOBAL(sh_b2345677777777777)]
  pshufb              m3, [GLOBAL(sh_b0123456777777777)]      ; a[i] again, in place
  lea           stride3q, [strideq*3]

  ; the 2-tap and 3-tap results use disjoint destination registers,
  ; so their relative order does not matter
  pavgb               m3, m2                          ; m3 = 2-tap result
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4        ; m4 = 3-tap result

  ; rows 0 and 1
  movq  [dstq          ], m3
  movq  [dstq+strideq  ], m4
  psrldq              m4, 1
  psrldq              m3, 1

  ; rows 2 and 3
  movq  [dstq+strideq*2], m3
  movq  [dstq+stride3q ], m4
  psrldq              m4, 1
  psrldq              m3, 1
  lea               dstq, [dstq+strideq*4]

  ; rows 4 and 5
  movq  [dstq          ], m3
  movq  [dstq+strideq  ], m4
  psrldq              m4, 1
  psrldq              m3, 1

  ; rows 6 and 7
  movq  [dstq+strideq*2], m3
  movq  [dstq+stride3q ], m4

  RESTORE_GOT
  RET
INIT_XMM ssse3
;------------------------------------------------------------------------
; d63 intra predictor, 16x16 block, byte-lane implementation.
;
; Args used: dst, stride, above. Once above has been loaded its register
; is renamed to stride3 (= 3 * stride) via DEFINE_ARGS; line is the loop
; counter.
;
;   m0 = avg(a[i], a[i+1])                        -> even rows
;   m4 = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2      -> odd rows
;   m1 = shuffle mask used both to build a[i+1] and to advance the rows
;        by one byte inside the loop (mask is defined outside this view;
;        by its name it replicates the last byte)
;
; NOTE(review): all loads/stores use mova, so above and every dst row are
; presumably 16-byte aligned -- confirm against the callers.
;------------------------------------------------------------------------
cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset
  GET_GOT     goffsetq

  mova                m0, [aboveq]                    ; last use of aboveq
  DEFINE_ARGS dst, stride, stride3, line

  mova                m1, [GLOBAL(sh_b123456789abcdeff)]
  pshufb              m3, m0, m1                              ; a[i+1]
  pshufb              m2, m0, [GLOBAL(sh_b23456789abcdefff)]  ; a[i+2], going by the mask name
  lea           stride3q, [strideq*3]
  mov              lined, 4                           ; 4 passes x 4 rows

  ; m4 = 3-tap result (m2 is clobbered); must run before m0 is overwritten
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4
  pavgb               m0, m3                          ; m0 = 2-tap result

.next_four_rows:
  mova  [dstq          ], m0
  mova  [dstq+strideq  ], m4
  pshufb              m4, m1                          ; advance both rows
  pshufb              m0, m1                          ;   by one byte

  mova  [dstq+strideq*2], m0
  mova  [dstq+stride3q ], m4
  pshufb              m4, m1
  pshufb              m0, m1

  lea               dstq, [dstq+strideq*4]
  dec              lined
  jnz .next_four_rows

  RESTORE_GOT
  REP_RET
INIT_XMM ssse3
;------------------------------------------------------------------------
; d63 intra predictor, 32x32 block, byte-lane implementation.
;
; Args used: dst, stride, above. Once above has been loaded its register
; is renamed to stride3 (= 3 * stride) via DEFINE_ARGS; line is the loop
; counter.
;
; Each row is held as a low/high pair of 16-byte registers:
;   m0 | m7 = avg(a[i], a[i+1])                       -> even rows
;   m2 | m4 = (a[i] + 2*a[i+1] + a[i+2] + 2) >> 2     -> odd rows
;   m1      = shuffle mask that advances a high half by one byte (mask
;             is defined outside this view; by its name it replicates
;             the last byte)
; Low halves are advanced with palignr, pulling the next byte in from the
; matching high half.
;
; NOTE(review): all loads/stores use mova, so above and every dst row are
; presumably 16-byte aligned -- confirm against the callers.
;------------------------------------------------------------------------
cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset
  GET_GOT     goffsetq

  mova                   m0, [aboveq]                 ; a[0..15]
  mova                   m7, [aboveq+16]              ; a[16..31], last use of aboveq
  DEFINE_ARGS dst, stride, stride3, line

  mova                   m1, [GLOBAL(sh_b123456789abcdeff)]
  lea              stride3q, [strideq*3]
  mov                 lined, 8                        ; 8 passes x 4 rows

  ; low-half neighbours; these need the unfiltered m7, so they are taken
  ; before m7 is overwritten below
  palignr                m6, m7, m0, 1                ; a[i+1], low half
  palignr                m5, m7, m0, 2                ; a[i+2], low half

  ; high half
  pshufb                 m3, m7, m1                              ; a[i+1], high half
  pshufb                 m2, m7, [GLOBAL(sh_b23456789abcdefff)]  ; a[i+2], going by the mask name
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4        ; m4 = 3-tap high (m2 clobbered)
  pavgb                  m7, m3                       ; m7 = 2-tap high

  ; low half (m2 is free again and receives the 3-tap result)
  X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2        ; m2 = 3-tap low (m5 clobbered)
  pavgb                  m0, m6                       ; m0 = 2-tap low

.next_four_rows:
  ; rows 4k+0 and 4k+1
  mova  [dstq             ], m0
  mova  [dstq          +16], m7
  mova  [dstq+strideq     ], m2
  mova  [dstq+strideq  +16], m4

  ; advance by one byte: low halves first (they read the old high
  ; halves), then the high halves in place
  palignr                m5, m4, m2, 1
  palignr                m3, m7, m0, 1
  pshufb                 m4, m1
  pshufb                 m7, m1

  ; rows 4k+2 and 4k+3
  mova  [dstq+strideq*2   ], m3
  mova  [dstq+strideq*2+16], m7
  mova  [dstq+stride3q    ], m5
  mova  [dstq+stride3q +16], m4

  ; advance again, landing the low halves back in m0 / m2 for the next pass
  palignr                m2, m4, m5, 1
  palignr                m0, m7, m3, 1
  pshufb                 m4, m1
  pshufb                 m7, m1

  lea                  dstq, [dstq+strideq*4]
  dec                 lined
  jnz .next_four_rows

  RESTORE_GOT
  REP_RET