Merge "Remove usage of predict buffer for decode"
This commit is contained in:
commit
63a77cbed9
@ -45,7 +45,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
|
||||
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_armv6;
|
||||
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_armv6;
|
||||
|
||||
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_v6;
|
||||
rtcd->idct.idct16 = vp8_short_idct4x4llm_v6_dual;
|
||||
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_v6;
|
||||
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_v6;
|
||||
@ -64,9 +63,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
|
||||
rtcd->recon.copy16x16 = vp8_copy_mem16x16_v6;
|
||||
rtcd->recon.copy8x8 = vp8_copy_mem8x8_v6;
|
||||
rtcd->recon.copy8x4 = vp8_copy_mem8x4_v6;
|
||||
rtcd->recon.recon = vp8_recon_b_armv6;
|
||||
rtcd->recon.recon2 = vp8_recon2b_armv6;
|
||||
rtcd->recon.recon4 = vp8_recon4b_armv6;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -82,7 +78,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
|
||||
rtcd->subpix.bilinear8x4 = vp8_bilinear_predict8x4_neon;
|
||||
rtcd->subpix.bilinear4x4 = vp8_bilinear_predict4x4_neon;
|
||||
|
||||
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_neon;
|
||||
rtcd->idct.idct16 = vp8_short_idct4x4llm_neon;
|
||||
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_neon;
|
||||
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_neon;
|
||||
@ -99,10 +94,6 @@ void vp8_arch_arm_common_init(VP8_COMMON *ctx)
|
||||
rtcd->recon.copy16x16 = vp8_copy_mem16x16_neon;
|
||||
rtcd->recon.copy8x8 = vp8_copy_mem8x8_neon;
|
||||
rtcd->recon.copy8x4 = vp8_copy_mem8x4_neon;
|
||||
rtcd->recon.recon = vp8_recon_b_neon;
|
||||
rtcd->recon.recon2 = vp8_recon2b_neon;
|
||||
rtcd->recon.recon4 = vp8_recon4b_neon;
|
||||
rtcd->recon.recon_mb = vp8_recon_mb_neon;
|
||||
rtcd->recon.build_intra_predictors_mby =
|
||||
vp8_build_intra_predictors_mby_neon;
|
||||
rtcd->recon.build_intra_predictors_mby_s =
|
||||
|
@ -11,25 +11,27 @@
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
|
||||
; unsigned char *dst_ptr, int pitch, int stride)
|
||||
;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
|
||||
; int pred_stride, unsigned char *dst_ptr,
|
||||
; int dst_stride)
|
||||
; r0 input_dc
|
||||
; r1 pred_ptr
|
||||
; r2 dest_ptr
|
||||
; r3 pitch
|
||||
; sp stride
|
||||
; r2 pred_stride
|
||||
; r3 dst_ptr
|
||||
; sp dst_stride
|
||||
|
||||
|vp8_dc_only_idct_add_v6| PROC
|
||||
stmdb sp!, {r4 - r7, lr}
|
||||
stmdb sp!, {r4 - r7}
|
||||
|
||||
add r0, r0, #4 ; input_dc += 4
|
||||
ldr r12, c0x0000FFFF
|
||||
ldr r4, [r1], r3
|
||||
ldr r6, [r1], r3
|
||||
ldr r4, [r1], r2
|
||||
and r0, r12, r0, asr #3 ; input_dc >> 3 + mask
|
||||
ldr lr, [sp, #20]
|
||||
ldr r6, [r1], r2
|
||||
orr r0, r0, r0, lsl #16 ; a1 | a1
|
||||
|
||||
ldr r12, [sp, #16] ; dst stride
|
||||
|
||||
uxtab16 r5, r0, r4 ; a1+2 | a1+0
|
||||
uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
|
||||
uxtab16 r7, r0, r6
|
||||
@ -40,10 +42,10 @@
|
||||
usat16 r6, #8, r6
|
||||
orr r5, r5, r4, lsl #8
|
||||
orr r7, r7, r6, lsl #8
|
||||
ldr r4, [r1], r3
|
||||
ldr r4, [r1], r2
|
||||
str r5, [r3], r12
|
||||
ldr r6, [r1]
|
||||
str r5, [r2], lr
|
||||
str r7, [r2], lr
|
||||
str r7, [r3], r12
|
||||
|
||||
uxtab16 r5, r0, r4
|
||||
uxtab16 r4, r0, r4, ror #8
|
||||
@ -55,10 +57,11 @@
|
||||
usat16 r6, #8, r6
|
||||
orr r5, r5, r4, lsl #8
|
||||
orr r7, r7, r6, lsl #8
|
||||
str r5, [r2], lr
|
||||
str r7, [r2]
|
||||
str r5, [r3], r12
|
||||
str r7, [r3]
|
||||
|
||||
ldmia sp!, {r4 - r7, pc}
|
||||
ldmia sp!, {r4 - r7}
|
||||
bx lr
|
||||
|
||||
ENDP ; |vp8_dc_only_idct_add_v6|
|
||||
|
||||
|
@ -9,337 +9,194 @@
|
||||
;
|
||||
|
||||
|
||||
; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
|
||||
EXPORT |vp8_short_idct4x4llm_1_v6|
|
||||
EXPORT |vp8_short_idct4x4llm_v6|
|
||||
EXPORT |vp8_short_idct4x4llm_v6_scott|
|
||||
EXPORT |vp8_short_idct4x4llm_v6_dual|
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
;********************************************************************************
|
||||
;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
|
||||
;* r0 INT16 * input
|
||||
;* r1 INT16 * output
|
||||
;* r2 INT32 pitch
|
||||
;* bench: 3/5
|
||||
;********************************************************************************
|
||||
|
||||
|vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit
|
||||
;
|
||||
ldrsh r0, [r0] ; load input[0] 1, r0 un 2
|
||||
add r0, r0, #4 ; 1 +4
|
||||
stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup
|
||||
mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3
|
||||
pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack
|
||||
mov r5, r4 ; expand expand
|
||||
; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
|
||||
; unsigned char *dst, int stride)
|
||||
; r0 short* input
|
||||
; r1 unsigned char* pred
|
||||
; r2 int pitch
|
||||
; r3 unsigned char* dst
|
||||
; sp int stride
|
||||
|
||||
strd r4, [r1], r2 ; *output = r0, post inc 1
|
||||
strd r4, [r1], r2 ; 1
|
||||
strd r4, [r1], r2 ; 1
|
||||
strd r4, [r1] ; 1
|
||||
;
|
||||
ldmia sp!, {r4, r5, pc} ; replace vars, return restore
|
||||
ENDP ; |vp8_short_idct4x4llm_1_v6|
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
|vp8_short_idct4x4llm_v6_dual| PROC
|
||||
stmdb sp!, {r4-r11, lr}
|
||||
|
||||
;********************************************************************************
|
||||
;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
|
||||
;* r0 INT16 * input
|
||||
;* r1 INT16 * output
|
||||
;* r2 INT32 pitch
|
||||
;* bench:
|
||||
;********************************************************************************
|
||||
sub sp, sp, #4
|
||||
|
||||
|vp8_short_idct4x4llm_v6| PROC ; cycles in out pit
|
||||
;
|
||||
stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
|
||||
;
|
||||
mov r4, #0x00004E00 ; 1 cst
|
||||
orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
|
||||
mov r5, #0x00008A00 ; 1 cst
|
||||
orr r5, r5, #0x0000008C ; sinpi8sqrt2
|
||||
;
|
||||
mov r6, #4 ; i=4 1 i
|
||||
loop1 ;
|
||||
ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4]
|
||||
ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12]
|
||||
ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8]
|
||||
ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0]
|
||||
smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1
|
||||
smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2
|
||||
add r9, r7, r8 ; a1 = [0] + [8] 1 a1
|
||||
sub r7, r7, r8 ; b1 = [0] - [8] 1 b1
|
||||
add r11, r3, r11 ; temp2 1
|
||||
rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1
|
||||
smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2
|
||||
smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1
|
||||
add r8, r7, r11 ; b1 + c1 1 b+c
|
||||
strh r8, [r1, r2] ; out[pitch] = b1+c1 1
|
||||
sub r7, r7, r11 ; b1 - c1 1 b-c
|
||||
add r10, r12, r10 ; temp1 1
|
||||
add r3, r10, r3 ; d1 = temp1 + temp2 1 d1
|
||||
add r10, r9, r3 ; a1 + d1 1 a+d
|
||||
sub r3, r9, r3 ; a1 - d1 1 a-d
|
||||
add r8, r2, r2 ; pitch * 2 1 p*2
|
||||
strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1
|
||||
add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3
|
||||
strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1
|
||||
subs r6, r6, #1 ; i-- 1 --
|
||||
strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++
|
||||
bne loop1 ; if i>0, continue
|
||||
;
|
||||
sub r1, r1, #8 ; set up out for next loop 1 -4
|
||||
; for this iteration, input=prev output
|
||||
mov r6, #4 ; i=4 1 i
|
||||
; b returnfull
|
||||
loop2 ;
|
||||
ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1]
|
||||
ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3]
|
||||
ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2]
|
||||
ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0]
|
||||
smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1
|
||||
smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2
|
||||
add r7, r0, r3 ; a1 = [0] + [2] 1 a1
|
||||
sub r0, r0, r3 ; b1 = [0] - [2] 1 b1
|
||||
add r10, r8, r10 ; temp2 1
|
||||
rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1
|
||||
smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2
|
||||
smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1
|
||||
add r3, r0, r9 ; b1+c1 1 b+c
|
||||
add r3, r3, #4 ; b1+c1+4 1 +4
|
||||
add r10, r11, r10 ; temp1 1
|
||||
mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3
|
||||
strh r3, [r1, #2] ; out[1] = b1+c1 1
|
||||
add r10, r10, r8 ; d1 = temp1 + temp2 1 d1
|
||||
add r3, r7, r10 ; a1+d1 1 a+d
|
||||
add r3, r3, #4 ; a1+d1+4 1 +4
|
||||
sub r7, r7, r10 ; a1-d1 1 a-d
|
||||
add r7, r7, #4 ; a1-d1+4 1 +4
|
||||
mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3
|
||||
mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3
|
||||
strh r7, [r1, #6] ; out[3] = a1-d1 1
|
||||
sub r0, r0, r9 ; b1-c1 1 b-c
|
||||
add r0, r0, #4 ; b1-c1+4 1 +4
|
||||
subs r6, r6, #1 ; i-- 1 --
|
||||
mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3
|
||||
strh r0, [r1, #4] ; out[2] = b1-c1 1
|
||||
strh r3, [r1], r2 ; out[0] = a1+d1 1
|
||||
; add r1, r1, r2 ; out += pitch 1 ++
|
||||
bne loop2 ; if i>0, continue
|
||||
returnfull ;
|
||||
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
|
||||
ENDP
|
||||
mov r4, #0x00008A00 ; sin
|
||||
orr r4, r4, #0x0000008C ; sinpi8sqrt2
|
||||
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
mov r5, #0x00004E00 ; cos
|
||||
orr r5, r5, #0x0000007B ; cospi8sqrt2minus1
|
||||
orr r5, r5, #1<<31 ; loop counter on top bit
|
||||
|
||||
;********************************************************************************
|
||||
;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
|
||||
;* r0 INT16 * input
|
||||
;* r1 INT16 * output
|
||||
;* r2 INT32 pitch
|
||||
;* bench:
|
||||
;********************************************************************************
|
||||
|
||||
|vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit
|
||||
; mov r0, #0 ;
|
||||
; ldr r0, [r0] ;
|
||||
stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup
|
||||
;
|
||||
mov r3, #0x00004E00 ; cos
|
||||
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
|
||||
mov r4, #0x00008A00 ; sin
|
||||
orr r4, r4, #0x0000008C ; sinpi8sqrt2
|
||||
;
|
||||
mov r5, #0x2 ; i i
|
||||
;
|
||||
short_idct4x4llm_v6_scott_loop1 ;
|
||||
ldr r10, [r0, #(4*2)] ; i5 | i4 5,4
|
||||
ldr r11, [r0, #(12*2)] ; i13 | i12 13,12
|
||||
;
|
||||
smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1
|
||||
smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2
|
||||
;
|
||||
smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2
|
||||
smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1
|
||||
;
|
||||
add r6, r6, r7 ; partial c1 lt1-lt2
|
||||
add r12, r12, r14 ; partial d1 l2t2+l2t1
|
||||
;
|
||||
smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1
|
||||
smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2
|
||||
;
|
||||
smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1
|
||||
smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2
|
||||
;
|
||||
add r7, r14, r7 ; partial c1_2 ht1+ht2
|
||||
sub r8, r8, r9 ; partial d1_2 h2t1-h2t2
|
||||
;
|
||||
pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack
|
||||
pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack
|
||||
;
|
||||
usub16 r6, r6, r10 ; c1_2 | c1_1 c
|
||||
uadd16 r12, r12, r11 ; d1_2 | d1_1 d
|
||||
;
|
||||
ldr r10, [r0, #0] ; i1 | i0 1,0
|
||||
ldr r11, [r0, #(8*2)] ; i9 | i10 9,10
|
||||
;
|
||||
;;;;;; add r0, r0, #0x4 ; +4
|
||||
;;;;;; add r1, r1, #0x4 ; +4
|
||||
;
|
||||
uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a
|
||||
usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b
|
||||
;
|
||||
uadd16 r7, r8, r12 ; a1 + d1 pair a+d
|
||||
usub16 r14, r8, r12 ; a1 - d1 pair a-d
|
||||
;
|
||||
str r7, [r1] ; op[0] = a1 + d1
|
||||
str r14, [r1, r2] ; op[pitch*3] = a1 - d1
|
||||
;
|
||||
add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++
|
||||
add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++
|
||||
;
|
||||
subs r5, r5, #0x1 ; --
|
||||
bne short_idct4x4llm_v6_scott_loop1 ;
|
||||
;
|
||||
sub r1, r1, #16 ; reset output ptr
|
||||
mov r5, #0x4 ;
|
||||
mov r0, r1 ; input = output
|
||||
;
|
||||
short_idct4x4llm_v6_scott_loop2 ;
|
||||
;
|
||||
subs r5, r5, #0x1 ;
|
||||
bne short_idct4x4llm_v6_scott_loop2 ;
|
||||
;
|
||||
ldmia sp!, {r4 - r11, pc} ;
|
||||
ENDP ;
|
||||
;
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
;********************************************************************************
|
||||
|
||||
;********************************************************************************
|
||||
;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
|
||||
;* r0 INT16 * input
|
||||
;* r1 INT16 * output
|
||||
;* r2 INT32 pitch
|
||||
;* bench:
|
||||
;********************************************************************************
|
||||
|
||||
|vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit
|
||||
;
|
||||
stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
|
||||
mov r3, #0x00004E00 ; cos
|
||||
orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
|
||||
mov r4, #0x00008A00 ; sin
|
||||
orr r4, r4, #0x0000008C ; sinpi8sqrt2
|
||||
mov r5, #0x2 ; i=2 i
|
||||
loop1_dual
|
||||
ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
|
||||
ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
|
||||
ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
|
||||
ldr r6, [r0, #(4*2)] ; i5 | i4
|
||||
ldr r12, [r0, #(12*2)] ; i13|i12
|
||||
ldr r14, [r0, #(8*2)] ; i9 | i8
|
||||
|
||||
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
|
||||
smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
|
||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
|
||||
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
|
||||
pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
|
||||
smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
|
||||
smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
|
||||
smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
|
||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
|
||||
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
|
||||
|
||||
smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
|
||||
pkhtb r7, r9, r7, asr #16 ; 5c | 4c
|
||||
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
|
||||
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
|
||||
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
|
||||
smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
|
||||
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
|
||||
subs r5, r5, #0x1 ; i-- --
|
||||
pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
|
||||
ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
|
||||
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
|
||||
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
|
||||
usub16 r7, r8, r7 ; c c
|
||||
uadd16 r6, r6, r10 ; d d
|
||||
uadd16 r10, r11, r14 ; a a
|
||||
usub16 r8, r11, r14 ; b b
|
||||
uadd16 r9, r10, r6 ; a+d a+d
|
||||
usub16 r10, r10, r6 ; a-d a-d
|
||||
uadd16 r6, r8, r7 ; b+c b+c
|
||||
usub16 r7, r8, r7 ; b-c b-c
|
||||
str r6, [r1, r2] ; o5 | o4
|
||||
add r6, r2, r2 ; pitch * 2 p2
|
||||
str r7, [r1, r6] ; o9 | o8
|
||||
add r6, r6, r2 ; pitch * 3 p3
|
||||
str r10, [r1, r6] ; o13 | o12
|
||||
str r9, [r1], #0x4 ; o1 | o0 ++
|
||||
bne loop1_dual ;
|
||||
mov r5, #0x2 ; i=2 i
|
||||
sub r0, r1, #8 ; reset input/output i/o
|
||||
loop2_dual
|
||||
ldr r6, [r0, r2] ; i5 | i4 5|4
|
||||
ldr r1, [r0] ; i1 | i0 1|0
|
||||
ldr r12, [r0, #0x4] ; i3 | i2 3|2
|
||||
add r14, r2, #0x4 ; pitch + 2 p+2
|
||||
ldr r14, [r0, r14] ; i7 | i6 7|6
|
||||
smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
|
||||
smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
|
||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
|
||||
smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
|
||||
pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
|
||||
pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
|
||||
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1
|
||||
pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
|
||||
uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
|
||||
pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
|
||||
uadd16 r10, r11, r9 ; a a
|
||||
usub16 r9, r11, r9 ; b b
|
||||
pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
|
||||
subs r5, r5, #0x1 ; i-- --
|
||||
smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
|
||||
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
|
||||
smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
|
||||
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
|
||||
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
|
||||
|
||||
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
|
||||
smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
|
||||
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
|
||||
|
||||
subs r5, r5, #1<<31 ; i--
|
||||
|
||||
pkhtb r9, r11, r9, asr #16 ; 13c | 12c
|
||||
ldr r11, [r0] ; i1 | i0
|
||||
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
|
||||
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
|
||||
|
||||
usub16 r7, r8, r7 ; c
|
||||
uadd16 r6, r6, r10 ; d
|
||||
uadd16 r10, r11, r14 ; a
|
||||
usub16 r8, r11, r14 ; b
|
||||
|
||||
uadd16 r9, r10, r6 ; a+d
|
||||
usub16 r10, r10, r6 ; a-d
|
||||
uadd16 r6, r8, r7 ; b+c
|
||||
usub16 r7, r8, r7 ; b-c
|
||||
|
||||
; use input buffer to store intermediate results
|
||||
str r6, [r0, #(4*2)] ; o5 | o4
|
||||
str r7, [r0, #(8*2)] ; o9 | o8
|
||||
str r10,[r0, #(12*2)] ; o13|o12
|
||||
str r9, [r0], #4 ; o1 | o0
|
||||
|
||||
bcs loop1_dual
|
||||
|
||||
sub r0, r0, #8 ; reset input/output
|
||||
str r0, [sp]
|
||||
|
||||
loop2_dual
|
||||
|
||||
ldr r6, [r0, #(4*2)] ; i5 | i4
|
||||
ldr r12,[r0, #(2*2)] ; i3 | i2
|
||||
ldr r14,[r0, #(6*2)] ; i7 | i6
|
||||
ldr r0, [r0, #(0*2)] ; i1 | i0
|
||||
|
||||
smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
|
||||
smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16
|
||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
|
||||
smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16
|
||||
|
||||
pkhbt r11, r6, r0, lsl #16 ; i0 | i4
|
||||
pkhtb r7, r7, r9, asr #16 ; 1c | 5c
|
||||
pkhtb r0, r0, r6, asr #16 ; i1 | i5
|
||||
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1
|
||||
|
||||
uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2
|
||||
pkhbt r9, r14, r12, lsl #16 ; i2 | i6
|
||||
uadd16 r10, r11, r9 ; a
|
||||
usub16 r9, r11, r9 ; b
|
||||
pkhtb r6, r12, r14, asr #16 ; i3 | i7
|
||||
|
||||
subs r5, r5, #1<<31 ; i--
|
||||
|
||||
smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16
|
||||
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16
|
||||
smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16
|
||||
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16
|
||||
|
||||
pkhtb r7, r7, r12, asr #16 ; 3c | 7c
|
||||
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1
|
||||
|
||||
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2
|
||||
usub16 r12, r8, r6 ; c (o1 | o5)
|
||||
uadd16 r6, r11, r0 ; d (o3 | o7)
|
||||
uadd16 r7, r10, r6 ; a+d
|
||||
|
||||
mov r8, #4 ; set up 4's
|
||||
orr r8, r8, #0x40000 ; 4|4
|
||||
|
||||
usub16 r6, r10, r6 ; a-d
|
||||
uadd16 r6, r6, r8 ; a-d+4, 3|7
|
||||
uadd16 r7, r7, r8 ; a+d+4, 0|4
|
||||
uadd16 r10, r9, r12 ; b+c
|
||||
usub16 r0, r9, r12 ; b-c
|
||||
uadd16 r10, r10, r8 ; b+c+4, 1|5
|
||||
uadd16 r8, r0, r8 ; b-c+4, 2|6
|
||||
|
||||
ldr lr, [sp, #40] ; dst stride
|
||||
|
||||
ldrb r0, [r1] ; pred p0
|
||||
ldrb r11, [r1, #1] ; pred p1
|
||||
ldrb r12, [r1, #2] ; pred p2
|
||||
|
||||
add r0, r0, r7, asr #19 ; p0 + o0
|
||||
add r11, r11, r10, asr #19 ; p1 + o1
|
||||
add r12, r12, r8, asr #19 ; p2 + o2
|
||||
|
||||
usat r0, #8, r0 ; d0 = clip8(p0 + o0)
|
||||
usat r11, #8, r11 ; d1 = clip8(p1 + o1)
|
||||
usat r12, #8, r12 ; d2 = clip8(p2 + o2)
|
||||
|
||||
add r0, r0, r11, lsl #8 ; |--|--|d1|d0|
|
||||
|
||||
ldrb r11, [r1, #3] ; pred p3
|
||||
|
||||
add r0, r0, r12, lsl #16 ; |--|d2|d1|d0|
|
||||
|
||||
add r11, r11, r6, asr #19 ; p3 + o3
|
||||
|
||||
sxth r7, r7 ;
|
||||
sxth r10, r10 ;
|
||||
|
||||
usat r11, #8, r11 ; d3 = clip8(p3 + o3)
|
||||
|
||||
sxth r8, r8 ;
|
||||
sxth r6, r6 ;
|
||||
|
||||
add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0|
|
||||
|
||||
ldrb r12, [r1, r2]! ; pred p4
|
||||
str r0, [r3], lr
|
||||
ldrb r11, [r1, #1] ; pred p5
|
||||
|
||||
add r12, r12, r7, asr #3 ; p4 + o4
|
||||
add r11, r11, r10, asr #3 ; p5 + o5
|
||||
|
||||
usat r12, #8, r12 ; d4 = clip8(p4 + o4)
|
||||
usat r11, #8, r11 ; d5 = clip8(p5 + o5)
|
||||
|
||||
ldrb r7, [r1, #2] ; pred p6
|
||||
ldrb r10, [r1, #3] ; pred p6
|
||||
|
||||
add r12, r12, r11, lsl #8 ; |--|--|d5|d4|
|
||||
|
||||
add r7, r7, r8, asr #3 ; p6 + o6
|
||||
add r10, r10, r6, asr #3 ; p7 + o7
|
||||
|
||||
ldr r0, [sp] ; load input pointer
|
||||
|
||||
usat r7, #8, r7 ; d6 = clip8(p6 + o6)
|
||||
usat r10, #8, r10 ; d7 = clip8(p7 + o7)
|
||||
|
||||
add r12, r12, r7, lsl #16 ; |--|d6|d5|d4|
|
||||
add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4|
|
||||
|
||||
str r12, [r3], lr
|
||||
add r0, r0, #16
|
||||
add r1, r1, r2 ; pred + pitch
|
||||
|
||||
bcs loop2_dual
|
||||
|
||||
add sp, sp, #4 ; idct_output buffer
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
|
||||
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
|
||||
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
|
||||
usub16 r12, r8, r6 ; c (o1 | o5) c
|
||||
uadd16 r6, r11, r1 ; d (o3 | o7) d
|
||||
uadd16 r7, r10, r6 ; a+d a+d
|
||||
mov r8, #0x4 ; set up 4's 4
|
||||
orr r8, r8, #0x40000 ; 4|4
|
||||
usub16 r6, r10, r6 ; a-d a-d
|
||||
uadd16 r6, r6, r8 ; a-d+4 3|7
|
||||
uadd16 r7, r7, r8 ; a+d+4 0|4
|
||||
uadd16 r10, r9, r12 ; b+c b+c
|
||||
usub16 r1, r9, r12 ; b-c b-c
|
||||
uadd16 r10, r10, r8 ; b+c+4 1|5
|
||||
uadd16 r1, r1, r8 ; b-c+4 2|6
|
||||
mov r8, r10, asr #19 ; o1 >> 3
|
||||
strh r8, [r0, #2] ; o1
|
||||
mov r8, r1, asr #19 ; o2 >> 3
|
||||
strh r8, [r0, #4] ; o2
|
||||
mov r8, r6, asr #19 ; o3 >> 3
|
||||
strh r8, [r0, #6] ; o3
|
||||
mov r8, r7, asr #19 ; o0 >> 3
|
||||
strh r8, [r0], r2 ; o0 +p
|
||||
sxth r10, r10 ;
|
||||
mov r8, r10, asr #3 ; o5 >> 3
|
||||
strh r8, [r0, #2] ; o5
|
||||
sxth r1, r1 ;
|
||||
mov r8, r1, asr #3 ; o6 >> 3
|
||||
strh r8, [r0, #4] ; o6
|
||||
sxth r6, r6 ;
|
||||
mov r8, r6, asr #3 ; o7 >> 3
|
||||
strh r8, [r0, #6] ; o7
|
||||
sxth r7, r7 ;
|
||||
mov r8, r7, asr #3 ; o4 >> 3
|
||||
strh r8, [r0], r2 ; o4 +p
|
||||
;;;;; subs r5, r5, #0x1 ; i-- --
|
||||
bne loop2_dual ;
|
||||
;
|
||||
ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
|
||||
ENDP
|
||||
|
||||
END
|
||||
|
@ -1,281 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_recon_b_armv6|
|
||||
EXPORT |vp8_recon2b_armv6|
|
||||
EXPORT |vp8_recon4b_armv6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
prd RN r0
|
||||
dif RN r1
|
||||
dst RN r2
|
||||
stride RN r3
|
||||
|
||||
;void recon_b(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int stride)
|
||||
; R0 char* pred_ptr
|
||||
; R1 short * dif_ptr
|
||||
; R2 char * dst_ptr
|
||||
; R3 int stride
|
||||
|
||||
; Description:
|
||||
; Loop through the block adding the Pred and Diff together. Clamp and then
|
||||
; store back into the Dst.
|
||||
|
||||
; Restrictions :
|
||||
; all buffers are expected to be 4 byte aligned coming in and
|
||||
; going out.
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
;
|
||||
;
|
||||
;
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
|vp8_recon_b_armv6| PROC
|
||||
stmdb sp!, {r4 - r9, lr}
|
||||
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
|
||||
ldr r6, [dif, #0] ; 1 | 0
|
||||
ldr r7, [dif, #4] ; 3 | 2
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
|
||||
pkhtb r9, r7, r6, asr #16 ; 3 | 1
|
||||
|
||||
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
|
||||
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
|
||||
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
add dif, dif, #32
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst], stride
|
||||
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
|
||||
;; ldr r6, [dif, #8] ; 1 | 0
|
||||
;; ldr r7, [dif, #12] ; 3 | 2
|
||||
ldr r6, [dif, #0] ; 1 | 0
|
||||
ldr r7, [dif, #4] ; 3 | 2
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
|
||||
pkhtb r9, r7, r6, asr #16 ; 3 | 1
|
||||
|
||||
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
|
||||
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
|
||||
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
add dif, dif, #32
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst], stride
|
||||
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
|
||||
;; ldr r6, [dif, #16] ; 1 | 0
|
||||
;; ldr r7, [dif, #20] ; 3 | 2
|
||||
ldr r6, [dif, #0] ; 1 | 0
|
||||
ldr r7, [dif, #4] ; 3 | 2
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
|
||||
pkhtb r9, r7, r6, asr #16 ; 3 | 1
|
||||
|
||||
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
|
||||
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
|
||||
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
add dif, dif, #32
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst], stride
|
||||
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #16 ; 3 | 2 | 1 | 0
|
||||
;; ldr r6, [dif, #24] ; 1 | 0
|
||||
;; ldr r7, [dif, #28] ; 3 | 2
|
||||
ldr r6, [dif, #0] ; 1 | 0
|
||||
ldr r7, [dif, #4] ; 3 | 2
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
|
||||
pkhtb r9, r7, r6, asr #16 ; 3 | 1
|
||||
|
||||
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
|
||||
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
|
||||
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst], stride
|
||||
|
||||
ldmia sp!, {r4 - r9, pc}
|
||||
|
||||
ENDP ; |recon_b|
|
||||
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
;
|
||||
;
|
||||
;
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
; R0 char *pred_ptr
|
||||
; R1 short *dif_ptr
|
||||
; R2 char *dst_ptr
|
||||
; R3 int stride
|
||||
|vp8_recon4b_armv6| PROC
|
||||
stmdb sp!, {r4 - r9, lr}
|
||||
|
||||
mov lr, #4
|
||||
|
||||
recon4b_loop
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #4 ; 3 | 2 | 1 | 0
|
||||
ldr r6, [dif, #0] ; 1 | 0
|
||||
ldr r7, [dif, #4] ; 3 | 2
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; 2 | 0
|
||||
pkhtb r9, r7, r6, asr #16 ; 3 | 1
|
||||
|
||||
uxtab16 r8, r8, r4 ; 2 | 0 + 3 | 2 | 2 | 0
|
||||
uxtab16 r9, r9, r4, ror #8 ; 3 | 1 + 0 | 3 | 2 | 1
|
||||
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst]
|
||||
|
||||
;4, 5, 6, 7
|
||||
ldr r4, [prd], #4
|
||||
;; ldr r6, [dif, #32]
|
||||
;; ldr r7, [dif, #36]
|
||||
ldr r6, [dif, #8]
|
||||
ldr r7, [dif, #12]
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16
|
||||
pkhtb r9, r7, r6, asr #16
|
||||
|
||||
uxtab16 r8, r8, r4
|
||||
uxtab16 r9, r9, r4, ror #8
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst, #4]
|
||||
|
||||
;8, 9, 10, 11
|
||||
ldr r4, [prd], #4
|
||||
;; ldr r6, [dif, #64]
|
||||
;; ldr r7, [dif, #68]
|
||||
ldr r6, [dif, #16]
|
||||
ldr r7, [dif, #20]
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16
|
||||
pkhtb r9, r7, r6, asr #16
|
||||
|
||||
uxtab16 r8, r8, r4
|
||||
uxtab16 r9, r9, r4, ror #8
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst, #8]
|
||||
|
||||
;12, 13, 14, 15
|
||||
ldr r4, [prd], #4
|
||||
;; ldr r6, [dif, #96]
|
||||
;; ldr r7, [dif, #100]
|
||||
ldr r6, [dif, #24]
|
||||
ldr r7, [dif, #28]
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16
|
||||
pkhtb r9, r7, r6, asr #16
|
||||
|
||||
uxtab16 r8, r8, r4
|
||||
uxtab16 r9, r9, r4, ror #8
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst, #12]
|
||||
|
||||
add dst, dst, stride
|
||||
;; add dif, dif, #8
|
||||
add dif, dif, #32
|
||||
|
||||
subs lr, lr, #1
|
||||
bne recon4b_loop
|
||||
|
||||
ldmia sp!, {r4 - r9, pc}
|
||||
|
||||
ENDP ; |Recon4B|
|
||||
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
;
|
||||
;
|
||||
;
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
; R0 char *pred_ptr
|
||||
; R1 short *dif_ptr
|
||||
; R2 char *dst_ptr
|
||||
; R3 int stride
|
||||
|vp8_recon2b_armv6| PROC
|
||||
stmdb sp!, {r4 - r9, lr}
|
||||
|
||||
mov lr, #4
|
||||
|
||||
recon2b_loop
|
||||
;0, 1, 2, 3
|
||||
ldr r4, [prd], #4
|
||||
ldr r6, [dif, #0]
|
||||
ldr r7, [dif, #4]
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16
|
||||
pkhtb r9, r7, r6, asr #16
|
||||
|
||||
uxtab16 r8, r8, r4
|
||||
uxtab16 r9, r9, r4, ror #8
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst]
|
||||
|
||||
;4, 5, 6, 7
|
||||
ldr r4, [prd], #4
|
||||
;; ldr r6, [dif, #32]
|
||||
;; ldr r7, [dif, #36]
|
||||
ldr r6, [dif, #8]
|
||||
ldr r7, [dif, #12]
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16
|
||||
pkhtb r9, r7, r6, asr #16
|
||||
|
||||
uxtab16 r8, r8, r4
|
||||
uxtab16 r9, r9, r4, ror #8
|
||||
usat16 r8, #8, r8
|
||||
usat16 r9, #8, r9
|
||||
orr r8, r8, r9, lsl #8
|
||||
|
||||
str r8, [dst, #4]
|
||||
|
||||
add dst, dst, stride
|
||||
;; add dif, dif, #8
|
||||
add dif, dif, #16
|
||||
|
||||
subs lr, lr, #1
|
||||
bne recon2b_loop
|
||||
|
||||
ldmia sp!, {r4 - r9, pc}
|
||||
|
||||
ENDP ; |Recon2B|
|
||||
|
||||
END
|
@ -13,16 +13,12 @@
|
||||
#define IDCT_ARM_H
|
||||
|
||||
#if HAVE_ARMV6
|
||||
extern prototype_idct(vp8_short_idct4x4llm_1_v6);
|
||||
extern prototype_idct(vp8_short_idct4x4llm_v6_dual);
|
||||
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_v6);
|
||||
extern prototype_second_order(vp8_short_inv_walsh4x4_1_v6);
|
||||
extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_idct_idct1
|
||||
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_v6
|
||||
|
||||
#undef vp8_idct_idct16
|
||||
#define vp8_idct_idct16 vp8_short_idct4x4llm_v6_dual
|
||||
|
||||
@ -38,16 +34,12 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_v6);
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
extern prototype_idct(vp8_short_idct4x4llm_1_neon);
|
||||
extern prototype_idct(vp8_short_idct4x4llm_neon);
|
||||
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_neon);
|
||||
extern prototype_second_order(vp8_short_inv_walsh4x4_1_neon);
|
||||
extern prototype_second_order(vp8_short_inv_walsh4x4_neon);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_idct_idct1
|
||||
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_neon
|
||||
|
||||
#undef vp8_idct_idct16
|
||||
#define vp8_idct_idct16 vp8_short_idct4x4llm_neon
|
||||
|
||||
|
@ -14,22 +14,26 @@
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void vp8_dc_only_idct_add_neon(short input_dc, unsigned char *pred_ptr,
|
||||
; unsigned char *dst_ptr, int pitch, int stride)
|
||||
|
||||
;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
|
||||
; int pred_stride, unsigned char *dst_ptr,
|
||||
; int dst_stride)
|
||||
|
||||
; r0 input_dc
|
||||
; r1 pred_ptr
|
||||
; r2 dst_ptr
|
||||
; r3 pitch
|
||||
; sp stride
|
||||
; r2 pred_stride
|
||||
; r3 dst_ptr
|
||||
; sp dst_stride
|
||||
|
||||
|vp8_dc_only_idct_add_neon| PROC
|
||||
add r0, r0, #4
|
||||
asr r0, r0, #3
|
||||
ldr r12, [sp]
|
||||
vdup.16 q0, r0
|
||||
|
||||
vld1.32 {d2[0]}, [r1], r3
|
||||
vld1.32 {d2[1]}, [r1], r3
|
||||
vld1.32 {d4[0]}, [r1], r3
|
||||
vld1.32 {d2[0]}, [r1], r2
|
||||
vld1.32 {d2[1]}, [r1], r2
|
||||
vld1.32 {d4[0]}, [r1], r2
|
||||
vld1.32 {d4[1]}, [r1]
|
||||
|
||||
vaddw.u8 q1, q0, d2
|
||||
@ -38,12 +42,13 @@
|
||||
vqmovun.s16 d2, q1
|
||||
vqmovun.s16 d4, q2
|
||||
|
||||
vst1.32 {d2[0]}, [r2], r12
|
||||
vst1.32 {d2[1]}, [r2], r12
|
||||
vst1.32 {d4[0]}, [r2], r12
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
|
||||
bx lr
|
||||
vst1.32 {d2[0]}, [r3], r12
|
||||
vst1.32 {d2[1]}, [r3], r12
|
||||
vst1.32 {d4[0]}, [r3], r12
|
||||
vst1.32 {d4[1]}, [r3]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
||||
|
@ -1,131 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_recon16x16mb_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *pred_ptr,
|
||||
; r1 short *diff_ptr,
|
||||
; r2 unsigned char *dst_ptr,
|
||||
; r3 int ystride,
|
||||
; stack unsigned char *udst_ptr,
|
||||
; stack unsigned char *vdst_ptr
|
||||
|
||||
; Reconstructs one macroblock: widens the 8-bit prediction read from r0 to
; 16 bits, adds the 16-bit residual read from r1, saturates each sum back to
; an unsigned byte and stores it.
;   Luma   : 4 passes, each storing 4 rows of 16 bytes through r2 (dst_ptr)
;            with a row step of r3 (ystride).
;   Chroma : 2 passes, each storing 8 rows of 8 bytes with a row step of
;            ystride>>1; the first pass writes to udst_ptr, the second to
;            vdst_ptr (both taken from the stack).
; r0 and r1 are only ever advanced by post-increment, so prediction and
; residual are consumed linearly across all six passes.
; NOTE(review): q4-q7 (d8-d15) are written here without being saved or
; restored; AAPCS lists d8-d15 as callee-saved -- confirm this is acceptable
; for every caller.
|vp8_recon16x16mb_neon| PROC
|
||||
mov r12, #4 ;loop counter for Y loop
|
||||
|
||||
recon16x16mb_loop_y
|
||||
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
|
||||
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
|
||||
vld1.u8 {q14, q15}, [r0]!
|
||||
vld1.16 {q10, q11}, [r1]!
|
||||
|
||||
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
|
||||
vmovl.u8 q1, d25
|
||||
vmovl.u8 q2, d26
|
||||
vmovl.u8 q3, d27
|
||||
vmovl.u8 q4, d28
|
||||
vmovl.u8 q5, d29
|
||||
vmovl.u8 q6, d30
|
||||
vld1.16 {q12, q13}, [r1]! ;q12/q13 already widened above, reuse them for diff data
|
||||
vmovl.u8 q7, d31
|
||||
vld1.16 {q14, q15}, [r1]! ;q14/q15 already widened above, reuse them for diff data
|
||||
|
||||
pld [r0]
|
||||
pld [r1]
|
||||
pld [r1, #64]
|
||||
|
||||
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
|
||||
vadd.s16 q1, q1, q9
|
||||
vadd.s16 q2, q2, q10
|
||||
vadd.s16 q3, q3, q11
|
||||
vadd.s16 q4, q4, q12
|
||||
vadd.s16 q5, q5, q13
|
||||
vadd.s16 q6, q6, q14
|
||||
vadd.s16 q7, q7, q15
|
||||
|
||||
vqmovun.s16 d0, q0 ;CLAMP() saturation
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
vqmovun.s16 d3, q3
|
||||
vqmovun.s16 d4, q4
|
||||
vqmovun.s16 d5, q5
|
||||
vst1.u8 {q0}, [r2], r3 ;store result
|
||||
vqmovun.s16 d6, q6
|
||||
vst1.u8 {q1}, [r2], r3
|
||||
vqmovun.s16 d7, q7
|
||||
vst1.u8 {q2}, [r2], r3
|
||||
subs r12, r12, #1
|
||||
|
||||
moveq r12, #2 ;loop counter for UV loop (set only once the Y counter hits zero)
|
||||
|
||||
vst1.u8 {q3}, [r2], r3
|
||||
bne recon16x16mb_loop_y
|
||||
|
||||
mov r3, r3, lsr #1 ;uv_stride = ystride>>1
|
||||
ldr r2, [sp] ;load udst_ptr (first stack argument)
|
||||
|
||||
recon16x16mb_loop_uv
|
||||
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
|
||||
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
|
||||
vld1.u8 {q14, q15}, [r0]!
|
||||
vld1.16 {q10, q11}, [r1]!
|
||||
|
||||
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
|
||||
vmovl.u8 q1, d25
|
||||
vmovl.u8 q2, d26
|
||||
vmovl.u8 q3, d27
|
||||
vmovl.u8 q4, d28
|
||||
vmovl.u8 q5, d29
|
||||
vmovl.u8 q6, d30
|
||||
vld1.16 {q12, q13}, [r1]!
|
||||
vmovl.u8 q7, d31
|
||||
vld1.16 {q14, q15}, [r1]!
|
||||
|
||||
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
|
||||
vadd.s16 q1, q1, q9
|
||||
vadd.s16 q2, q2, q10
|
||||
vadd.s16 q3, q3, q11
|
||||
vadd.s16 q4, q4, q12
|
||||
vadd.s16 q5, q5, q13
|
||||
vadd.s16 q6, q6, q14
|
||||
|
||||
vqmovun.s16 d0, q0 ;CLAMP() saturation
|
||||
vadd.s16 q7, q7, q15
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
vqmovun.s16 d3, q3
|
||||
vst1.u8 {d0}, [r2], r3 ;store result
|
||||
vqmovun.s16 d4, q4
|
||||
vst1.u8 {d1}, [r2], r3
|
||||
vqmovun.s16 d5, q5
|
||||
vst1.u8 {d2}, [r2], r3
|
||||
vqmovun.s16 d6, q6
|
||||
vst1.u8 {d3}, [r2], r3
|
||||
vqmovun.s16 d7, q7
|
||||
vst1.u8 {d4}, [r2], r3
|
||||
subs r12, r12, #1
|
||||
|
||||
vst1.u8 {d5}, [r2], r3
|
||||
vst1.u8 {d6}, [r2], r3
|
||||
vst1.u8 {d7}, [r2], r3
|
||||
|
||||
ldrne r2, [sp, #4] ;load vdst_ptr (second stack argument) for the second pass
|
||||
bne recon16x16mb_loop_uv
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
END
|
@ -1,54 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_recon2b_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *pred_ptr,
|
||||
; r1 short *diff_ptr,
|
||||
; r2 unsigned char *dst_ptr,
|
||||
; r3 int stride
|
||||
|
||||
; Reconstructs an area of 4 rows x 8 pixels: reads 32 contiguous prediction
; bytes from r0 and 32 contiguous residual shorts from r1, adds them, saturates
; each sum to an unsigned byte and stores one 8-byte row per d-register at
; dst_ptr, dst_ptr+stride, dst_ptr+2*stride and dst_ptr+3*stride.
; NOTE(review): q4-q7 (d8-d15) are written here without being saved or
; restored; AAPCS lists d8-d15 as callee-saved -- confirm this is acceptable
; for every caller.
|vp8_recon2b_neon| PROC
|
||||
vld1.u8 {q8, q9}, [r0] ;load data from pred_ptr
|
||||
vld1.16 {q4, q5}, [r1]! ;load data from diff_ptr
|
||||
|
||||
vmovl.u8 q0, d16 ;modify Pred data from 8 bits to 16 bits
|
||||
vld1.16 {q6, q7}, [r1]!
|
||||
vmovl.u8 q1, d17
|
||||
vmovl.u8 q2, d18
|
||||
vmovl.u8 q3, d19
|
||||
|
||||
vadd.s16 q0, q0, q4 ;add Diff data and Pred data together
|
||||
vadd.s16 q1, q1, q5
|
||||
vadd.s16 q2, q2, q6
|
||||
vadd.s16 q3, q3, q7
|
||||
|
||||
vqmovun.s16 d0, q0 ;CLAMP() saturation
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
vqmovun.s16 d3, q3
|
||||
add r0, r2, r3 ;r0 = address of row 1 (pred_ptr is no longer needed)
|
||||
|
||||
vst1.u8 {d0}, [r2] ;store result
|
||||
vst1.u8 {d1}, [r0], r3 ;row 1, then r0 advances to row 2
|
||||
add r2, r0, r3 ;r2 = address of row 3
|
||||
vst1.u8 {d2}, [r0]
|
||||
vst1.u8 {d3}, [r2], r3
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
END
|
@ -1,69 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_recon4b_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *pred_ptr,
|
||||
; r1 short *diff_ptr,
|
||||
; r2 unsigned char *dst_ptr,
|
||||
; r3 int stride
|
||||
|
||||
; Reconstructs an area of 4 rows x 16 pixels: reads 64 contiguous prediction
; bytes from r0 and 64 contiguous residual shorts from r1, adds them, saturates
; each sum to an unsigned byte and stores one 16-byte row per q-register at
; dst_ptr, dst_ptr+stride, dst_ptr+2*stride and dst_ptr+3*stride.
; NOTE(review): q4-q7 (d8-d15) are written here without being saved or
; restored; AAPCS lists d8-d15 as callee-saved -- confirm this is acceptable
; for every caller.
|vp8_recon4b_neon| PROC
|
||||
vld1.u8 {q12, q13}, [r0]! ;load data from pred_ptr
|
||||
vld1.16 {q8, q9}, [r1]! ;load data from diff_ptr
|
||||
vld1.u8 {q14, q15}, [r0]
|
||||
vld1.16 {q10, q11}, [r1]!
|
||||
|
||||
vmovl.u8 q0, d24 ;modify Pred data from 8 bits to 16 bits
|
||||
vmovl.u8 q1, d25
|
||||
vmovl.u8 q2, d26
|
||||
vmovl.u8 q3, d27
|
||||
vmovl.u8 q4, d28
|
||||
vmovl.u8 q5, d29
|
||||
vmovl.u8 q6, d30
|
||||
vld1.16 {q12, q13}, [r1]! ;q12/q13 already widened above, reuse them for diff data
|
||||
vmovl.u8 q7, d31
|
||||
vld1.16 {q14, q15}, [r1] ;q14/q15 already widened above, reuse them for diff data
|
||||
|
||||
vadd.s16 q0, q0, q8 ;add Diff data and Pred data together
|
||||
vadd.s16 q1, q1, q9
|
||||
vadd.s16 q2, q2, q10
|
||||
vadd.s16 q3, q3, q11
|
||||
vadd.s16 q4, q4, q12
|
||||
vadd.s16 q5, q5, q13
|
||||
vadd.s16 q6, q6, q14
|
||||
vadd.s16 q7, q7, q15
|
||||
|
||||
vqmovun.s16 d0, q0 ;CLAMP() saturation
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
vqmovun.s16 d3, q3
|
||||
vqmovun.s16 d4, q4
|
||||
vqmovun.s16 d5, q5
|
||||
vqmovun.s16 d6, q6
|
||||
vqmovun.s16 d7, q7
|
||||
add r0, r2, r3 ;r0 = address of row 1 (pred_ptr is no longer needed)
|
||||
|
||||
vst1.u8 {q0}, [r2] ;store result
|
||||
vst1.u8 {q1}, [r0], r3 ;row 1, then r0 advances to row 2
|
||||
add r2, r0, r3 ;r2 = address of row 3
|
||||
vst1.u8 {q2}, [r0]
|
||||
vst1.u8 {q3}, [r2], r3
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
END
|
@ -1,29 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
|
||||
#include "vpx_config.h"
|
||||
#include "vp8/common/recon.h"
|
||||
#include "vp8/common/blockd.h"
|
||||
|
||||
extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
|
||||
|
||||
/* Thin C wrapper around the assembly routine vp8_recon16x16mb_neon: it pulls
 * the predictor, residual and the three destination plane pointers out of the
 * macroblock descriptor and forwards them in a single call.
 *
 * rtcd is not used by this function.
 * x    supplies predictor[], diff[] and the dst y/u/v buffers and y stride.
 */
void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
|
||||
{
|
||||
unsigned char *pred_ptr = &x->predictor[0];
|
||||
short *diff_ptr = &x->diff[0];
|
||||
unsigned char *dst_ptr = x->dst.y_buffer;
|
||||
unsigned char *udst_ptr = x->dst.u_buffer;
|
||||
unsigned char *vdst_ptr = x->dst.v_buffer;
|
||||
int ystride = x->dst.y_stride;
|
||||
/* The chroma stride is not passed to the assembly routine; only ystride is. */
/*int uv_stride = x->dst.uv_stride;*/
|
||||
|
||||
vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, udst_ptr, vdst_ptr);
|
||||
}
|
@ -1,61 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_recon_b_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *pred_ptr,
|
||||
; r1 short *diff_ptr,
|
||||
; r2 unsigned char *dst_ptr,
|
||||
; r3 int stride
|
||||
|
||||
; Reconstructs one 4x4 block: for each of 4 rows, adds the first 4 residual
; shorts of the row to the first 4 prediction bytes of the row, saturates to
; unsigned bytes and stores 4 bytes at dst_ptr + row*stride.
; Prediction rows are read 16 bytes apart (r12); residual rows are read 32
; bytes (16 shorts) apart because every post-incremented two-q-register load
; advances r1 by 32.
|vp8_recon_b_neon| PROC
|
||||
mov r12, #16
|
||||
|
||||
vld1.u8 {d28}, [r0], r12 ;load 8 bytes/line from pred_ptr (only the first 4 are used)
|
||||
vld1.16 {q10, q11}, [r1]! ;load data from diff_ptr
|
||||
vld1.u8 {d29}, [r0], r12
|
||||
vld1.16 {q11, q12}, [r1]! ;overwrites q11 on purpose: only d20/d22/d24/d26 are consumed below
|
||||
vld1.u8 {d30}, [r0], r12
|
||||
vld1.16 {q12, q13}, [r1]!
|
||||
vld1.u8 {d31}, [r0], r12
|
||||
vld1.16 {q13}, [r1]
|
||||
|
||||
vmovl.u8 q0, d28 ;modify Pred data from 8 bits to 16 bits
|
||||
vmovl.u8 q1, d29 ;Pred data in d0, d2, d4, d6
|
||||
vmovl.u8 q2, d30
|
||||
vmovl.u8 q3, d31
|
||||
|
||||
vadd.s16 d0, d0, d20 ;add Diff data and Pred data together
|
||||
vadd.s16 d2, d2, d22
|
||||
vadd.s16 d4, d4, d24
|
||||
vadd.s16 d6, d6, d26
|
||||
|
||||
vqmovun.s16 d0, q0 ;CLAMP() saturation
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
vqmovun.s16 d3, q3
|
||||
add r1, r2, r3 ;r1 = address of row 1 (diff_ptr is no longer needed)
|
||||
|
||||
vst1.32 {d0[0]}, [r2] ;store result
|
||||
vst1.32 {d1[0]}, [r1], r3 ;row 1, then r1 advances to row 2
|
||||
add r2, r1, r3 ;r2 = address of row 3
|
||||
vst1.32 {d2[0]}, [r1]
|
||||
vst1.32 {d3[0]}, [r2], r3
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
END
|
@ -1,67 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_short_idct4x4llm_1_neon|
|
||||
EXPORT |vp8_dc_only_idct_neon|
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
|
||||
; r0 short *input;
|
||||
; r1 short *output;
|
||||
; r2 int pitch;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; DC-only inverse transform: loads input[0], replicates it into the four
; 16-bit lanes of d0, applies a rounding right shift by 3 and writes the same
; four shorts to each of four output rows. Row addresses are output,
; output+pitch, output+2*pitch and output+3*pitch, with pitch added to the
; pointer as a byte offset.
|vp8_short_idct4x4llm_1_neon| PROC
|
||||
vld1.16 {d0[]}, [r0] ;load input[0]
|
||||
|
||||
add r3, r1, r2 ;r3 = row 1
|
||||
add r12, r3, r2 ;r12 = row 2
|
||||
|
||||
vrshr.s16 d0, d0, #3
|
||||
|
||||
add r0, r12, r2 ;r0 = row 3 (input pointer is no longer needed)
|
||||
|
||||
vst1.16 {d0}, [r1]
|
||||
vst1.16 {d0}, [r3]
|
||||
vst1.16 {d0}, [r12]
|
||||
vst1.16 {d0}, [r0]
|
||||
|
||||
bx lr
|
||||
ENDP
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
|
||||
; r0 short input_dc;
|
||||
; r1 short *output;
|
||||
; r2 int pitch;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
; DC-only inverse transform taking the DC value directly in r0: replicates
; the low 16 bits of r0 into the four lanes of d0, applies a rounding right
; shift by 3 and writes the same four shorts to each of four output rows.
; Row addresses are output, output+pitch, output+2*pitch and output+3*pitch,
; with pitch added to the pointer as a byte offset.
|vp8_dc_only_idct_neon| PROC
|
||||
vdup.16 d0, r0
|
||||
|
||||
add r3, r1, r2 ;r3 = row 1
|
||||
add r12, r3, r2 ;r12 = row 2
|
||||
|
||||
vrshr.s16 d0, d0, #3
|
||||
|
||||
add r0, r12, r2 ;r0 = row 3 (input_dc has already been copied to d0)
|
||||
|
||||
vst1.16 {d0}, [r1]
|
||||
vst1.16 {d0}, [r3]
|
||||
vst1.16 {d0}, [r12]
|
||||
vst1.16 {d0}, [r0]
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
END
|
@ -17,18 +17,24 @@
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
;*************************************************************
|
||||
;void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
|
||||
;void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch,
|
||||
; unsigned char *dst, int stride)
|
||||
;r0 short * input
|
||||
;r1 short * output
|
||||
;r1 unsigned char * pred
|
||||
;r2 int pitch
|
||||
;r3 unsigned char * dst
|
||||
;sp int stride
|
||||
;*************************************************************
|
||||
;static const int cospi8sqrt2minus1=20091;
|
||||
;static const int sinpi8sqrt2 =35468;
|
||||
;static const int rounding = 0;
|
||||
;Optimization note: The resulted data from dequantization are signed 13-bit data that is
|
||||
;in the range of [-4096, 4095]. This allows to use "vqdmulh"(neon) instruction since
|
||||
;it won't go out of range (13+16+1=30bits<32bits). This instruction gives the high half
|
||||
;result of the multiplication that is needed in IDCT.
|
||||
|
||||
; static const int cospi8sqrt2minus1=20091;
|
||||
; static const int sinpi8sqrt2 =35468;
|
||||
; static const int rounding = 0;
|
||||
|
||||
; Optimization note: The resulted data from dequantization are signed
|
||||
; 13-bit data that is in the range of [-4096, 4095]. This allows to
|
||||
; use "vqdmulh"(neon) instruction since it won't go out of range
|
||||
; (13+16+1=30bits<32bits). This instruction gives the high half
|
||||
; result of the multiplication that is needed in IDCT.
|
||||
|
||||
|vp8_short_idct4x4llm_neon| PROC
|
||||
adr r12, idct_coeff
|
||||
@ -36,6 +42,7 @@
|
||||
vld1.16 {d0}, [r12]
|
||||
|
||||
vswp d3, d4 ;q2(vp[4] vp[12])
|
||||
ldr r0, [sp] ; stride
|
||||
|
||||
vqdmulh.s16 q3, q2, d0[2]
|
||||
vqdmulh.s16 q4, q2, d0[0]
|
||||
@ -94,21 +101,31 @@
|
||||
vrshr.s16 d4, d4, #3
|
||||
vrshr.s16 d5, d5, #3
|
||||
|
||||
add r3, r1, r2
|
||||
add r12, r3, r2
|
||||
add r0, r12, r2
|
||||
|
||||
vtrn.32 d2, d4
|
||||
vtrn.32 d3, d5
|
||||
vtrn.16 d2, d3
|
||||
vtrn.16 d4, d5
|
||||
|
||||
vst1.16 {d2}, [r1]
|
||||
vst1.16 {d3}, [r3]
|
||||
vst1.16 {d4}, [r12]
|
||||
vst1.16 {d5}, [r0]
|
||||
; load prediction data
|
||||
vld1.32 d6[0], [r1], r2
|
||||
vld1.32 d6[1], [r1], r2
|
||||
vld1.32 d7[0], [r1], r2
|
||||
vld1.32 d7[1], [r1], r2
|
||||
|
||||
bx lr
|
||||
; add prediction and residual
|
||||
vaddw.u8 q1, q1, d6
|
||||
vaddw.u8 q2, q2, d7
|
||||
|
||||
vqmovun.s16 d1, q1
|
||||
vqmovun.s16 d2, q2
|
||||
|
||||
; store to destination
|
||||
vst1.32 d1[0], [r3], r0
|
||||
vst1.32 d1[1], [r3], r0
|
||||
vst1.32 d2[0], [r3], r0
|
||||
vst1.32 d2[1], [r3], r0
|
||||
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
|
@ -13,24 +13,12 @@
|
||||
#define RECON_ARM_H
|
||||
|
||||
#if HAVE_ARMV6
|
||||
extern prototype_recon_block(vp8_recon_b_armv6);
|
||||
extern prototype_recon_block(vp8_recon2b_armv6);
|
||||
extern prototype_recon_block(vp8_recon4b_armv6);
|
||||
|
||||
extern prototype_copy_block(vp8_copy_mem8x8_v6);
|
||||
extern prototype_copy_block(vp8_copy_mem8x4_v6);
|
||||
extern prototype_copy_block(vp8_copy_mem16x16_v6);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_recon_recon
|
||||
#define vp8_recon_recon vp8_recon_b_armv6
|
||||
|
||||
#undef vp8_recon_recon2
|
||||
#define vp8_recon_recon2 vp8_recon2b_armv6
|
||||
|
||||
#undef vp8_recon_recon4
|
||||
#define vp8_recon_recon4 vp8_recon4b_armv6
|
||||
|
||||
#undef vp8_recon_copy8x8
|
||||
#define vp8_recon_copy8x8 vp8_copy_mem8x8_v6
|
||||
|
||||
@ -43,29 +31,15 @@ extern prototype_copy_block(vp8_copy_mem16x16_v6);
|
||||
#endif
|
||||
|
||||
#if HAVE_ARMV7
|
||||
extern prototype_recon_block(vp8_recon_b_neon);
|
||||
extern prototype_recon_block(vp8_recon2b_neon);
|
||||
extern prototype_recon_block(vp8_recon4b_neon);
|
||||
|
||||
extern prototype_copy_block(vp8_copy_mem8x8_neon);
|
||||
extern prototype_copy_block(vp8_copy_mem8x4_neon);
|
||||
extern prototype_copy_block(vp8_copy_mem16x16_neon);
|
||||
|
||||
extern prototype_recon_macroblock(vp8_recon_mb_neon);
|
||||
|
||||
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_neon);
|
||||
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_recon_recon
|
||||
#define vp8_recon_recon vp8_recon_b_neon
|
||||
|
||||
#undef vp8_recon_recon2
|
||||
#define vp8_recon_recon2 vp8_recon2b_neon
|
||||
|
||||
#undef vp8_recon_recon4
|
||||
#define vp8_recon_recon4 vp8_recon4b_neon
|
||||
|
||||
#undef vp8_recon_copy8x8
|
||||
#define vp8_recon_copy8x8 vp8_copy_mem8x8_neon
|
||||
|
||||
@ -75,9 +49,6 @@ extern prototype_build_intra_predictors(vp8_build_intra_predictors_mby_s_neon);
|
||||
#undef vp8_recon_copy16x16
|
||||
#define vp8_recon_copy16x16 vp8_copy_mem16x16_neon
|
||||
|
||||
#undef vp8_recon_recon_mb
|
||||
#define vp8_recon_recon_mb vp8_recon_mb_neon
|
||||
|
||||
#undef vp8_recon_build_intra_predictors_mby
|
||||
#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby_neon
|
||||
|
||||
|
@ -70,7 +70,6 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
|
||||
#if CONFIG_RUNTIME_CPU_DETECT
|
||||
VP8_COMMON_RTCD *rtcd = &ctx->rtcd;
|
||||
|
||||
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c;
|
||||
rtcd->idct.idct16 = vp8_short_idct4x4llm_c;
|
||||
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c;
|
||||
rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c;
|
||||
@ -79,11 +78,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx)
|
||||
rtcd->recon.copy16x16 = vp8_copy_mem16x16_c;
|
||||
rtcd->recon.copy8x8 = vp8_copy_mem8x8_c;
|
||||
rtcd->recon.copy8x4 = vp8_copy_mem8x4_c;
|
||||
rtcd->recon.recon = vp8_recon_b_c;
|
||||
rtcd->recon.recon2 = vp8_recon2b_c;
|
||||
rtcd->recon.recon4 = vp8_recon4b_c;
|
||||
rtcd->recon.recon_mb = vp8_recon_mb_c;
|
||||
rtcd->recon.recon_mby = vp8_recon_mby_c;
|
||||
|
||||
rtcd->recon.build_intra_predictors_mby =
|
||||
vp8_build_intra_predictors_mby;
|
||||
rtcd->recon.build_intra_predictors_mby_s =
|
||||
|
@ -16,12 +16,14 @@
|
||||
void sym(short *input, short *output)
|
||||
|
||||
#define prototype_idct(sym) \
|
||||
void sym(short *input, short *output, int pitch)
|
||||
void sym(short *input, unsigned char *pred, int pitch, unsigned char *dst, \
|
||||
int dst_stride)
|
||||
|
||||
#define prototype_idct_scalar_add(sym) \
|
||||
void sym(short input, \
|
||||
unsigned char *pred, unsigned char *output, \
|
||||
int pitch, int stride)
|
||||
unsigned char *pred, int pred_stride, \
|
||||
unsigned char *dst, \
|
||||
int dst_stride)
|
||||
|
||||
#if ARCH_X86 || ARCH_X86_64
|
||||
#include "x86/idct_x86.h"
|
||||
@ -31,11 +33,6 @@
|
||||
#include "arm/idct_arm.h"
|
||||
#endif
|
||||
|
||||
#ifndef vp8_idct_idct1
|
||||
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_c
|
||||
#endif
|
||||
extern prototype_idct(vp8_idct_idct1);
|
||||
|
||||
#ifndef vp8_idct_idct16
|
||||
#define vp8_idct_idct16 vp8_short_idct4x4llm_c
|
||||
#endif
|
||||
@ -63,7 +60,6 @@ typedef prototype_second_order((*vp8_second_order_fn_t));
|
||||
|
||||
typedef struct
|
||||
{
|
||||
vp8_idct_fn_t idct1;
|
||||
vp8_idct_fn_t idct16;
|
||||
vp8_idct_scalar_add_fn_t idct1_scalar_add;
|
||||
|
||||
|
@ -24,28 +24,31 @@
|
||||
**************************************************************************/
|
||||
static const int cospi8sqrt2minus1 = 20091;
|
||||
static const int sinpi8sqrt2 = 35468;
|
||||
static const int rounding = 0;
|
||||
void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
|
||||
|
||||
void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr,
|
||||
int pred_stride, unsigned char *dst_ptr,
|
||||
int dst_stride)
|
||||
{
|
||||
int i;
|
||||
int r, c;
|
||||
int a1, b1, c1, d1;
|
||||
|
||||
short output[16];
|
||||
short *ip = input;
|
||||
short *op = output;
|
||||
int temp1, temp2;
|
||||
int shortpitch = pitch >> 1;
|
||||
int shortpitch = 4;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
a1 = ip[0] + ip[8];
|
||||
b1 = ip[0] - ip[8];
|
||||
|
||||
temp1 = (ip[4] * sinpi8sqrt2 + rounding) >> 16;
|
||||
temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1 + rounding) >> 16);
|
||||
temp1 = (ip[4] * sinpi8sqrt2) >> 16;
|
||||
temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
|
||||
c1 = temp1 - temp2;
|
||||
|
||||
temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1 + rounding) >> 16);
|
||||
temp2 = (ip[12] * sinpi8sqrt2 + rounding) >> 16;
|
||||
temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
|
||||
temp2 = (ip[12] * sinpi8sqrt2) >> 16;
|
||||
d1 = temp1 + temp2;
|
||||
|
||||
op[shortpitch*0] = a1 + d1;
|
||||
@ -66,12 +69,12 @@ void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
|
||||
a1 = ip[0] + ip[2];
|
||||
b1 = ip[0] - ip[2];
|
||||
|
||||
temp1 = (ip[1] * sinpi8sqrt2 + rounding) >> 16;
|
||||
temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1 + rounding) >> 16);
|
||||
temp1 = (ip[1] * sinpi8sqrt2) >> 16;
|
||||
temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
|
||||
c1 = temp1 - temp2;
|
||||
|
||||
temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1 + rounding) >> 16);
|
||||
temp2 = (ip[3] * sinpi8sqrt2 + rounding) >> 16;
|
||||
temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
|
||||
temp2 = (ip[3] * sinpi8sqrt2) >> 16;
|
||||
d1 = temp1 + temp2;
|
||||
|
||||
|
||||
@ -84,27 +87,31 @@ void vp8_short_idct4x4llm_c(short *input, short *output, int pitch)
|
||||
ip += shortpitch;
|
||||
op += shortpitch;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch)
|
||||
{
|
||||
int i;
|
||||
int a1;
|
||||
short *op = output;
|
||||
int shortpitch = pitch >> 1;
|
||||
a1 = ((input[0] + 4) >> 3);
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
ip = output;
|
||||
for (r = 0; r < 4; r++)
|
||||
{
|
||||
op[0] = a1;
|
||||
op[1] = a1;
|
||||
op[2] = a1;
|
||||
op[3] = a1;
|
||||
op += shortpitch;
|
||||
for (c = 0; c < 4; c++)
|
||||
{
|
||||
int a = ip[c] + pred_ptr[c] ;
|
||||
|
||||
if (a < 0)
|
||||
a = 0;
|
||||
|
||||
if (a > 255)
|
||||
a = 255;
|
||||
|
||||
dst_ptr[c] = (unsigned char) a ;
|
||||
}
|
||||
ip += 4;
|
||||
dst_ptr += dst_stride;
|
||||
pred_ptr += pred_stride;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
|
||||
void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
|
||||
int pred_stride, unsigned char *dst_ptr,
|
||||
int dst_stride)
|
||||
{
|
||||
int a1 = ((input_dc + 4) >> 3);
|
||||
int r, c;
|
||||
@ -124,8 +131,8 @@ void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned ch
|
||||
dst_ptr[c] = (unsigned char) a ;
|
||||
}
|
||||
|
||||
dst_ptr += stride;
|
||||
pred_ptr += pitch;
|
||||
dst_ptr += dst_stride;
|
||||
pred_ptr += pred_stride;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -12,6 +12,21 @@
|
||||
#include "invtrans.h"
|
||||
|
||||
|
||||
/* Inverse-transforms one block and writes the reconstructed pixels straight
 * to the destination frame.
 *
 * rtcd   idct function table used by IDCT_INVOKE.
 * b      block descriptor; reads eob, dqcoeff, predictor, base_dst, dst and
 *        dst_stride.
 * pitch  passed through as the row stride of b->predictor.
 *
 * When eob > 1 the full idct16 routine is used; otherwise only dqcoeff[0] is
 * handed to idct1_scalar_add. Both receive the predictor and the destination
 * address *(b->base_dst) + b->dst.
 */
void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b,
|
||||
int pitch)
|
||||
{
|
||||
if (b->eob > 1)
|
||||
{
|
||||
IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, pitch,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, pitch,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void recon_dcblock(MACROBLOCKD *x)
|
||||
{
|
||||
@ -25,15 +40,6 @@ static void recon_dcblock(MACROBLOCKD *x)
|
||||
|
||||
}
|
||||
|
||||
/* Inverse-transforms one block's dequantized coefficients into b->diff.
 * Uses the full idct16 routine when eob > 1 and the idct1 routine otherwise;
 * pitch is forwarded unchanged to whichever routine is chosen.
 */
void vp8_inverse_transform_b(const vp8_idct_rtcd_vtable_t *rtcd, BLOCKD *b, int pitch)
|
||||
{
|
||||
if (b->eob > 1)
|
||||
IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
|
||||
else
|
||||
IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
|
||||
}
|
||||
|
||||
|
||||
void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
|
||||
{
|
||||
int i;
|
||||
@ -45,7 +51,7 @@ void vp8_inverse_transform_mby(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *
|
||||
|
||||
for (i = 0; i < 16; i++)
|
||||
{
|
||||
vp8_inverse_transform_b(rtcd, &x->block[i], 32);
|
||||
vp8_inverse_transform_b(rtcd, &x->block[i], 16);
|
||||
}
|
||||
|
||||
}
|
||||
@ -55,34 +61,10 @@ void vp8_inverse_transform_mbuv(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD
|
||||
|
||||
for (i = 16; i < 24; i++)
|
||||
{
|
||||
vp8_inverse_transform_b(rtcd, &x->block[i], 16);
|
||||
vp8_inverse_transform_b(rtcd, &x->block[i], 8);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/* Inverse-transforms every block of a macroblock.
 *
 * For any mode other than B_PRED and SPLITMV, block 24 is first run through
 * iwalsh16 and recon_dcblock(x) is called. Blocks 0..15 are then transformed
 * with a pitch of 32 and blocks 16..23 with a pitch of 16.
 */
void vp8_inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (x->mode_info_context->mbmi.mode != B_PRED &&
|
||||
x->mode_info_context->mbmi.mode != SPLITMV)
|
||||
{
|
||||
/* do 2nd order transform on the dc block */
|
||||
|
||||
IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);
|
||||
recon_dcblock(x);
|
||||
}
|
||||
|
||||
/* blocks 0..15, pitch 32 */
for (i = 0; i < 16; i++)
|
||||
{
|
||||
vp8_inverse_transform_b(rtcd, &x->block[i], 32);
|
||||
}
|
||||
|
||||
|
||||
/* blocks 16..23, pitch 16 */
for (i = 16; i < 24; i++)
|
||||
{
|
||||
vp8_inverse_transform_b(rtcd, &x->block[i], 16);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -18,7 +18,7 @@
|
||||
void sym(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch)
|
||||
|
||||
#define prototype_recon_block(sym) \
|
||||
void sym(unsigned char *pred, short *diff, unsigned char *dst, int pitch)
|
||||
void sym(unsigned char *pred, short *diff, int diff_stride, unsigned char *dst, int pitch)
|
||||
|
||||
#define prototype_recon_macroblock(sym) \
|
||||
void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x)
|
||||
@ -27,7 +27,7 @@
|
||||
void sym(MACROBLOCKD *x)
|
||||
|
||||
#define prototype_intra4x4_predict(sym) \
|
||||
void sym(BLOCKD *x, int b_mode, unsigned char *predictor)
|
||||
void sym(BLOCKD *x, int b_mode, unsigned char *predictor, int stride)
|
||||
|
||||
struct vp8_recon_rtcd_vtable;
|
||||
|
||||
@ -54,31 +54,6 @@ extern prototype_copy_block(vp8_recon_copy8x8);
|
||||
#endif
|
||||
extern prototype_copy_block(vp8_recon_copy8x4);
|
||||
|
||||
#ifndef vp8_recon_recon
|
||||
#define vp8_recon_recon vp8_recon_b_c
|
||||
#endif
|
||||
extern prototype_recon_block(vp8_recon_recon);
|
||||
|
||||
#ifndef vp8_recon_recon2
|
||||
#define vp8_recon_recon2 vp8_recon2b_c
|
||||
#endif
|
||||
extern prototype_recon_block(vp8_recon_recon2);
|
||||
|
||||
#ifndef vp8_recon_recon4
|
||||
#define vp8_recon_recon4 vp8_recon4b_c
|
||||
#endif
|
||||
extern prototype_recon_block(vp8_recon_recon4);
|
||||
|
||||
#ifndef vp8_recon_recon_mb
|
||||
#define vp8_recon_recon_mb vp8_recon_mb_c
|
||||
#endif
|
||||
extern prototype_recon_macroblock(vp8_recon_recon_mb);
|
||||
|
||||
#ifndef vp8_recon_recon_mby
|
||||
#define vp8_recon_recon_mby vp8_recon_mby_c
|
||||
#endif
|
||||
extern prototype_recon_macroblock(vp8_recon_recon_mby);
|
||||
|
||||
#ifndef vp8_recon_build_intra_predictors_mby
|
||||
#define vp8_recon_build_intra_predictors_mby vp8_build_intra_predictors_mby
|
||||
#endif
|
||||
@ -111,8 +86,6 @@ extern prototype_intra4x4_predict\
|
||||
|
||||
|
||||
typedef prototype_copy_block((*vp8_copy_block_fn_t));
|
||||
typedef prototype_recon_block((*vp8_recon_fn_t));
|
||||
typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
|
||||
typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t));
|
||||
typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t));
|
||||
typedef struct vp8_recon_rtcd_vtable
|
||||
@ -120,11 +93,7 @@ typedef struct vp8_recon_rtcd_vtable
|
||||
vp8_copy_block_fn_t copy16x16;
|
||||
vp8_copy_block_fn_t copy8x8;
|
||||
vp8_copy_block_fn_t copy8x4;
|
||||
vp8_recon_fn_t recon;
|
||||
vp8_recon_fn_t recon2;
|
||||
vp8_recon_fn_t recon4;
|
||||
vp8_recon_mb_fn_t recon_mb;
|
||||
vp8_recon_mb_fn_t recon_mby;
|
||||
|
||||
vp8_build_intra_pred_fn_t build_intra_predictors_mby_s;
|
||||
vp8_build_intra_pred_fn_t build_intra_predictors_mby;
|
||||
vp8_build_intra_pred_fn_t build_intra_predictors_mbuv_s;
|
||||
@ -138,5 +107,4 @@ typedef struct vp8_recon_rtcd_vtable
|
||||
#define RECON_INVOKE(ctx,fn) vp8_recon_##fn
|
||||
#endif
|
||||
|
||||
void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
|
||||
#endif
|
||||
|
@ -123,7 +123,6 @@ void vp8_copy_mem8x4_c(
|
||||
}
|
||||
|
||||
|
||||
|
||||
void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
|
||||
{
|
||||
int r;
|
||||
@ -159,41 +158,73 @@ void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf)
|
||||
}
|
||||
}
|
||||
|
||||
static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, int pitch)
|
||||
static void build_inter_predictors4b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride)
|
||||
{
|
||||
unsigned char *ptr_base;
|
||||
unsigned char *ptr;
|
||||
unsigned char *pred_ptr = d->predictor;
|
||||
|
||||
ptr_base = *(d->base_pre);
|
||||
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
|
||||
|
||||
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
|
||||
{
|
||||
x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
|
||||
x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, pred_ptr, pitch);
|
||||
RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst, dst_stride);
|
||||
}
|
||||
}
|
||||
|
||||
static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch)
|
||||
static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, unsigned char *dst, int dst_stride)
|
||||
{
|
||||
unsigned char *ptr_base;
|
||||
unsigned char *ptr;
|
||||
unsigned char *pred_ptr = d->predictor;
|
||||
|
||||
ptr_base = *(d->base_pre);
|
||||
ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
|
||||
|
||||
if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7)
|
||||
{
|
||||
x->subpixel_predict8x4(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, pred_ptr, pitch);
|
||||
x->subpixel_predict8x4(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst, dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d->pre_stride, pred_ptr, pitch);
|
||||
RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d->pre_stride, dst, dst_stride);
|
||||
}
|
||||
}
|
||||
|
||||
/* Build the prediction for a single 4x4 block directly into dst.
 *
 * d          - block descriptor; supplies the reference base pointer, the
 *              block's offset inside it, the reference stride and the
 *              block's motion vector.
 * dst        - first byte of the 4x4 output area.
 * dst_stride - distance in bytes between consecutive output rows.
 * sppf       - interpolation routine, called only when the vector has a
 *              fractional part.
 *
 * Each vector component is split in two: the low three bits are handed to
 * sppf as the fractional offset, the remaining bits (>> 3) give the
 * whole-pixel displacement used to address the reference.
 */
static void build_inter_predictors_b(BLOCKD *d, unsigned char *dst, int dst_stride, vp8_subpix_fn_t sppf)
{
    const int frac_col = d->bmi.mv.as_mv.col & 7;
    const int frac_row = d->bmi.mv.as_mv.row & 7;
    unsigned char *src = *(d->base_pre) + d->pre
                         + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride
                         + (d->bmi.mv.as_mv.col >> 3);
    int rows_left;

    if (frac_row || frac_col)
    {
        /* Fractional vector: let the sub-pixel filter produce the block. */
        sppf(src, d->pre_stride, frac_col, frac_row, dst, dst_stride);
        return;
    }

    /* Whole-pixel vector: plain 4x4 copy, one row per iteration. */
    for (rows_left = 4; rows_left > 0; rows_left--)
    {
#if !(CONFIG_FAST_UNALIGNED)
        dst[0] = src[0];
        dst[1] = src[1];
        dst[2] = src[2];
        dst[3] = src[3];
#else
        /* Build configuration allows a single 32-bit access per row. */
        *(uint32_t *)dst = *(uint32_t *)src ;
#endif
        src += d->pre_stride;
        dst += dst_stride;
    }
}
|
||||
|
||||
@ -292,7 +323,7 @@ void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
|
||||
BLOCKD *d1 = &x->block[i+1];
|
||||
|
||||
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
|
||||
build_inter_predictors2b(x, d0, 8);
|
||||
build_inter_predictors2b(x, d0, d0->predictor, 8);
|
||||
else
|
||||
{
|
||||
vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
|
||||
@ -435,6 +466,9 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
|
||||
|
||||
if (x->mode_info_context->mbmi.partitioning < 3)
|
||||
{
|
||||
BLOCKD *b;
|
||||
int dst_stride = x->block[ 0].dst_stride;
|
||||
|
||||
x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
|
||||
x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
|
||||
x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
|
||||
@ -447,10 +481,14 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
|
||||
clamp_mv_to_umv_border(&x->block[10].bmi.mv.as_mv, x);
|
||||
}
|
||||
|
||||
build_inter_predictors4b(x, &x->block[ 0], 16);
|
||||
build_inter_predictors4b(x, &x->block[ 2], 16);
|
||||
build_inter_predictors4b(x, &x->block[ 8], 16);
|
||||
build_inter_predictors4b(x, &x->block[10], 16);
|
||||
b = &x->block[ 0];
|
||||
build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
|
||||
b = &x->block[ 2];
|
||||
build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
|
||||
b = &x->block[ 8];
|
||||
build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
|
||||
b = &x->block[10];
|
||||
build_inter_predictors4b(x, b, *(b->base_dst) + b->dst, dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -458,6 +496,7 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
|
||||
{
|
||||
BLOCKD *d0 = &x->block[i];
|
||||
BLOCKD *d1 = &x->block[i+1];
|
||||
int dst_stride = x->block[ 0].dst_stride;
|
||||
|
||||
x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
|
||||
x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
|
||||
@ -468,11 +507,11 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
|
||||
}
|
||||
|
||||
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
|
||||
build_inter_predictors2b(x, d0, 16);
|
||||
build_inter_predictors2b(x, d0, *(d0->base_dst) + d0->dst, dst_stride);
|
||||
else
|
||||
{
|
||||
vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict);
|
||||
vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict);
|
||||
build_inter_predictors_b(d0, *(d0->base_dst) + d0->dst, dst_stride, x->subpixel_predict);
|
||||
build_inter_predictors_b(d1, *(d1->base_dst) + d1->dst, dst_stride, x->subpixel_predict);
|
||||
}
|
||||
|
||||
}
|
||||
@ -483,15 +522,16 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
|
||||
{
|
||||
BLOCKD *d0 = &x->block[i];
|
||||
BLOCKD *d1 = &x->block[i+1];
|
||||
int dst_stride = x->block[ 16].dst_stride;
|
||||
|
||||
/* Note: uv mvs already clamped in build_4x4uvmvs() */
|
||||
|
||||
if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
|
||||
build_inter_predictors2b(x, d0, 8);
|
||||
build_inter_predictors2b(x, d0, *(d0->base_dst) + d0->dst, dst_stride);
|
||||
else
|
||||
{
|
||||
vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
|
||||
vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
|
||||
build_inter_predictors_b(d0, *(d0->base_dst) + d0->dst, dst_stride, x->subpixel_predict);
|
||||
build_inter_predictors_b(d1, *(d1->base_dst) + d1->dst, dst_stride, x->subpixel_predict);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -542,17 +582,83 @@ void build_4x4uvmvs(MACROBLOCKD *x)
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
|
||||
void vp8_build_inter_predictors_mb(MACROBLOCKD *xd)
|
||||
{
|
||||
if (x->mode_info_context->mbmi.mode != SPLITMV)
|
||||
if (xd->mode_info_context->mbmi.mode != SPLITMV)
|
||||
{
|
||||
vp8_build_inter16x16_predictors_mb(x, x->predictor, &x->predictor[256],
|
||||
&x->predictor[320], 16, 8);
|
||||
vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer,
|
||||
xd->dst.u_buffer, xd->dst.v_buffer,
|
||||
xd->dst.y_stride, xd->dst.uv_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
build_4x4uvmvs(x);
|
||||
build_inter4x4_predictors_mb(x);
|
||||
build_4x4uvmvs(xd);
|
||||
build_inter4x4_predictors_mb(xd);
|
||||
}
|
||||
}
|
||||
/* encoder only*/
|
||||
/* Split-motion-vector prediction that writes into the per-block predictor
 * areas (row pitch 16 for blocks 0-15, pitch 8 for blocks 16-23).
 *
 * For blocks 0-15 the per-block mode info is first copied from
 * x->mode_info_context into x->block[]. When mbmi.partitioning is below 3,
 * only blocks 0, 2, 8 and 10 are used, each predicted with the 4-block
 * routine. Otherwise the blocks are walked in horizontal pairs: a pair that
 * shares one vector is predicted with a single 2-block call, a pair with
 * different vectors is predicted block by block.
 *
 * Blocks 16-23 are always handled pairwise; their bmi entries are not
 * copied here and are expected to be set up already by the caller.
 */
static void build_inter4x4_predictors_mb_e(MACROBLOCKD *x)
{
    static const int quadrant_block[4] = { 0, 2, 8, 10 };
    int i;

    if (x->mode_info_context->mbmi.partitioning >= 3)
    {
        for (i = 0; i < 16; i += 2)
        {
            BLOCKD *first = x->block + i;
            BLOCKD *second = first + 1;

            first->bmi = x->mode_info_context->bmi[i];
            second->bmi = x->mode_info_context->bmi[i + 1];

            if (first->bmi.mv.as_int != second->bmi.mv.as_int)
            {
                build_inter_predictors_b(first, first->predictor, 16, x->subpixel_predict);
                build_inter_predictors_b(second, second->predictor, 16, x->subpixel_predict);
            }
            else
            {
                build_inter_predictors2b(x, first, first->predictor, 16);
            }
        }
    }
    else
    {
        /* Refresh all four entries before any prediction is issued. */
        for (i = 0; i < 4; i++)
        {
            x->block[quadrant_block[i]].bmi =
                x->mode_info_context->bmi[quadrant_block[i]];
        }

        for (i = 0; i < 4; i++)
        {
            BLOCKD *quad = x->block + quadrant_block[i];

            build_inter_predictors4b(x, quad, quad->predictor, 16);
        }
    }

    for (i = 16; i < 24; i += 2)
    {
        BLOCKD *first = x->block + i;
        BLOCKD *second = first + 1;

        if (first->bmi.mv.as_int != second->bmi.mv.as_int)
        {
            build_inter_predictors_b(first, first->predictor, 8, x->subpixel_predict);
            build_inter_predictors_b(second, second->predictor, 8, x->subpixel_predict);
        }
        else
        {
            build_inter_predictors2b(x, first, first->predictor, 8);
        }
    }
}
|
||||
/* Build the inter prediction for one macroblock into xd->predictor.
 *
 * SPLITMV macroblocks first derive their 4x4 u/v vectors and are then
 * predicted block by block. Every other mode uses the 16x16 predictor,
 * which is given three destinations inside xd->predictor: offset 0 with
 * stride 16, and offsets 256 and 320 with stride 8.
 */
void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd)
{
    if (xd->mode_info_context->mbmi.mode == SPLITMV)
    {
        build_4x4uvmvs(xd);
        build_inter4x4_predictors_mb_e(xd);
        return;
    }

    vp8_build_inter16x16_predictors_mb(xd,
                                       xd->predictor,
                                       xd->predictor + 256,
                                       xd->predictor + 320,
                                       16, 8);
}
|
||||
|
@ -26,5 +26,6 @@ extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t s
|
||||
|
||||
extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
|
||||
extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
|
||||
extern void vp8_build_inter_predictors_mb_e(MACROBLOCKD *xd);
|
||||
|
||||
#endif
|
||||
|
@ -17,16 +17,6 @@
|
||||
/* For skip_recon_mb(), add vp8_build_intra_predictors_mby_s(MACROBLOCKD *x) and
|
||||
* vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x).
|
||||
*/
|
||||
/* Reconstruct blocks 16..23 of the macroblock, two blocks per call.
 *
 * For every even-numbered block in that range the recon2 routine selected
 * through RECON_INVOKE is given the block's predictor, its diff buffer and
 * its destination address (*base_dst + dst) with the block's dst_stride.
 */
void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
{
    BLOCKD *blk = x->block + 16;
    BLOCKD *const stop = x->block + 24;

    while (blk < stop)
    {
        RECON_INVOKE(rtcd, recon2)(blk->predictor, blk->diff, *(blk->base_dst) + blk->dst, blk->dst_stride);
        blk += 2;
    }
}
|
||||
|
||||
void vp8_build_intra_predictors_mby(MACROBLOCKD *x)
|
||||
{
|
||||
|
@ -16,7 +16,7 @@
|
||||
|
||||
void vp8_intra4x4_predict(BLOCKD *x,
|
||||
int b_mode,
|
||||
unsigned char *predictor)
|
||||
unsigned char *predictor, int stride)
|
||||
{
|
||||
int i, r, c;
|
||||
|
||||
@ -50,7 +50,7 @@ void vp8_intra4x4_predict(BLOCKD *x,
|
||||
predictor[c] = expected_dc;
|
||||
}
|
||||
|
||||
predictor += 16;
|
||||
predictor += stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -72,7 +72,7 @@ void vp8_intra4x4_predict(BLOCKD *x,
|
||||
predictor[c] = pred;
|
||||
}
|
||||
|
||||
predictor += 16;
|
||||
predictor += stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -94,7 +94,7 @@ void vp8_intra4x4_predict(BLOCKD *x,
|
||||
predictor[c] = ap[c];
|
||||
}
|
||||
|
||||
predictor += 16;
|
||||
predictor += stride;
|
||||
}
|
||||
|
||||
}
|
||||
@ -117,29 +117,29 @@ void vp8_intra4x4_predict(BLOCKD *x,
|
||||
predictor[c] = lp[r];
|
||||
}
|
||||
|
||||
predictor += 16;
|
||||
predictor += stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case B_LD_PRED:
|
||||
{
|
||||
unsigned char *ptr = Above;
|
||||
predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
|
||||
predictor[0 * 16 + 1] =
|
||||
predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
|
||||
predictor[0 * 16 + 2] =
|
||||
predictor[1 * 16 + 1] =
|
||||
predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
|
||||
predictor[0 * 16 + 3] =
|
||||
predictor[1 * 16 + 2] =
|
||||
predictor[2 * 16 + 1] =
|
||||
predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
|
||||
predictor[1 * 16 + 3] =
|
||||
predictor[2 * 16 + 2] =
|
||||
predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
|
||||
predictor[2 * 16 + 3] =
|
||||
predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
|
||||
predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
|
||||
predictor[0 * stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
|
||||
predictor[0 * stride + 1] =
|
||||
predictor[1 * stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
|
||||
predictor[0 * stride + 2] =
|
||||
predictor[1 * stride + 1] =
|
||||
predictor[2 * stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
|
||||
predictor[0 * stride + 3] =
|
||||
predictor[1 * stride + 2] =
|
||||
predictor[2 * stride + 1] =
|
||||
predictor[3 * stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
|
||||
predictor[1 * stride + 3] =
|
||||
predictor[2 * stride + 2] =
|
||||
predictor[3 * stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
|
||||
predictor[2 * stride + 3] =
|
||||
predictor[3 * stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
|
||||
predictor[3 * stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
|
||||
|
||||
}
|
||||
break;
|
||||
@ -158,22 +158,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
|
||||
pp[7] = Above[2];
|
||||
pp[8] = Above[3];
|
||||
|
||||
predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[3 * 16 + 1] =
|
||||
predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[3 * 16 + 2] =
|
||||
predictor[2 * 16 + 1] =
|
||||
predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[3 * 16 + 3] =
|
||||
predictor[2 * 16 + 2] =
|
||||
predictor[1 * 16 + 1] =
|
||||
predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * 16 + 3] =
|
||||
predictor[1 * 16 + 2] =
|
||||
predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[1 * 16 + 3] =
|
||||
predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
|
||||
predictor[3 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[3 * stride + 1] =
|
||||
predictor[2 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[3 * stride + 2] =
|
||||
predictor[2 * stride + 1] =
|
||||
predictor[1 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[3 * stride + 3] =
|
||||
predictor[2 * stride + 2] =
|
||||
predictor[1 * stride + 1] =
|
||||
predictor[0 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * stride + 3] =
|
||||
predictor[1 * stride + 2] =
|
||||
predictor[0 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[1 * stride + 3] =
|
||||
predictor[0 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[0 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
|
||||
|
||||
}
|
||||
break;
|
||||
@ -193,22 +193,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
|
||||
pp[8] = Above[3];
|
||||
|
||||
|
||||
predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[3 * 16 + 1] =
|
||||
predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * 16 + 1] =
|
||||
predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
|
||||
predictor[3 * 16 + 2] =
|
||||
predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[2 * 16 + 2] =
|
||||
predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
|
||||
predictor[3 * 16 + 3] =
|
||||
predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[2 * 16 + 3] =
|
||||
predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
|
||||
predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
|
||||
predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
|
||||
predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[3 * stride + 1] =
|
||||
predictor[1 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * stride + 1] =
|
||||
predictor[0 * stride + 0] = (pp[4] + pp[5] + 1) >> 1;
|
||||
predictor[3 * stride + 2] =
|
||||
predictor[1 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[2 * stride + 2] =
|
||||
predictor[0 * stride + 1] = (pp[5] + pp[6] + 1) >> 1;
|
||||
predictor[3 * stride + 3] =
|
||||
predictor[1 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[2 * stride + 3] =
|
||||
predictor[0 * stride + 2] = (pp[6] + pp[7] + 1) >> 1;
|
||||
predictor[1 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
|
||||
predictor[0 * stride + 3] = (pp[7] + pp[8] + 1) >> 1;
|
||||
|
||||
}
|
||||
break;
|
||||
@ -217,22 +217,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
|
||||
|
||||
unsigned char *pp = Above;
|
||||
|
||||
predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[2 * 16 + 0] =
|
||||
predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[1 * 16 + 1] =
|
||||
predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * 16 + 1] =
|
||||
predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[3 * 16 + 1] =
|
||||
predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[0 * 16 + 3] =
|
||||
predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
|
||||
predictor[1 * 16 + 3] =
|
||||
predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[1 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[2 * stride + 0] =
|
||||
predictor[0 * stride + 1] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[1 * stride + 1] =
|
||||
predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * stride + 1] =
|
||||
predictor[0 * stride + 2] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[3 * stride + 1] =
|
||||
predictor[1 * stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[0 * stride + 3] =
|
||||
predictor[2 * stride + 2] = (pp[3] + pp[4] + 1) >> 1;
|
||||
predictor[1 * stride + 3] =
|
||||
predictor[3 * stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[3 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
}
|
||||
break;
|
||||
|
||||
@ -250,22 +250,22 @@ void vp8_intra4x4_predict(BLOCKD *x,
|
||||
pp[8] = Above[3];
|
||||
|
||||
|
||||
predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[2 * 16 + 0] =
|
||||
predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[2 * 16 + 1] =
|
||||
predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * 16 + 2] =
|
||||
predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[2 * 16 + 3] =
|
||||
predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[1 * 16 + 2] =
|
||||
predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
|
||||
predictor[1 * 16 + 3] =
|
||||
predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[3 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[3 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[2 * stride + 0] =
|
||||
predictor[3 * stride + 2] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[2 * stride + 1] =
|
||||
predictor[3 * stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * stride + 2] =
|
||||
predictor[1 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[2 * stride + 3] =
|
||||
predictor[1 * stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[1 * stride + 2] =
|
||||
predictor[0 * stride + 0] = (pp[3] + pp[4] + 1) >> 1;
|
||||
predictor[1 * stride + 3] =
|
||||
predictor[0 * stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[0 * stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[0 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
}
|
||||
break;
|
||||
|
||||
@ -273,28 +273,33 @@ void vp8_intra4x4_predict(BLOCKD *x,
|
||||
case B_HU_PRED:
|
||||
{
|
||||
unsigned char *pp = Left;
|
||||
predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[0 * 16 + 2] =
|
||||
predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[0 * 16 + 3] =
|
||||
predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[1 * 16 + 2] =
|
||||
predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[1 * 16 + 3] =
|
||||
predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * 16 + 2] =
|
||||
predictor[2 * 16 + 3] =
|
||||
predictor[3 * 16 + 0] =
|
||||
predictor[3 * 16 + 1] =
|
||||
predictor[3 * 16 + 2] =
|
||||
predictor[3 * 16 + 3] = pp[3];
|
||||
predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[0 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[0 * stride + 2] =
|
||||
predictor[1 * stride + 0] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[0 * stride + 3] =
|
||||
predictor[1 * stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[1 * stride + 2] =
|
||||
predictor[2 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[1 * stride + 3] =
|
||||
predictor[2 * stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * stride + 2] =
|
||||
predictor[2 * stride + 3] =
|
||||
predictor[3 * stride + 0] =
|
||||
predictor[3 * stride + 1] =
|
||||
predictor[3 * stride + 2] =
|
||||
predictor[3 * stride + 3] = pp[3];
|
||||
}
|
||||
break;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/* copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
|
||||
* to the right for prediction have filled-in pixels to use.
|
||||
*/
|
||||
|
@ -20,7 +20,6 @@
|
||||
*/
|
||||
|
||||
#if HAVE_MMX
|
||||
extern prototype_idct(vp8_short_idct4x4llm_1_mmx);
|
||||
extern prototype_idct(vp8_short_idct4x4llm_mmx);
|
||||
extern prototype_idct_scalar_add(vp8_dc_only_idct_add_mmx);
|
||||
|
||||
@ -28,9 +27,6 @@ extern prototype_second_order(vp8_short_inv_walsh4x4_mmx);
|
||||
extern prototype_second_order(vp8_short_inv_walsh4x4_1_mmx);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_idct_idct1
|
||||
#define vp8_idct_idct1 vp8_short_idct4x4llm_1_mmx
|
||||
|
||||
#undef vp8_idct_idct16
|
||||
#define vp8_idct_idct16 vp8_short_idct4x4llm_mmx
|
||||
|
||||
|
@ -32,197 +32,10 @@
|
||||
; **************************************************************************/
|
||||
|
||||
|
||||
;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
|
||||
;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
|
||||
;int pitch, unsigned char *dest,int stride)
|
||||
global sym(vp8_short_idct4x4llm_mmx)
|
||||
sym(vp8_short_idct4x4llm_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 3
|
||||
GET_GOT rbx
|
||||
; end prolog
|
||||
|
||||
mov rax, arg(0) ;input
|
||||
mov rdx, arg(1) ;output
|
||||
|
||||
movq mm0, [rax ]
|
||||
movq mm1, [rax+ 8]
|
||||
|
||||
movq mm2, [rax+16]
|
||||
movq mm3, [rax+24]
|
||||
|
||||
movsxd rax, dword ptr arg(2) ;pitch
|
||||
|
||||
psubw mm0, mm2 ; b1= 0-2
|
||||
paddw mm2, mm2 ;
|
||||
|
||||
movq mm5, mm1
|
||||
paddw mm2, mm0 ; a1 =0+2
|
||||
|
||||
pmulhw mm5, [GLOBAL(x_s1sqr2)] ;
|
||||
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
|
||||
|
||||
movq mm7, mm3 ;
|
||||
pmulhw mm7, [GLOBAL(x_c1sqr2less1)] ;
|
||||
|
||||
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
|
||||
psubw mm7, mm5 ; c1
|
||||
|
||||
movq mm5, mm1
|
||||
movq mm4, mm3
|
||||
|
||||
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
|
||||
paddw mm5, mm1
|
||||
|
||||
pmulhw mm3, [GLOBAL(x_s1sqr2)]
|
||||
paddw mm3, mm4
|
||||
|
||||
paddw mm3, mm5 ; d1
|
||||
movq mm6, mm2 ; a1
|
||||
|
||||
movq mm4, mm0 ; b1
|
||||
paddw mm2, mm3 ;0
|
||||
|
||||
paddw mm4, mm7 ;1
|
||||
psubw mm0, mm7 ;2
|
||||
|
||||
psubw mm6, mm3 ;3
|
||||
|
||||
movq mm1, mm2 ; 03 02 01 00
|
||||
movq mm3, mm4 ; 23 22 21 20
|
||||
|
||||
punpcklwd mm1, mm0 ; 11 01 10 00
|
||||
punpckhwd mm2, mm0 ; 13 03 12 02
|
||||
|
||||
punpcklwd mm3, mm6 ; 31 21 30 20
|
||||
punpckhwd mm4, mm6 ; 33 23 32 22
|
||||
|
||||
movq mm0, mm1 ; 11 01 10 00
|
||||
movq mm5, mm2 ; 13 03 12 02
|
||||
|
||||
punpckldq mm0, mm3 ; 30 20 10 00
|
||||
punpckhdq mm1, mm3 ; 31 21 11 01
|
||||
|
||||
punpckldq mm2, mm4 ; 32 22 12 02
|
||||
punpckhdq mm5, mm4 ; 33 23 13 03
|
||||
|
||||
movq mm3, mm5 ; 33 23 13 03
|
||||
|
||||
psubw mm0, mm2 ; b1= 0-2
|
||||
paddw mm2, mm2 ;
|
||||
|
||||
movq mm5, mm1
|
||||
paddw mm2, mm0 ; a1 =0+2
|
||||
|
||||
pmulhw mm5, [GLOBAL(x_s1sqr2)] ;
|
||||
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
|
||||
|
||||
movq mm7, mm3 ;
|
||||
pmulhw mm7, [GLOBAL(x_c1sqr2less1)] ;
|
||||
|
||||
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
|
||||
psubw mm7, mm5 ; c1
|
||||
|
||||
movq mm5, mm1
|
||||
movq mm4, mm3
|
||||
|
||||
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
|
||||
paddw mm5, mm1
|
||||
|
||||
pmulhw mm3, [GLOBAL(x_s1sqr2)]
|
||||
paddw mm3, mm4
|
||||
|
||||
paddw mm3, mm5 ; d1
|
||||
paddw mm0, [GLOBAL(fours)]
|
||||
|
||||
paddw mm2, [GLOBAL(fours)]
|
||||
movq mm6, mm2 ; a1
|
||||
|
||||
movq mm4, mm0 ; b1
|
||||
paddw mm2, mm3 ;0
|
||||
|
||||
paddw mm4, mm7 ;1
|
||||
psubw mm0, mm7 ;2
|
||||
|
||||
psubw mm6, mm3 ;3
|
||||
psraw mm2, 3
|
||||
|
||||
psraw mm0, 3
|
||||
psraw mm4, 3
|
||||
|
||||
psraw mm6, 3
|
||||
|
||||
movq mm1, mm2 ; 03 02 01 00
|
||||
movq mm3, mm4 ; 23 22 21 20
|
||||
|
||||
punpcklwd mm1, mm0 ; 11 01 10 00
|
||||
punpckhwd mm2, mm0 ; 13 03 12 02
|
||||
|
||||
punpcklwd mm3, mm6 ; 31 21 30 20
|
||||
punpckhwd mm4, mm6 ; 33 23 32 22
|
||||
|
||||
movq mm0, mm1 ; 11 01 10 00
|
||||
movq mm5, mm2 ; 13 03 12 02
|
||||
|
||||
punpckldq mm0, mm3 ; 30 20 10 00
|
||||
punpckhdq mm1, mm3 ; 31 21 11 01
|
||||
|
||||
punpckldq mm2, mm4 ; 32 22 12 02
|
||||
punpckhdq mm5, mm4 ; 33 23 13 03
|
||||
|
||||
movq [rdx], mm0
|
||||
|
||||
movq [rdx+rax], mm1
|
||||
movq [rdx+rax*2], mm2
|
||||
|
||||
add rdx, rax
|
||||
movq [rdx+rax*2], mm5
|
||||
|
||||
; begin epilog
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
|
||||
global sym(vp8_short_idct4x4llm_1_mmx)
|
||||
sym(vp8_short_idct4x4llm_1_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 3
|
||||
GET_GOT rbx
|
||||
; end prolog
|
||||
|
||||
mov rax, arg(0) ;input
|
||||
movd mm0, [rax]
|
||||
|
||||
paddw mm0, [GLOBAL(fours)]
|
||||
mov rdx, arg(1) ;output
|
||||
|
||||
psraw mm0, 3
|
||||
movsxd rax, dword ptr arg(2) ;pitch
|
||||
|
||||
punpcklwd mm0, mm0
|
||||
punpckldq mm0, mm0
|
||||
|
||||
movq [rdx], mm0
|
||||
movq [rdx+rax], mm0
|
||||
|
||||
movq [rdx+rax*2], mm0
|
||||
add rdx, rax
|
||||
|
||||
movq [rdx+rax*2], mm0
|
||||
|
||||
|
||||
; begin epilog
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
|
||||
global sym(vp8_dc_only_idct_add_mmx)
|
||||
sym(vp8_dc_only_idct_add_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
@ -231,46 +44,171 @@ sym(vp8_dc_only_idct_add_mmx):
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(1) ;s -- prediction
|
||||
mov rdi, arg(2) ;d -- destination
|
||||
movsxd rax, dword ptr arg(4) ;stride
|
||||
movsxd rdx, dword ptr arg(3) ;pitch
|
||||
pxor mm0, mm0
|
||||
mov rax, arg(0) ;input
|
||||
mov rsi, arg(1) ;pred
|
||||
|
||||
movd mm5, arg(0) ;input_dc
|
||||
movq mm0, [rax ]
|
||||
movq mm1, [rax+ 8]
|
||||
movq mm2, [rax+16]
|
||||
movq mm3, [rax+24]
|
||||
|
||||
paddw mm5, [GLOBAL(fours)]
|
||||
%if 0
|
||||
pxor mm7, mm7
|
||||
movq [rax], mm7
|
||||
movq [rax+8], mm7
|
||||
movq [rax+16],mm7
|
||||
movq [rax+24],mm7
|
||||
%endif
|
||||
movsxd rax, dword ptr arg(2) ;pitch
|
||||
mov rdx, arg(3) ;dest
|
||||
movsxd rdi, dword ptr arg(4) ;stride
|
||||
|
||||
psraw mm5, 3
|
||||
|
||||
punpcklwd mm5, mm5
|
||||
punpckldq mm5, mm5
|
||||
psubw mm0, mm2 ; b1= 0-2
|
||||
paddw mm2, mm2 ;
|
||||
|
||||
movd mm1, [rsi]
|
||||
punpcklbw mm1, mm0
|
||||
paddsw mm1, mm5
|
||||
packuswb mm1, mm0 ; pack and unpack to saturate
|
||||
movd [rdi], mm1
|
||||
movq mm5, mm1
|
||||
paddw mm2, mm0 ; a1 =0+2
|
||||
|
||||
movd mm2, [rsi+rdx]
|
||||
punpcklbw mm2, mm0
|
||||
paddsw mm2, mm5
|
||||
packuswb mm2, mm0 ; pack and unpack to saturate
|
||||
movd [rdi+rax], mm2
|
||||
pmulhw mm5, [GLOBAL(x_s1sqr2)];
|
||||
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
|
||||
|
||||
movd mm3, [rsi+2*rdx]
|
||||
punpcklbw mm3, mm0
|
||||
paddsw mm3, mm5
|
||||
packuswb mm3, mm0 ; pack and unpack to saturate
|
||||
movd [rdi+2*rax], mm3
|
||||
movq mm7, mm3 ;
|
||||
pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
|
||||
|
||||
add rdi, rax
|
||||
add rsi, rdx
|
||||
movd mm4, [rsi+2*rdx]
|
||||
punpcklbw mm4, mm0
|
||||
paddsw mm4, mm5
|
||||
packuswb mm4, mm0 ; pack and unpack to saturate
|
||||
movd [rdi+2*rax], mm4
|
||||
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
|
||||
psubw mm7, mm5 ; c1
|
||||
|
||||
movq mm5, mm1
|
||||
movq mm4, mm3
|
||||
|
||||
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
|
||||
paddw mm5, mm1
|
||||
|
||||
pmulhw mm3, [GLOBAL(x_s1sqr2)]
|
||||
paddw mm3, mm4
|
||||
|
||||
paddw mm3, mm5 ; d1
|
||||
movq mm6, mm2 ; a1
|
||||
|
||||
movq mm4, mm0 ; b1
|
||||
paddw mm2, mm3 ;0
|
||||
|
||||
paddw mm4, mm7 ;1
|
||||
psubw mm0, mm7 ;2
|
||||
|
||||
psubw mm6, mm3 ;3
|
||||
|
||||
movq mm1, mm2 ; 03 02 01 00
|
||||
movq mm3, mm4 ; 23 22 21 20
|
||||
|
||||
punpcklwd mm1, mm0 ; 11 01 10 00
|
||||
punpckhwd mm2, mm0 ; 13 03 12 02
|
||||
|
||||
punpcklwd mm3, mm6 ; 31 21 30 20
|
||||
punpckhwd mm4, mm6 ; 33 23 32 22
|
||||
|
||||
movq mm0, mm1 ; 11 01 10 00
|
||||
movq mm5, mm2 ; 13 03 12 02
|
||||
|
||||
punpckldq mm0, mm3 ; 30 20 10 00
|
||||
punpckhdq mm1, mm3 ; 31 21 11 01
|
||||
|
||||
punpckldq mm2, mm4 ; 32 22 12 02
|
||||
punpckhdq mm5, mm4 ; 33 23 13 03
|
||||
|
||||
movq mm3, mm5 ; 33 23 13 03
|
||||
|
||||
psubw mm0, mm2 ; b1= 0-2
|
||||
paddw mm2, mm2 ;
|
||||
|
||||
movq mm5, mm1
|
||||
paddw mm2, mm0 ; a1 =0+2
|
||||
|
||||
pmulhw mm5, [GLOBAL(x_s1sqr2)];
|
||||
paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
|
||||
|
||||
movq mm7, mm3 ;
|
||||
pmulhw mm7, [GLOBAL(x_c1sqr2less1)];
|
||||
|
||||
paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
|
||||
psubw mm7, mm5 ; c1
|
||||
|
||||
movq mm5, mm1
|
||||
movq mm4, mm3
|
||||
|
||||
pmulhw mm5, [GLOBAL(x_c1sqr2less1)]
|
||||
paddw mm5, mm1
|
||||
|
||||
pmulhw mm3, [GLOBAL(x_s1sqr2)]
|
||||
paddw mm3, mm4
|
||||
|
||||
paddw mm3, mm5 ; d1
|
||||
paddw mm0, [GLOBAL(fours)]
|
||||
|
||||
paddw mm2, [GLOBAL(fours)]
|
||||
movq mm6, mm2 ; a1
|
||||
|
||||
movq mm4, mm0 ; b1
|
||||
paddw mm2, mm3 ;0
|
||||
|
||||
paddw mm4, mm7 ;1
|
||||
psubw mm0, mm7 ;2
|
||||
|
||||
psubw mm6, mm3 ;3
|
||||
psraw mm2, 3
|
||||
|
||||
psraw mm0, 3
|
||||
psraw mm4, 3
|
||||
|
||||
psraw mm6, 3
|
||||
|
||||
movq mm1, mm2 ; 03 02 01 00
|
||||
movq mm3, mm4 ; 23 22 21 20
|
||||
|
||||
punpcklwd mm1, mm0 ; 11 01 10 00
|
||||
punpckhwd mm2, mm0 ; 13 03 12 02
|
||||
|
||||
punpcklwd mm3, mm6 ; 31 21 30 20
|
||||
punpckhwd mm4, mm6 ; 33 23 32 22
|
||||
|
||||
movq mm0, mm1 ; 11 01 10 00
|
||||
movq mm5, mm2 ; 13 03 12 02
|
||||
|
||||
punpckldq mm0, mm3 ; 30 20 10 00
|
||||
punpckhdq mm1, mm3 ; 31 21 11 01
|
||||
|
||||
punpckldq mm2, mm4 ; 32 22 12 02
|
||||
punpckhdq mm5, mm4 ; 33 23 13 03
|
||||
|
||||
pxor mm7, mm7
|
||||
|
||||
movd mm4, [rsi]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm0, mm4
|
||||
packuswb mm0, mm7
|
||||
movd [rdx], mm0
|
||||
|
||||
movd mm4, [rsi+rax]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm1, mm4
|
||||
packuswb mm1, mm7
|
||||
movd [rdx+rdi], mm1
|
||||
|
||||
movd mm4, [rsi+2*rax]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm2, mm4
|
||||
packuswb mm2, mm7
|
||||
movd [rdx+rdi*2], mm2
|
||||
|
||||
add rdx, rdi
|
||||
add rsi, rax
|
||||
|
||||
movd mm4, [rsi+2*rax]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm5, mm4
|
||||
packuswb mm5, mm7
|
||||
movd [rdx+rdi*2], mm5
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
@ -280,6 +218,71 @@ sym(vp8_dc_only_idct_add_mmx):
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;void vp8_dc_only_idct_add_mmx(
|
||||
;short input_dc,
|
||||
;unsigned char *pred_ptr,
|
||||
;int pred_stride,
|
||||
;unsigned char *dst_ptr,
|
||||
;int stride)
|
||||
global sym(vp8_dc_only_idct_add_mmx)
|
||||
sym(vp8_dc_only_idct_add_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
GET_GOT rbx
|
||||
; end prolog
|
||||
|
||||
movd mm5, arg(0) ;input_dc
|
||||
mov rax, arg(1) ;pred_ptr
|
||||
movsxd rdx, dword ptr arg(2) ;pred_stride
|
||||
|
||||
pxor mm0, mm0
|
||||
|
||||
paddw mm5, [GLOBAL(fours)]
|
||||
lea rcx, [rdx + rdx*2]
|
||||
|
||||
psraw mm5, 3
|
||||
|
||||
punpcklwd mm5, mm5
|
||||
|
||||
punpckldq mm5, mm5
|
||||
|
||||
movd mm1, [rax]
|
||||
movd mm2, [rax+rdx]
|
||||
movd mm3, [rax+2*rdx]
|
||||
movd mm4, [rax+rcx]
|
||||
|
||||
mov rax, arg(3) ;d -- destination
|
||||
movsxd rdx, dword ptr arg(4) ;dst_stride
|
||||
|
||||
punpcklbw mm1, mm0
|
||||
paddsw mm1, mm5
|
||||
packuswb mm1, mm0 ; pack and unpack to saturate
|
||||
lea rcx, [rdx + rdx*2]
|
||||
|
||||
punpcklbw mm2, mm0
|
||||
paddsw mm2, mm5
|
||||
packuswb mm2, mm0 ; pack and unpack to saturate
|
||||
|
||||
punpcklbw mm3, mm0
|
||||
paddsw mm3, mm5
|
||||
packuswb mm3, mm0 ; pack and unpack to saturate
|
||||
|
||||
punpcklbw mm4, mm0
|
||||
paddsw mm4, mm5
|
||||
packuswb mm4, mm0 ; pack and unpack to saturate
|
||||
|
||||
movd [rax], mm1
|
||||
movd [rax+rdx], mm2
|
||||
movd [rax+2*rdx], mm3
|
||||
movd [rax+rcx], mm4
|
||||
|
||||
; begin epilog
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
SECTION_RODATA
|
||||
align 16
|
||||
x_s1sqr2:
|
||||
|
@ -15,17 +15,15 @@
|
||||
; (
|
||||
; short *qcoeff - 0
|
||||
; short *dequant - 1
|
||||
; unsigned char *pre - 2
|
||||
; unsigned char *dst - 3
|
||||
; int dst_stride - 4
|
||||
; int blk_stride - 5
|
||||
; unsigned char *dst - 2
|
||||
; int dst_stride - 3
|
||||
; )
|
||||
|
||||
global sym(vp8_idct_dequant_0_2x_sse2)
|
||||
sym(vp8_idct_dequant_0_2x_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 6
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
GET_GOT rbx
|
||||
; end prolog
|
||||
|
||||
@ -47,19 +45,20 @@ sym(vp8_idct_dequant_0_2x_sse2):
|
||||
movd [rax], xmm5
|
||||
movd [rax+32], xmm5
|
||||
;pshufb
|
||||
mov rax, arg(2) ; dst
|
||||
movsxd rdx, dword ptr arg(3) ; dst_stride
|
||||
|
||||
pshuflw xmm4, xmm4, 00000000b
|
||||
pshufhw xmm4, xmm4, 00000000b
|
||||
|
||||
mov rax, arg(2) ; pre
|
||||
lea rcx, [rdx + rdx*2]
|
||||
paddw xmm4, [GLOBAL(fours)]
|
||||
|
||||
movsxd rcx, dword ptr arg(5) ; blk_stride
|
||||
psraw xmm4, 3
|
||||
|
||||
movq xmm0, [rax]
|
||||
movq xmm1, [rax+rcx]
|
||||
movq xmm2, [rax+2*rcx]
|
||||
lea rcx, [3*rcx]
|
||||
movq xmm1, [rax+rdx]
|
||||
movq xmm2, [rax+2*rdx]
|
||||
movq xmm3, [rax+rcx]
|
||||
|
||||
punpcklbw xmm0, xmm5
|
||||
@ -67,8 +66,6 @@ sym(vp8_idct_dequant_0_2x_sse2):
|
||||
punpcklbw xmm2, xmm5
|
||||
punpcklbw xmm3, xmm5
|
||||
|
||||
mov rax, arg(3) ; dst
|
||||
movsxd rdx, dword ptr arg(4) ; dst_stride
|
||||
|
||||
; Add to predict buffer
|
||||
paddw xmm0, xmm4
|
||||
@ -97,11 +94,18 @@ sym(vp8_idct_dequant_0_2x_sse2):
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;void vp8_idct_dequant_full_2x_sse2
|
||||
; (
|
||||
; short *qcoeff - 0
|
||||
; short *dequant - 1
|
||||
; unsigned char *dst - 2
|
||||
; int dst_stride - 3
|
||||
; )
|
||||
global sym(vp8_idct_dequant_full_2x_sse2)
|
||||
sym(vp8_idct_dequant_full_2x_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
@ -111,14 +115,13 @@ sym(vp8_idct_dequant_full_2x_sse2):
|
||||
; special case when 2 blocks have 0 or 1 coeffs
|
||||
; dc is set as first coeff, so no need to load qcoeff
|
||||
mov rax, arg(0) ; qcoeff
|
||||
mov rsi, arg(2) ; pre
|
||||
mov rdi, arg(3) ; dst
|
||||
movsxd rcx, dword ptr arg(5) ; blk_stride
|
||||
mov rdx, arg(1) ; dequant
|
||||
mov rdi, arg(2) ; dst
|
||||
|
||||
|
||||
; Zero out xmm7, for use unpacking
|
||||
pxor xmm7, xmm7
|
||||
|
||||
mov rdx, arg(1) ; dequant
|
||||
|
||||
; note the transpose of xmm1 and xmm2, necessary for shuffle
|
||||
; to spit out sensible data
|
||||
@ -138,6 +141,7 @@ sym(vp8_idct_dequant_full_2x_sse2):
|
||||
pmullw xmm2, [rdx+16]
|
||||
pmullw xmm1, [rdx]
|
||||
pmullw xmm3, [rdx+16]
|
||||
movsxd rdx, dword ptr arg(3) ; dst_stride
|
||||
|
||||
; repack so block 0 row x and block 1 row x are together
|
||||
movdqa xmm4, xmm0
|
||||
@ -162,6 +166,7 @@ sym(vp8_idct_dequant_full_2x_sse2):
|
||||
paddw xmm2, xmm0 ; a1 = 0+2
|
||||
|
||||
pmulhw xmm5, [GLOBAL(x_s1sqr2)]
|
||||
lea rcx, [rdx + rdx*2] ;dst_stride * 3
|
||||
paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2)
|
||||
|
||||
movdqa xmm7, xmm3
|
||||
@ -304,8 +309,8 @@ sym(vp8_idct_dequant_full_2x_sse2):
|
||||
pxor xmm7, xmm7
|
||||
|
||||
; Load up predict blocks
|
||||
movq xmm4, [rsi]
|
||||
movq xmm5, [rsi+rcx]
|
||||
movq xmm4, [rdi]
|
||||
movq xmm5, [rdi+rdx]
|
||||
|
||||
punpcklbw xmm4, xmm7
|
||||
punpcklbw xmm5, xmm7
|
||||
@ -313,9 +318,8 @@ sym(vp8_idct_dequant_full_2x_sse2):
|
||||
paddw xmm0, xmm4
|
||||
paddw xmm1, xmm5
|
||||
|
||||
movq xmm4, [rsi+2*rcx]
|
||||
lea rcx, [3*rcx]
|
||||
movq xmm5, [rsi+rcx]
|
||||
movq xmm4, [rdi+2*rdx]
|
||||
movq xmm5, [rdi+rcx]
|
||||
|
||||
punpcklbw xmm4, xmm7
|
||||
punpcklbw xmm5, xmm7
|
||||
@ -331,18 +335,11 @@ sym(vp8_idct_dequant_full_2x_sse2):
|
||||
packuswb xmm2, xmm7
|
||||
packuswb xmm3, xmm7
|
||||
|
||||
; Load destination stride before writing out,
|
||||
; doesn't need to persist
|
||||
movsxd rdx, dword ptr arg(4) ; dst_stride
|
||||
|
||||
; store blocks back out
|
||||
movq [rdi], xmm0
|
||||
movq [rdi + rdx], xmm1
|
||||
|
||||
lea rdi, [rdi + 2*rdx]
|
||||
|
||||
movq [rdi], xmm2
|
||||
movq [rdi + rdx], xmm3
|
||||
movq [rdi + rdx*2], xmm2
|
||||
movq [rdi + rcx], xmm3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
@ -357,27 +354,25 @@ sym(vp8_idct_dequant_full_2x_sse2):
|
||||
; (
|
||||
; short *qcoeff - 0
|
||||
; short *dequant - 1
|
||||
; unsigned char *pre - 2
|
||||
; unsigned char *dst - 3
|
||||
; int dst_stride - 4
|
||||
; short *dc - 5
|
||||
; unsigned char *dst - 2
|
||||
; int dst_stride - 3
|
||||
; short *dc - 4
|
||||
; )
|
||||
global sym(vp8_idct_dequant_dc_0_2x_sse2)
|
||||
sym(vp8_idct_dequant_dc_0_2x_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
; special case when 2 blocks have 0 or 1 coeffs
|
||||
; dc is set as first coeff, so no need to load qcoeff
|
||||
mov rax, arg(0) ; qcoeff
|
||||
mov rsi, arg(2) ; pre
|
||||
mov rdi, arg(3) ; dst
|
||||
mov rdx, arg(5) ; dc
|
||||
|
||||
mov rdi, arg(2) ; dst
|
||||
mov rdx, arg(4) ; dc
|
||||
|
||||
; Zero out xmm5, for use unpacking
|
||||
pxor xmm5, xmm5
|
||||
@ -385,11 +380,13 @@ sym(vp8_idct_dequant_dc_0_2x_sse2):
|
||||
; load up 2 dc words here == 2*16 = doubleword
|
||||
movd xmm4, [rdx]
|
||||
|
||||
movsxd rdx, dword ptr arg(3) ; dst_stride
|
||||
lea rcx, [rdx + rdx*2]
|
||||
; Load up predict blocks
|
||||
movq xmm0, [rsi]
|
||||
movq xmm1, [rsi+16]
|
||||
movq xmm2, [rsi+32]
|
||||
movq xmm3, [rsi+48]
|
||||
movq xmm0, [rdi]
|
||||
movq xmm1, [rdi+rdx*1]
|
||||
movq xmm2, [rdi+rdx*2]
|
||||
movq xmm3, [rdi+rcx]
|
||||
|
||||
; Duplicate and expand dc across
|
||||
punpcklwd xmm4, xmm4
|
||||
@ -417,48 +414,46 @@ sym(vp8_idct_dequant_dc_0_2x_sse2):
|
||||
packuswb xmm2, xmm5
|
||||
packuswb xmm3, xmm5
|
||||
|
||||
; Load destination stride before writing out,
|
||||
; doesn't need to persist
|
||||
movsxd rdx, dword ptr arg(4) ; dst_stride
|
||||
|
||||
; store blocks back out
|
||||
movq [rdi], xmm0
|
||||
movq [rdi + rdx], xmm1
|
||||
|
||||
lea rdi, [rdi + 2*rdx]
|
||||
|
||||
movq [rdi], xmm2
|
||||
movq [rdi + rdx], xmm3
|
||||
movq [rdi + rdx*2], xmm2
|
||||
movq [rdi + rcx], xmm3
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;void vp8_idct_dequant_dc_full_2x_sse2
|
||||
; (
|
||||
; short *qcoeff - 0
|
||||
; short *dequant - 1
|
||||
; unsigned char *dst - 2
|
||||
; int dst_stride - 3
|
||||
; short *dc - 4
|
||||
; )
|
||||
global sym(vp8_idct_dequant_dc_full_2x_sse2)
|
||||
sym(vp8_idct_dequant_dc_full_2x_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
; special case when 2 blocks have 0 or 1 coeffs
|
||||
; dc is set as first coeff, so no need to load qcoeff
|
||||
mov rax, arg(0) ; qcoeff
|
||||
mov rsi, arg(2) ; pre
|
||||
mov rdi, arg(3) ; dst
|
||||
mov rdx, arg(1) ; dequant
|
||||
|
||||
mov rdi, arg(2) ; dst
|
||||
|
||||
; Zero out xmm7, for use unpacking
|
||||
pxor xmm7, xmm7
|
||||
|
||||
mov rdx, arg(1) ; dequant
|
||||
|
||||
; note the transpose of xmm1 and xmm2, necessary for shuffle
|
||||
; to spit out sensible data
|
||||
@ -480,7 +475,7 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
|
||||
pmullw xmm3, [rdx+16]
|
||||
|
||||
; DC component
|
||||
mov rdx, arg(5)
|
||||
mov rdx, arg(4)
|
||||
|
||||
; repack so block 0 row x and block 1 row x are together
|
||||
movdqa xmm4, xmm0
|
||||
@ -651,8 +646,10 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
|
||||
pxor xmm7, xmm7
|
||||
|
||||
; Load up predict blocks
|
||||
movq xmm4, [rsi]
|
||||
movq xmm5, [rsi+16]
|
||||
movsxd rdx, dword ptr arg(3) ; dst_stride
|
||||
movq xmm4, [rdi]
|
||||
movq xmm5, [rdi+rdx]
|
||||
lea rcx, [rdx + rdx*2]
|
||||
|
||||
punpcklbw xmm4, xmm7
|
||||
punpcklbw xmm5, xmm7
|
||||
@ -660,8 +657,8 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
|
||||
paddw xmm0, xmm4
|
||||
paddw xmm1, xmm5
|
||||
|
||||
movq xmm4, [rsi+32]
|
||||
movq xmm5, [rsi+48]
|
||||
movq xmm4, [rdi+rdx*2]
|
||||
movq xmm5, [rdi+rcx]
|
||||
|
||||
punpcklbw xmm4, xmm7
|
||||
punpcklbw xmm5, xmm7
|
||||
@ -679,7 +676,7 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
|
||||
|
||||
; Load destination stride before writing out,
|
||||
; doesn't need to persist
|
||||
movsxd rdx, dword ptr arg(4) ; dst_stride
|
||||
movsxd rdx, dword ptr arg(3) ; dst_stride
|
||||
|
||||
; store blocks back out
|
||||
movq [rdi], xmm0
|
||||
@ -693,7 +690,6 @@ sym(vp8_idct_dequant_dc_full_2x_sse2):
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
|
@ -10,53 +10,6 @@
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
;void vp8_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
|
||||
global sym(vp8_recon_b_mmx)
|
||||
sym(vp8_recon_b_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0) ;s
|
||||
mov rdi, arg(2) ;d
|
||||
mov rdx, arg(1) ;q
|
||||
movsxd rax, dword ptr arg(3) ;stride
|
||||
pxor mm0, mm0
|
||||
|
||||
movd mm1, [rsi]
|
||||
punpcklbw mm1, mm0
|
||||
paddsw mm1, [rdx]
|
||||
packuswb mm1, mm0 ; pack and unpack to saturate
|
||||
movd [rdi], mm1
|
||||
|
||||
movd mm2, [rsi+16]
|
||||
punpcklbw mm2, mm0
|
||||
paddsw mm2, [rdx+32]
|
||||
packuswb mm2, mm0 ; pack and unpack to saturate
|
||||
movd [rdi+rax], mm2
|
||||
|
||||
movd mm3, [rsi+32]
|
||||
punpcklbw mm3, mm0
|
||||
paddsw mm3, [rdx+64]
|
||||
packuswb mm3, mm0 ; pack and unpack to saturate
|
||||
movd [rdi+2*rax], mm3
|
||||
|
||||
add rdi, rax
|
||||
movd mm4, [rsi+48]
|
||||
punpcklbw mm4, mm0
|
||||
paddsw mm4, [rdx+96]
|
||||
packuswb mm4, mm0 ; pack and unpack to saturate
|
||||
movd [rdi+2*rax], mm4
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void copy_mem8x8_mmx(
|
||||
|
@ -10,121 +10,6 @@
|
||||
|
||||
|
||||
%include "vpx_ports/x86_abi_support.asm"
|
||||
;void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
|
||||
global sym(vp8_recon2b_sse2)
|
||||
sym(vp8_recon2b_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0) ;s
|
||||
mov rdi, arg(2) ;d
|
||||
mov rdx, arg(1) ;q
|
||||
movsxd rax, dword ptr arg(3) ;stride
|
||||
pxor xmm0, xmm0
|
||||
|
||||
movq xmm1, MMWORD PTR [rsi]
|
||||
punpcklbw xmm1, xmm0
|
||||
paddsw xmm1, XMMWORD PTR [rdx]
|
||||
packuswb xmm1, xmm0 ; pack and unpack to saturate
|
||||
movq MMWORD PTR [rdi], xmm1
|
||||
|
||||
|
||||
movq xmm2, MMWORD PTR [rsi+8]
|
||||
punpcklbw xmm2, xmm0
|
||||
paddsw xmm2, XMMWORD PTR [rdx+16]
|
||||
packuswb xmm2, xmm0 ; pack and unpack to saturate
|
||||
movq MMWORD PTR [rdi+rax], xmm2
|
||||
|
||||
|
||||
movq xmm3, MMWORD PTR [rsi+16]
|
||||
punpcklbw xmm3, xmm0
|
||||
paddsw xmm3, XMMWORD PTR [rdx+32]
|
||||
packuswb xmm3, xmm0 ; pack and unpack to saturate
|
||||
movq MMWORD PTR [rdi+rax*2], xmm3
|
||||
|
||||
add rdi, rax
|
||||
movq xmm4, MMWORD PTR [rsi+24]
|
||||
punpcklbw xmm4, xmm0
|
||||
paddsw xmm4, XMMWORD PTR [rdx+48]
|
||||
packuswb xmm4, xmm0 ; pack and unpack to saturate
|
||||
movq MMWORD PTR [rdi+rax*2], xmm4
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
|
||||
global sym(vp8_recon4b_sse2)
|
||||
sym(vp8_recon4b_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SAVE_XMM 7
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(0) ;s
|
||||
mov rdi, arg(2) ;d
|
||||
mov rdx, arg(1) ;q
|
||||
movsxd rax, dword ptr arg(3) ;stride
|
||||
pxor xmm0, xmm0
|
||||
|
||||
movdqa xmm1, XMMWORD PTR [rsi]
|
||||
movdqa xmm5, xmm1
|
||||
punpcklbw xmm1, xmm0
|
||||
punpckhbw xmm5, xmm0
|
||||
paddsw xmm1, XMMWORD PTR [rdx]
|
||||
paddsw xmm5, XMMWORD PTR [rdx+16]
|
||||
packuswb xmm1, xmm5 ; pack and unpack to saturate
|
||||
movdqa XMMWORD PTR [rdi], xmm1
|
||||
|
||||
|
||||
movdqa xmm2, XMMWORD PTR [rsi+16]
|
||||
movdqa xmm6, xmm2
|
||||
punpcklbw xmm2, xmm0
|
||||
punpckhbw xmm6, xmm0
|
||||
paddsw xmm2, XMMWORD PTR [rdx+32]
|
||||
paddsw xmm6, XMMWORD PTR [rdx+48]
|
||||
packuswb xmm2, xmm6 ; pack and unpack to saturate
|
||||
movdqa XMMWORD PTR [rdi+rax], xmm2
|
||||
|
||||
|
||||
movdqa xmm3, XMMWORD PTR [rsi+32]
|
||||
movdqa xmm7, xmm3
|
||||
punpcklbw xmm3, xmm0
|
||||
punpckhbw xmm7, xmm0
|
||||
paddsw xmm3, XMMWORD PTR [rdx+64]
|
||||
paddsw xmm7, XMMWORD PTR [rdx+80]
|
||||
packuswb xmm3, xmm7 ; pack and unpack to saturate
|
||||
movdqa XMMWORD PTR [rdi+rax*2], xmm3
|
||||
|
||||
add rdi, rax
|
||||
movdqa xmm4, XMMWORD PTR [rsi+48]
|
||||
movdqa xmm5, xmm4
|
||||
punpcklbw xmm4, xmm0
|
||||
punpckhbw xmm5, xmm0
|
||||
paddsw xmm4, XMMWORD PTR [rdx+96]
|
||||
paddsw xmm5, XMMWORD PTR [rdx+112]
|
||||
packuswb xmm4, xmm5 ; pack and unpack to saturate
|
||||
movdqa XMMWORD PTR [rdi+rax*2], xmm4
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void copy_mem16x16_sse2(
|
||||
; unsigned char *src,
|
||||
|
@ -20,16 +20,12 @@
|
||||
*/
|
||||
|
||||
#if HAVE_MMX
|
||||
extern prototype_recon_block(vp8_recon_b_mmx);
|
||||
extern prototype_copy_block(vp8_copy_mem8x8_mmx);
|
||||
extern prototype_copy_block(vp8_copy_mem8x4_mmx);
|
||||
extern prototype_copy_block(vp8_copy_mem16x16_mmx);
|
||||
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_recon_recon
|
||||
#define vp8_recon_recon vp8_recon_b_mmx
|
||||
|
||||
#undef vp8_recon_copy8x8
|
||||
#define vp8_recon_copy8x8 vp8_copy_mem8x8_mmx
|
||||
|
||||
@ -43,19 +39,11 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx);
|
||||
#endif
|
||||
|
||||
#if HAVE_SSE2
|
||||
extern prototype_recon_block(vp8_recon2b_sse2);
|
||||
extern prototype_recon_block(vp8_recon4b_sse2);
|
||||
extern prototype_copy_block(vp8_copy_mem16x16_sse2);
|
||||
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2);
|
||||
extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2);
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_recon_recon2
|
||||
#define vp8_recon_recon2 vp8_recon2b_sse2
|
||||
|
||||
#undef vp8_recon_recon4
|
||||
#define vp8_recon_recon4 vp8_recon4b_sse2
|
||||
|
||||
#undef vp8_recon_copy16x16
|
||||
#define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2
|
||||
|
||||
|
@ -37,7 +37,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
|
||||
|
||||
if (flags & HAS_MMX)
|
||||
{
|
||||
rtcd->idct.idct1 = vp8_short_idct4x4llm_1_mmx;
|
||||
rtcd->idct.idct16 = vp8_short_idct4x4llm_mmx;
|
||||
rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_mmx;
|
||||
rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_mmx;
|
||||
@ -45,7 +44,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
|
||||
|
||||
|
||||
|
||||
rtcd->recon.recon = vp8_recon_b_mmx;
|
||||
rtcd->recon.copy8x8 = vp8_copy_mem8x8_mmx;
|
||||
rtcd->recon.copy8x4 = vp8_copy_mem8x4_mmx;
|
||||
rtcd->recon.copy16x16 = vp8_copy_mem16x16_mmx;
|
||||
@ -81,8 +79,6 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx)
|
||||
|
||||
if (flags & HAS_SSE2)
|
||||
{
|
||||
rtcd->recon.recon2 = vp8_recon2b_sse2;
|
||||
rtcd->recon.recon4 = vp8_recon4b_sse2;
|
||||
rtcd->recon.copy16x16 = vp8_copy_mem16x16_sse2;
|
||||
rtcd->recon.build_intra_predictors_mbuv =
|
||||
vp8_build_intra_predictors_mbuv_sse2;
|
||||
|
@ -12,21 +12,19 @@
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
;void vp8_dequant_dc_idct_v6(short *input, short *dq, unsigned char *pred,
|
||||
; unsigned char *dest, int pitch, int stride, int Dc)
|
||||
;void vp8_dequant_dc_idct_add_v6(short *input, short *dq,
|
||||
; unsigned char *dest, int stride, int Dc)
|
||||
; r0 = input
|
||||
; r1 = dq
|
||||
; r2 = pred
|
||||
; r3 = dest
|
||||
; sp + 36 = pitch ; +4 = 40
|
||||
; sp + 40 = stride ; +4 = 44
|
||||
; sp + 44 = Dc ; +4 = 48
|
||||
; r2 = dst
|
||||
; r3 = stride
|
||||
; sp + 36 = Dc
|
||||
|
||||
|
||||
|vp8_dequant_dc_idct_add_v6| PROC
|
||||
stmdb sp!, {r4-r11, lr}
|
||||
|
||||
ldr r6, [sp, #44]
|
||||
ldr r6, [sp, #36]
|
||||
|
||||
ldr r4, [r0] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
@ -149,7 +147,7 @@ vp8_dequant_dc_idct_loop2_v6
|
||||
usub16 r1, r12, r8
|
||||
uadd16 r8, r11, r6
|
||||
ldr r9, c0x00040004
|
||||
ldr r12, [sp, #40]
|
||||
ldr r12, [sp] ; get stride from stack
|
||||
uadd16 r6, r10, r8
|
||||
usub16 r7, r10, r8
|
||||
uadd16 r7, r7, r9
|
||||
@ -158,7 +156,7 @@ vp8_dequant_dc_idct_loop2_v6
|
||||
usub16 r1, r14, r1
|
||||
uadd16 r10, r10, r9
|
||||
uadd16 r1, r1, r9
|
||||
ldr r11, [r2], r12
|
||||
ldr r11, [r2] ; load input from dst
|
||||
mov r8, r7, asr #3
|
||||
pkhtb r9, r8, r10, asr #19
|
||||
mov r8, r1, asr #3
|
||||
@ -170,9 +168,7 @@ vp8_dequant_dc_idct_loop2_v6
|
||||
usat16 r9, #8, r9
|
||||
usat16 r8, #8, r8
|
||||
orr r9, r8, r9, lsl #8
|
||||
ldr r11, [r2], r12
|
||||
ldr lr, [sp]
|
||||
ldr r12, [sp, #44]
|
||||
ldr r11, [r2, r12] ; load input from dst
|
||||
mov r7, r7, lsl #16
|
||||
mov r1, r1, lsl #16
|
||||
mov r10, r10, lsl #16
|
||||
@ -188,9 +184,8 @@ vp8_dequant_dc_idct_loop2_v6
|
||||
usat16 r7, #8, r7
|
||||
usat16 r1, #8, r1
|
||||
orr r1, r1, r7, lsl #8
|
||||
str r9, [lr], r12
|
||||
str r1, [lr], r12
|
||||
str lr, [sp]
|
||||
str r9, [r2], r12 ; store output to dst
|
||||
str r1, [r2], r12 ; store output to dst
|
||||
bne vp8_dequant_dc_idct_loop2_v6
|
||||
|
||||
; vpx_memset
|
||||
|
@ -10,15 +10,12 @@
|
||||
EXPORT |vp8_dequant_idct_add_v6|
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
;void vp8_dequant_idct_v6(short *input, short *dq, unsigned char *pred,
|
||||
; unsigned char *dest, int pitch, int stride)
|
||||
; r0 = input
|
||||
;void vp8_dequant_idct_add_v6(short *input, short *dq,
|
||||
; unsigned char *dest, int stride)
|
||||
; r0 = input
|
||||
; r1 = dq
|
||||
; r2 = pred
|
||||
; r3 = dest
|
||||
; sp + 36 = pitch ; +4 = 40
|
||||
; sp + 40 = stride ; +4 = 44
|
||||
|
||||
; r2 = dst
|
||||
; r3 = stride
|
||||
|
||||
|vp8_dequant_idct_add_v6| PROC
|
||||
stmdb sp!, {r4-r11, lr}
|
||||
@ -127,7 +124,7 @@ vp8_dequant_idct_loop2_v6
|
||||
usub16 r1, r12, r8
|
||||
uadd16 r8, r11, r6
|
||||
ldr r9, c0x00040004
|
||||
ldr r12, [sp, #40]
|
||||
ldr r12, [sp] ; get stride from stack
|
||||
uadd16 r6, r10, r8
|
||||
usub16 r7, r10, r8
|
||||
uadd16 r7, r7, r9
|
||||
@ -136,7 +133,7 @@ vp8_dequant_idct_loop2_v6
|
||||
usub16 r1, r14, r1
|
||||
uadd16 r10, r10, r9
|
||||
uadd16 r1, r1, r9
|
||||
ldr r11, [r2], r12
|
||||
ldr r11, [r2] ; load input from dst
|
||||
mov r8, r7, asr #3
|
||||
pkhtb r9, r8, r10, asr #19
|
||||
mov r8, r1, asr #3
|
||||
@ -148,9 +145,7 @@ vp8_dequant_idct_loop2_v6
|
||||
usat16 r9, #8, r9
|
||||
usat16 r8, #8, r8
|
||||
orr r9, r8, r9, lsl #8
|
||||
ldr r11, [r2], r12
|
||||
ldr lr, [sp]
|
||||
ldr r12, [sp, #44]
|
||||
ldr r11, [r2, r12] ; load input from dst
|
||||
mov r7, r7, lsl #16
|
||||
mov r1, r1, lsl #16
|
||||
mov r10, r10, lsl #16
|
||||
@ -166,9 +161,8 @@ vp8_dequant_idct_loop2_v6
|
||||
usat16 r7, #8, r7
|
||||
usat16 r1, #8, r1
|
||||
orr r1, r1, r7, lsl #8
|
||||
str r9, [lr], r12
|
||||
str r1, [lr], r12
|
||||
str lr, [sp]
|
||||
str r9, [r2], r12 ; store output to dst
|
||||
str r1, [r2], r12 ; store output to dst
|
||||
bne vp8_dequant_idct_loop2_v6
|
||||
|
||||
; vpx_memset
|
||||
|
@ -12,115 +12,121 @@
|
||||
#include "vp8/common/idct.h"
|
||||
#include "vp8/decoder/dequantize.h"
|
||||
|
||||
void vp8_dequant_dc_idct_add_y_block_v6
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
unsigned char *dst, int stride, char *eobs, short *dc)
|
||||
|
||||
void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq,
|
||||
unsigned char *dst, int stride,
|
||||
char *eobs, short *dc)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_dc_idct_add_v6 (q, dq, pre, dst, 16, stride, dc[0]);
|
||||
else
|
||||
vp8_dc_only_idct_add_v6 (dc[0], pre, dst, 16, stride);
|
||||
vp8_dequant_dc_idct_add_v6 (q, dq, dst, stride, dc[0]);
|
||||
else if (eobs[0] == 1)
|
||||
vp8_dc_only_idct_add_v6 (dc[0], dst, stride, dst, stride);
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_dc_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
|
||||
else
|
||||
vp8_dc_only_idct_add_v6 (dc[1], pre+4, dst+4, 16, stride);
|
||||
{
|
||||
vp8_dequant_dc_idct_add_v6 (q+16, dq, dst+4, stride, dc[1]);
|
||||
}
|
||||
else if (eobs[1] == 1)
|
||||
vp8_dc_only_idct_add_v6 (dc[1], dst+4, stride, dst+4, stride);
|
||||
|
||||
if (eobs[2] > 1)
|
||||
vp8_dequant_dc_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
|
||||
else
|
||||
vp8_dc_only_idct_add_v6 (dc[2], pre+8, dst+8, 16, stride);
|
||||
{
|
||||
vp8_dequant_dc_idct_add_v6 (q+32, dq, dst+8, stride, dc[2]);
|
||||
}
|
||||
else if (eobs[2] == 1)
|
||||
vp8_dc_only_idct_add_v6 (dc[2], dst+8, stride, dst+8, stride);
|
||||
|
||||
if (eobs[3] > 1)
|
||||
vp8_dequant_dc_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
|
||||
else
|
||||
vp8_dc_only_idct_add_v6 (dc[3], pre+12, dst+12, 16, stride);
|
||||
{
|
||||
vp8_dequant_dc_idct_add_v6 (q+48, dq, dst+12, stride, dc[3]);
|
||||
}
|
||||
else if (eobs[3] == 1)
|
||||
vp8_dc_only_idct_add_v6 (dc[3], dst+12, stride, dst+12, stride);
|
||||
|
||||
q += 64;
|
||||
dc += 4;
|
||||
pre += 64;
|
||||
dst += 4*stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_v6
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
unsigned char *dst, int stride, char *eobs)
|
||||
void vp8_dequant_idct_add_y_block_v6(short *q, short *dq,
|
||||
unsigned char *dst,
|
||||
int stride, char *eobs)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_idct_add_v6 (q, dq, pre, dst, 16, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_v6 (q, dq, dst, stride);
|
||||
else if (eobs[0] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dst, 16, stride);
|
||||
vp8_dc_only_idct_add_v6 (q[0]*dq[0], dst, stride, dst, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dst+4, 16, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_v6 (q+16, dq, dst+4, stride);
|
||||
else if (eobs[1] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dst+4, 16, stride);
|
||||
vp8_dc_only_idct_add_v6 (q[16]*dq[0], dst+4, stride, dst+4, stride);
|
||||
((int *)(q+16))[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[2] > 1)
|
||||
vp8_dequant_idct_add_v6 (q+32, dq, pre+8, dst+8, 16, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_v6 (q+32, dq, dst+8, stride);
|
||||
else if (eobs[2] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_v6 (q[32]*dq[0], pre+8, dst+8, 16, stride);
|
||||
vp8_dc_only_idct_add_v6 (q[32]*dq[0], dst+8, stride, dst+8, stride);
|
||||
((int *)(q+32))[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[3] > 1)
|
||||
vp8_dequant_idct_add_v6 (q+48, dq, pre+12, dst+12, 16, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_v6 (q+48, dq, dst+12, stride);
|
||||
else if (eobs[3] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_v6 (q[48]*dq[0], pre+12, dst+12, 16, stride);
|
||||
vp8_dc_only_idct_add_v6 (q[48]*dq[0], dst+12, stride,dst+12,stride);
|
||||
((int *)(q+48))[0] = 0;
|
||||
}
|
||||
|
||||
q += 64;
|
||||
pre += 64;
|
||||
dst += 4*stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_uv_block_v6
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
|
||||
void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq,
|
||||
unsigned char *dstu,
|
||||
unsigned char *dstv,
|
||||
int stride, char *eobs)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 2; i++)
|
||||
{
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_idct_add_v6 (q, dq, pre, dstu, 8, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_v6 (q, dq, dstu, stride);
|
||||
else if (eobs[0] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstu, 8, stride);
|
||||
vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstu, stride, dstu, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstu+4, 8, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_v6 (q+16, dq, dstu+4, stride);
|
||||
else if (eobs[1] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstu+4, 8, stride);
|
||||
vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstu+4, stride,
|
||||
dstu+4, stride);
|
||||
((int *)(q+16))[0] = 0;
|
||||
}
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstu += 4*stride;
|
||||
eobs += 2;
|
||||
}
|
||||
@ -128,23 +134,23 @@ void vp8_dequant_idct_add_uv_block_v6
|
||||
for (i = 0; i < 2; i++)
|
||||
{
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_idct_add_v6 (q, dq, pre, dstv, 8, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_v6 (q, dq, dstv, stride);
|
||||
else if (eobs[0] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_v6 (q[0]*dq[0], pre, dstv, 8, stride);
|
||||
vp8_dc_only_idct_add_v6 (q[0]*dq[0], dstv, stride, dstv, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_idct_add_v6 (q+16, dq, pre+4, dstv+4, 8, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_v6 (q+16, dq, dstv+4, stride);
|
||||
else if (eobs[1] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_v6 (q[16]*dq[0], pre+4, dstv+4, 8, stride);
|
||||
vp8_dc_only_idct_add_v6 (q[16]*dq[0], dstv+4, stride,
|
||||
dstv+4, stride);
|
||||
((int *)(q+16))[0] = 0;
|
||||
}
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstv += 4*stride;
|
||||
eobs += 2;
|
||||
}
|
||||
|
@ -49,6 +49,7 @@ extern prototype_dequant_dc_idct_add_y_block(vp8_dequant_dc_idct_add_y_block_neo
|
||||
extern prototype_dequant_idct_add_y_block(vp8_dequant_idct_add_y_block_neon);
|
||||
extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
|
||||
|
||||
|
||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||
#undef vp8_dequant_block
|
||||
#define vp8_dequant_block vp8_dequantize_b_neon
|
||||
@ -68,6 +69,7 @@ extern prototype_dequant_idct_add_uv_block(vp8_dequant_idct_add_uv_block_neon);
|
||||
#undef vp8_dequant_idct_add_uv_block
|
||||
#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
@ -15,25 +15,24 @@
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void vp8_dequant_idct_neon(short *input, short *dq, unsigned char *pred,
|
||||
; unsigned char *dest, int pitch, int stride)
|
||||
;void vp8_dequant_idct_add_neon(short *input, short *dq,
|
||||
; unsigned char *dest, int stride)
|
||||
; r0 short *input,
|
||||
; r1 short *dq,
|
||||
; r2 unsigned char *pred
|
||||
; r3 unsigned char *dest
|
||||
; sp int pitch
|
||||
; sp+4 int stride
|
||||
; r2 unsigned char *dest
|
||||
; r3 int stride
|
||||
|
||||
|vp8_dequant_idct_add_neon| PROC
|
||||
vld1.16 {q3, q4}, [r0]
|
||||
vld1.16 {q5, q6}, [r1]
|
||||
ldr r1, [sp] ; pitch
|
||||
vld1.32 {d14[0]}, [r2], r1
|
||||
vld1.32 {d14[1]}, [r2], r1
|
||||
vld1.32 {d15[0]}, [r2], r1
|
||||
vld1.32 {d15[1]}, [r2]
|
||||
|
||||
ldr r1, [sp, #4] ; stride
|
||||
add r1, r2, r3 ; r1 = dest + stride
|
||||
lsl r3, #1 ; 2x stride
|
||||
|
||||
vld1.32 {d14[0]}, [r2], r3
|
||||
vld1.32 {d14[1]}, [r1], r3
|
||||
vld1.32 {d15[0]}, [r2]
|
||||
vld1.32 {d15[1]}, [r1]
|
||||
|
||||
adr r12, cospi8sqrt2minus1 ; pointer to the first constant
|
||||
|
||||
@ -110,13 +109,16 @@
|
||||
vaddw.u8 q1, q1, d14
|
||||
vaddw.u8 q2, q2, d15
|
||||
|
||||
sub r2, r2, r3
|
||||
sub r1, r1, r3
|
||||
|
||||
vqmovun.s16 d0, q1
|
||||
vqmovun.s16 d1, q2
|
||||
|
||||
vst1.32 {d0[0]}, [r3], r1
|
||||
vst1.32 {d0[1]}, [r3], r1
|
||||
vst1.32 {d1[0]}, [r3], r1
|
||||
vst1.32 {d1[1]}, [r3]
|
||||
vst1.32 {d0[0]}, [r2], r3
|
||||
vst1.32 {d0[1]}, [r1], r3
|
||||
vst1.32 {d1[0]}, [r2]
|
||||
vst1.32 {d1[1]}, [r1]
|
||||
|
||||
bx lr
|
||||
|
||||
|
@ -15,101 +15,118 @@
|
||||
/* place these declarations here because we don't want to maintain them
|
||||
* outside of this scope
|
||||
*/
|
||||
void idct_dequant_dc_full_2x_neon
|
||||
(short *input, short *dq, unsigned char *pre, unsigned char *dst,
|
||||
int stride, short *dc);
|
||||
void idct_dequant_dc_0_2x_neon
|
||||
(short *dc, unsigned char *pre, unsigned char *dst, int stride);
|
||||
void idct_dequant_full_2x_neon
|
||||
(short *q, short *dq, unsigned char *pre, unsigned char *dst,
|
||||
int pitch, int stride);
|
||||
void idct_dequant_0_2x_neon
|
||||
(short *q, short dq, unsigned char *pre, int pitch,
|
||||
unsigned char *dst, int stride);
|
||||
void idct_dequant_dc_full_2x_neon(short *input, short *dq,
|
||||
unsigned char *dst,
|
||||
int stride, short *dc);
|
||||
void idct_dequant_dc_0_2x_neon(short *input, short *dq,
|
||||
unsigned char *dst,
|
||||
int stride, short *dc);
|
||||
void idct_dequant_full_2x_neon(short *q, short *dq,
|
||||
unsigned char *dst, int stride);
|
||||
void idct_dequant_0_2x_neon(short *q, short dq,
|
||||
unsigned char *dst, int stride);
|
||||
|
||||
void vp8_dequant_dc_idct_add_y_block_neon
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
unsigned char *dst, int stride, char *eobs, short *dc)
|
||||
void vp8_dequant_dc_idct_add_y_block_neon(short *q, short *dq,
|
||||
unsigned char *dst,
|
||||
int stride, char *eobs, short *dc)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (((short *)eobs)[0] & 0xfefe)
|
||||
idct_dequant_dc_full_2x_neon (q, dq, pre, dst, stride, dc);
|
||||
else
|
||||
idct_dequant_dc_0_2x_neon(dc, pre, dst, stride);
|
||||
|
||||
if (((short *)eobs)[1] & 0xfefe)
|
||||
idct_dequant_dc_full_2x_neon (q+32, dq, pre+8, dst+8, stride, dc+2);
|
||||
else
|
||||
idct_dequant_dc_0_2x_neon(dc+2, pre+8, dst+8, stride);
|
||||
if (((short *)(eobs))[0])
|
||||
{
|
||||
if (((short *)eobs)[0] & 0xfefe)
|
||||
idct_dequant_dc_full_2x_neon (q, dq, dst, stride, dc);
|
||||
else
|
||||
idct_dequant_dc_0_2x_neon(q, dq, dst, stride, dc);
|
||||
}
|
||||
|
||||
if (((short *)(eobs))[1])
|
||||
{
|
||||
if (((short *)eobs)[1] & 0xfefe)
|
||||
idct_dequant_dc_full_2x_neon (q+32, dq, dst+8, stride, dc+2);
|
||||
else
|
||||
idct_dequant_dc_0_2x_neon(q+32, dq, dst+8, stride, dc+2);
|
||||
}
|
||||
q += 64;
|
||||
dc += 4;
|
||||
pre += 64;
|
||||
dst += 4*stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_neon
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
unsigned char *dst, int stride, char *eobs)
|
||||
void vp8_dequant_idct_add_y_block_neon(short *q, short *dq,
|
||||
unsigned char *dst,
|
||||
int stride, char *eobs)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (((short *)eobs)[0] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q, dq, pre, dst, 16, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q, dq[0], pre, 16, dst, stride);
|
||||
|
||||
if (((short *)eobs)[1] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q+32, dq, pre+8, dst+8, 16, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q+32, dq[0], pre+8, 16, dst+8, stride);
|
||||
if (((short *)(eobs))[0])
|
||||
{
|
||||
if (((short *)eobs)[0] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q, dq, dst, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q, dq[0], dst, stride);
|
||||
}
|
||||
|
||||
if (((short *)(eobs))[1])
|
||||
{
|
||||
if (((short *)eobs)[1] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q+32, dq, dst+8, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q+32, dq[0], dst+8, stride);
|
||||
}
|
||||
q += 64;
|
||||
pre += 64;
|
||||
dst += 4*stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_uv_block_neon
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
|
||||
void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq,
|
||||
unsigned char *dstu,
|
||||
unsigned char *dstv,
|
||||
int stride, char *eobs)
|
||||
{
|
||||
if (((short *)eobs)[0] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
|
||||
if (((short *)(eobs))[0])
|
||||
{
|
||||
if (((short *)eobs)[0] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q, dq, dstu, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
|
||||
}
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstu += 4*stride;
|
||||
|
||||
if (((short *)eobs)[1] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q, dq, pre, dstu, 8, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstu, stride);
|
||||
if (((short *)(eobs))[1])
|
||||
{
|
||||
if (((short *)eobs)[1] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q, dq, dstu, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q, dq[0], dstu, stride);
|
||||
}
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
|
||||
if (((short *)eobs)[2] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
|
||||
if (((short *)(eobs))[2])
|
||||
{
|
||||
if (((short *)eobs)[2] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q, dq, dstv, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
|
||||
}
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstv += 4*stride;
|
||||
|
||||
if (((short *)eobs)[3] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q, dq, pre, dstv, 8, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q, dq[0], pre, 8, dstv, stride);
|
||||
if (((short *)(eobs))[3])
|
||||
{
|
||||
if (((short *)eobs)[3] & 0xfefe)
|
||||
idct_dequant_full_2x_neon (q, dq, dstv, stride);
|
||||
else
|
||||
idct_dequant_0_2x_neon (q, dq[0], dstv, stride);
|
||||
}
|
||||
}
|
||||
|
@ -14,38 +14,38 @@
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *pre,
|
||||
; int pitch, unsigned char *dst, int stride);
|
||||
;void idct_dequant_0_2x_neon(short *q, short dq,
|
||||
; unsigned char *dst, int stride);
|
||||
; r0 *q
|
||||
; r1 dq
|
||||
; r2 *pre
|
||||
; r3 pitch
|
||||
; sp *dst
|
||||
; sp+4 stride
|
||||
; r2 *dst
|
||||
; r3 stride
|
||||
|idct_dequant_0_2x_neon| PROC
|
||||
push {r4, r5}
|
||||
|
||||
add r12, r2, #4
|
||||
vld1.32 {d2[0]}, [r2], r3
|
||||
vld1.32 {d2[1]}, [r2], r3
|
||||
vld1.32 {d4[0]}, [r2], r3
|
||||
vld1.32 {d4[1]}, [r2]
|
||||
vld1.32 {d8[0]}, [r12], r3
|
||||
vld1.32 {d2[1]}, [r2], r3
|
||||
vld1.32 {d8[1]}, [r12], r3
|
||||
vld1.32 {d4[0]}, [r2], r3
|
||||
vld1.32 {d10[0]}, [r12], r3
|
||||
vld1.32 {d10[1]}, [r12]
|
||||
vld1.32 {d4[1]}, [r2], r3
|
||||
vld1.32 {d10[1]}, [r12], r3
|
||||
|
||||
ldrh r12, [r0] ; lo q
|
||||
ldrh r2, [r0, #32] ; hi q
|
||||
mov r3, #0
|
||||
strh r3, [r0]
|
||||
strh r3, [r0, #32]
|
||||
ldrh r4, [r0, #32] ; hi q
|
||||
mov r5, #0
|
||||
strh r5, [r0]
|
||||
strh r5, [r0, #32]
|
||||
|
||||
sxth r12, r12 ; lo
|
||||
mul r0, r12, r1
|
||||
add r0, r0, #4
|
||||
asr r0, r0, #3
|
||||
vdup.16 q0, r0
|
||||
sxth r2, r2 ; hi
|
||||
mul r0, r2, r1
|
||||
sxth r4, r4 ; hi
|
||||
mul r0, r4, r1
|
||||
add r0, r0, #4
|
||||
asr r0, r0, #3
|
||||
vdup.16 q3, r0
|
||||
@ -55,25 +55,25 @@
|
||||
vaddw.u8 q4, q3, d8 ; hi
|
||||
vaddw.u8 q5, q3, d10
|
||||
|
||||
ldr r2, [sp] ; dst
|
||||
ldr r3, [sp, #4] ; stride
|
||||
sub r2, r2, r3, lsl #2 ; dst - 4*stride
|
||||
add r0, r2, #4
|
||||
|
||||
vqmovun.s16 d2, q1 ; lo
|
||||
vqmovun.s16 d4, q2
|
||||
vqmovun.s16 d8, q4 ; hi
|
||||
vqmovun.s16 d10, q5
|
||||
|
||||
add r0, r2, #4
|
||||
vst1.32 {d2[0]}, [r2], r3 ; lo
|
||||
vst1.32 {d2[1]}, [r2], r3
|
||||
vst1.32 {d4[0]}, [r2], r3
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
vst1.32 {d8[0]}, [r0], r3 ; hi
|
||||
vst1.32 {d2[1]}, [r2], r3
|
||||
vst1.32 {d8[1]}, [r0], r3
|
||||
vst1.32 {d4[0]}, [r2], r3
|
||||
vst1.32 {d10[0]}, [r0], r3
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
vst1.32 {d10[1]}, [r0]
|
||||
|
||||
bx lr
|
||||
pop {r4, r5}
|
||||
bx lr
|
||||
|
||||
ENDP ; |idct_dequant_0_2x_neon|
|
||||
ENDP ; |idct_dequant_0_2x_neon|
|
||||
END
|
||||
|
@ -14,25 +14,29 @@
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void idct_dequant_dc_0_2x_neon(short *dc, unsigned char *pre,
|
||||
; unsigned char *dst, int stride);
|
||||
; r0 *dc
|
||||
; r1 *pre
|
||||
; r2 *dst
|
||||
; r3 stride
|
||||
|idct_dequant_dc_0_2x_neon| PROC
|
||||
ldr r0, [r0] ; *dc
|
||||
mov r12, #16
|
||||
|
||||
vld1.32 {d2[0]}, [r1], r12 ; lo
|
||||
vld1.32 {d2[1]}, [r1], r12
|
||||
vld1.32 {d4[0]}, [r1], r12
|
||||
vld1.32 {d4[1]}, [r1]
|
||||
sub r1, r1, #44
|
||||
vld1.32 {d8[0]}, [r1], r12 ; hi
|
||||
vld1.32 {d8[1]}, [r1], r12
|
||||
vld1.32 {d10[0]}, [r1], r12
|
||||
vld1.32 {d10[1]}, [r1]
|
||||
;void idct_dequant_dc_0_2x_neon(short *q, short *dq,
|
||||
; unsigned char *dst, int stride, short *dc);
|
||||
; r0 *q,
|
||||
; r1 *dq,
|
||||
; r2 *dst
|
||||
; r3 stride
|
||||
; sp *dc
|
||||
|idct_dequant_dc_0_2x_neon| PROC
|
||||
|
||||
; no q- or dq-coeffs, so r0 and r1 are free to use
|
||||
ldr r1, [sp] ; *dc
|
||||
add r12, r2, #4
|
||||
ldr r0, [r1]
|
||||
|
||||
vld1.32 {d2[0]}, [r2], r3 ; lo
|
||||
vld1.32 {d8[0]}, [r12], r3 ; hi
|
||||
vld1.32 {d2[1]}, [r2], r3
|
||||
vld1.32 {d8[1]}, [r12], r3
|
||||
vld1.32 {d4[0]}, [r2], r3
|
||||
vld1.32 {d10[0]}, [r12], r3
|
||||
vld1.32 {d4[1]}, [r2], r3
|
||||
vld1.32 {d10[1]}, [r12]
|
||||
|
||||
sxth r1, r0 ; lo *dc
|
||||
add r1, r1, #4
|
||||
@ -53,14 +57,16 @@
|
||||
vqmovun.s16 d8, q4 ; hi
|
||||
vqmovun.s16 d10, q5
|
||||
|
||||
sub r2, r2, r3, lsl #2 ; dst - 4*stride
|
||||
add r0, r2, #4
|
||||
|
||||
vst1.32 {d2[0]}, [r2], r3 ; lo
|
||||
vst1.32 {d2[1]}, [r2], r3
|
||||
vst1.32 {d4[0]}, [r2], r3
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
vst1.32 {d8[0]}, [r0], r3 ; hi
|
||||
vst1.32 {d2[1]}, [r2], r3
|
||||
vst1.32 {d8[1]}, [r0], r3
|
||||
vst1.32 {d4[0]}, [r2], r3
|
||||
vst1.32 {d10[0]}, [r0], r3
|
||||
vst1.32 {d4[1]}, [r2]
|
||||
vst1.32 {d10[1]}, [r0]
|
||||
|
||||
bx lr
|
||||
|
@ -15,33 +15,34 @@
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void idct_dequant_dc_full_2x_neon(short *q, short *dq, unsigned char *pre,
|
||||
;void idct_dequant_dc_full_2x_neon(short *q, short *dq,
|
||||
; unsigned char *dst, int stride, short *dc);
|
||||
; r0 *q,
|
||||
; r1 *dq,
|
||||
; r2 *pre
|
||||
; r3 *dst
|
||||
; sp stride
|
||||
; sp+4 *dc
|
||||
; r2 *dst
|
||||
; r3 stride
|
||||
; sp *dc
|
||||
|idct_dequant_dc_full_2x_neon| PROC
|
||||
push {r4}
|
||||
|
||||
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
|
||||
vld1.16 {q2, q3}, [r0] ; l q
|
||||
mov r1, #16 ; pitch
|
||||
add r0, r0, #32
|
||||
vld1.16 {q4, q5}, [r0] ; r q
|
||||
add r12, r2, #4
|
||||
|
||||
; interleave the predictors
|
||||
vld1.32 {d28[0]}, [r2], r1 ; l pre
|
||||
vld1.32 {d28[1]}, [r12], r1 ; r pre
|
||||
vld1.32 {d29[0]}, [r2], r1
|
||||
vld1.32 {d29[1]}, [r12], r1
|
||||
vld1.32 {d30[0]}, [r2], r1
|
||||
vld1.32 {d30[1]}, [r12], r1
|
||||
vld1.32 {d31[0]}, [r2]
|
||||
ldr r1, [sp, #4]
|
||||
vld1.32 {d28[0]}, [r2], r3 ; l pre
|
||||
vld1.32 {d28[1]}, [r12], r3 ; r pre
|
||||
vld1.32 {d29[0]}, [r2], r3
|
||||
vld1.32 {d29[1]}, [r12], r3
|
||||
vld1.32 {d30[0]}, [r2], r3
|
||||
vld1.32 {d30[1]}, [r12], r3
|
||||
vld1.32 {d31[0]}, [r2], r3
|
||||
ldr r1, [sp, #4] ; *dc
|
||||
vld1.32 {d31[1]}, [r12]
|
||||
|
||||
adr r2, cospi8sqrt2minus1 ; pointer to the first constant
|
||||
adr r4, cospi8sqrt2minus1 ; pointer to the first constant
|
||||
|
||||
ldrh r12, [r1], #2 ; lo *dc
|
||||
ldrh r1, [r1] ; hi *dc
|
||||
@ -56,7 +57,7 @@
|
||||
vmov.16 d4[0], r12
|
||||
vmov.16 d8[0], r1
|
||||
|
||||
vld1.16 {d0}, [r2]
|
||||
vld1.16 {d0}, [r4]
|
||||
|
||||
; q2: l0r0 q3: l8r8
|
||||
; q4: l4r4 q5: l12r12
|
||||
@ -176,26 +177,28 @@
|
||||
sub r0, r0, #32
|
||||
vst1.16 {q14, q15}, [r0] ; write over low input
|
||||
|
||||
sub r2, r2, r3, lsl #2 ; dst - 4*stride
|
||||
add r1, r2, #4 ; hi
|
||||
|
||||
;saturate and narrow
|
||||
vqmovun.s16 d0, q4 ; lo
|
||||
vqmovun.s16 d1, q5
|
||||
vqmovun.s16 d2, q6 ; hi
|
||||
vqmovun.s16 d3, q7
|
||||
|
||||
ldr r1, [sp] ; stride
|
||||
add r2, r3, #4 ; hi
|
||||
vst1.32 {d0[0]}, [r3], r1 ; lo
|
||||
vst1.32 {d0[1]}, [r2], r1 ; hi
|
||||
vst1.32 {d1[0]}, [r3], r1
|
||||
vst1.32 {d1[1]}, [r2], r1
|
||||
vst1.32 {d2[0]}, [r3], r1
|
||||
vst1.32 {d2[1]}, [r2], r1
|
||||
vst1.32 {d3[0]}, [r3]
|
||||
vst1.32 {d3[1]}, [r2]
|
||||
vst1.32 {d0[0]}, [r2], r3 ; lo
|
||||
vst1.32 {d0[1]}, [r1], r3 ; hi
|
||||
vst1.32 {d1[0]}, [r2], r3
|
||||
vst1.32 {d1[1]}, [r1], r3
|
||||
vst1.32 {d2[0]}, [r2], r3
|
||||
vst1.32 {d2[1]}, [r1], r3
|
||||
vst1.32 {d3[0]}, [r2]
|
||||
vst1.32 {d3[1]}, [r1]
|
||||
|
||||
bx lr
|
||||
pop {r4}
|
||||
bx lr
|
||||
|
||||
ENDP ; |idct_dequant_dc_full_2x_neon|
|
||||
ENDP ; |idct_dequant_dc_full_2x_neon|
|
||||
|
||||
; Constant Pool
|
||||
cospi8sqrt2minus1 DCD 0x4e7b
|
||||
|
@ -15,32 +15,30 @@
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
;void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *pre,
|
||||
; unsigned char *dst, int pitch, int stride);
|
||||
;void idct_dequant_full_2x_neon(short *q, short *dq,
|
||||
; unsigned char *dst, int stride);
|
||||
; r0 *q,
|
||||
; r1 *dq,
|
||||
; r2 *pre
|
||||
; r3 *dst
|
||||
; sp pitch
|
||||
; sp+4 stride
|
||||
; r2 *dst
|
||||
; r3 stride
|
||||
|idct_dequant_full_2x_neon| PROC
|
||||
vld1.16 {q0, q1}, [r1] ; dq (same l/r)
|
||||
vld1.16 {q2, q3}, [r0] ; l q
|
||||
ldr r1, [sp] ; pitch
|
||||
add r0, r0, #32
|
||||
vld1.16 {q4, q5}, [r0] ; r q
|
||||
add r12, r2, #4
|
||||
|
||||
; interleave the predictors
|
||||
vld1.32 {d28[0]}, [r2], r1 ; l pre
|
||||
vld1.32 {d28[1]}, [r12], r1 ; r pre
|
||||
vld1.32 {d29[0]}, [r2], r1
|
||||
vld1.32 {d29[1]}, [r12], r1
|
||||
vld1.32 {d30[0]}, [r2], r1
|
||||
vld1.32 {d30[1]}, [r12], r1
|
||||
vld1.32 {d31[0]}, [r2]
|
||||
vld1.32 {d28[0]}, [r2], r3 ; l pre
|
||||
vld1.32 {d28[1]}, [r12], r3 ; r pre
|
||||
vld1.32 {d29[0]}, [r2], r3
|
||||
vld1.32 {d29[1]}, [r12], r3
|
||||
vld1.32 {d30[0]}, [r2], r3
|
||||
vld1.32 {d30[1]}, [r12], r3
|
||||
vld1.32 {d31[0]}, [r2], r3
|
||||
vld1.32 {d31[1]}, [r12]
|
||||
|
||||
adr r2, cospi8sqrt2minus1 ; pointer to the first constant
|
||||
adr r1, cospi8sqrt2minus1 ; pointer to the first constant
|
||||
|
||||
; dequant: q[i] = q[i] * dq[i]
|
||||
vmul.i16 q2, q2, q0
|
||||
@ -48,7 +46,7 @@
|
||||
vmul.i16 q4, q4, q0
|
||||
vmul.i16 q5, q5, q1
|
||||
|
||||
vld1.16 {d0}, [r2]
|
||||
vld1.16 {d0}, [r1]
|
||||
|
||||
; q2: l0r0 q3: l8r8
|
||||
; q4: l4r4 q5: l12r12
|
||||
@ -168,22 +166,23 @@
|
||||
sub r0, r0, #32
|
||||
vst1.16 {q14, q15}, [r0] ; write over low input
|
||||
|
||||
sub r2, r2, r3, lsl #2 ; dst - 4*stride
|
||||
add r1, r2, #4 ; hi
|
||||
|
||||
;saturate and narrow
|
||||
vqmovun.s16 d0, q4 ; lo
|
||||
vqmovun.s16 d1, q5
|
||||
vqmovun.s16 d2, q6 ; hi
|
||||
vqmovun.s16 d3, q7
|
||||
|
||||
ldr r1, [sp, #4] ; stride
|
||||
add r2, r3, #4 ; hi
|
||||
vst1.32 {d0[0]}, [r3], r1 ; lo
|
||||
vst1.32 {d0[1]}, [r2], r1 ; hi
|
||||
vst1.32 {d1[0]}, [r3], r1
|
||||
vst1.32 {d1[1]}, [r2], r1
|
||||
vst1.32 {d2[0]}, [r3], r1
|
||||
vst1.32 {d2[1]}, [r2], r1
|
||||
vst1.32 {d3[0]}, [r3]
|
||||
vst1.32 {d3[1]}, [r2]
|
||||
vst1.32 {d0[0]}, [r2], r3 ; lo
|
||||
vst1.32 {d0[1]}, [r1], r3 ; hi
|
||||
vst1.32 {d1[0]}, [r2], r3
|
||||
vst1.32 {d1[1]}, [r1], r3
|
||||
vst1.32 {d2[0]}, [r2], r3
|
||||
vst1.32 {d2[1]}, [r1], r3
|
||||
vst1.32 {d3[0]}, [r2]
|
||||
vst1.32 {d3[1]}, [r1]
|
||||
|
||||
bx lr
|
||||
|
||||
|
@ -167,12 +167,12 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
|
||||
/* do prediction */
|
||||
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
|
||||
{
|
||||
RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv)(xd);
|
||||
RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd);
|
||||
|
||||
if (mode != B_PRED)
|
||||
{
|
||||
RECON_INVOKE(&pbi->common.rtcd.recon,
|
||||
build_intra_predictors_mby)(xd);
|
||||
build_intra_predictors_mby_s)(xd);
|
||||
} else {
|
||||
vp8_intra_prediction_down_copy(xd);
|
||||
}
|
||||
@ -211,20 +211,24 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
|
||||
int b_mode = xd->mode_info_context->bmi[i].as_mode;
|
||||
|
||||
RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
|
||||
(b, b_mode, b->predictor);
|
||||
(b, b_mode, *(b->base_dst) + b->dst, b->dst_stride);
|
||||
|
||||
if (xd->eobs[i] > 1)
|
||||
if (xd->eobs[i] )
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, idct_add)
|
||||
(b->qcoeff, b->dequant, b->predictor,
|
||||
*(b->base_dst) + b->dst, 16, b->dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
|
||||
(b->qcoeff[0] * b->dequant[0], b->predictor,
|
||||
*(b->base_dst) + b->dst, 16, b->dst_stride);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
if (xd->eobs[i] > 1)
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, idct_add)
|
||||
(b->qcoeff, b->dequant,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
|
||||
(b->qcoeff[0] * b->dequant[0],
|
||||
*(b->base_dst) + b->dst, b->dst_stride,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -233,18 +237,18 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
|
||||
{
|
||||
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
|
||||
(xd->qcoeff, xd->block[0].dequant,
|
||||
xd->predictor, xd->dst.y_buffer,
|
||||
xd->dst.y_buffer,
|
||||
xd->dst.y_stride, xd->eobs);
|
||||
}
|
||||
else
|
||||
{
|
||||
BLOCKD *b = &xd->block[24];
|
||||
|
||||
DEQUANT_INVOKE(&pbi->dequant, block)(b);
|
||||
|
||||
/* do 2nd order transform on the dc block */
|
||||
if (xd->eobs[24] > 1)
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, block)(b);
|
||||
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
((int *)b->qcoeff)[1] = 0;
|
||||
@ -257,19 +261,20 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd,
|
||||
}
|
||||
else
|
||||
{
|
||||
b->dqcoeff[0] = b->qcoeff[0] * b->dequant[0];
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
|
||||
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
|
||||
(xd->qcoeff, xd->block[0].dequant,
|
||||
xd->predictor, xd->dst.y_buffer,
|
||||
xd->dst.y_buffer,
|
||||
xd->dst.y_stride, xd->eobs, xd->block[24].diff);
|
||||
}
|
||||
|
||||
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
|
||||
(xd->qcoeff+16*16, xd->block[16].dequant,
|
||||
xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
|
||||
xd->dst.u_buffer, xd->dst.v_buffer,
|
||||
xd->dst.uv_stride, xd->eobs+16);
|
||||
}
|
||||
|
||||
|
@ -14,10 +14,6 @@
|
||||
#include "vp8/common/idct.h"
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
|
||||
extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) ;
|
||||
extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
|
||||
|
||||
|
||||
void vp8_dequantize_b_c(BLOCKD *d)
|
||||
{
|
||||
int i;
|
||||
@ -31,12 +27,9 @@ void vp8_dequantize_b_c(BLOCKD *d)
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
|
||||
unsigned char *dest, int pitch, int stride)
|
||||
void vp8_dequant_idct_add_c(short *input, short *dq,
|
||||
unsigned char *dest, int stride)
|
||||
{
|
||||
short output[16];
|
||||
short *diff_ptr = output;
|
||||
int r, c;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 16; i++)
|
||||
@ -44,40 +37,17 @@ void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
|
||||
input[i] = dq[i] * input[i];
|
||||
}
|
||||
|
||||
/* the idct halves ( >> 1) the pitch */
|
||||
vp8_short_idct4x4llm_c(input, output, 4 << 1);
|
||||
vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
|
||||
|
||||
vpx_memset(input, 0, 32);
|
||||
|
||||
for (r = 0; r < 4; r++)
|
||||
{
|
||||
for (c = 0; c < 4; c++)
|
||||
{
|
||||
int a = diff_ptr[c] + pred[c];
|
||||
|
||||
if (a < 0)
|
||||
a = 0;
|
||||
|
||||
if (a > 255)
|
||||
a = 255;
|
||||
|
||||
dest[c] = (unsigned char) a;
|
||||
}
|
||||
|
||||
dest += stride;
|
||||
diff_ptr += 4;
|
||||
pred += pitch;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
|
||||
unsigned char *dest, int pitch, int stride,
|
||||
void vp8_dequant_dc_idct_add_c(short *input, short *dq,
|
||||
unsigned char *dest, int stride,
|
||||
int Dc)
|
||||
{
|
||||
int i;
|
||||
short output[16];
|
||||
short *diff_ptr = output;
|
||||
int r, c;
|
||||
|
||||
input[0] = (short)Dc;
|
||||
|
||||
@ -86,28 +56,8 @@ void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
|
||||
input[i] = dq[i] * input[i];
|
||||
}
|
||||
|
||||
/* the idct halves ( >> 1) the pitch */
|
||||
vp8_short_idct4x4llm_c(input, output, 4 << 1);
|
||||
vp8_short_idct4x4llm_c(input, dest, stride, dest, stride);
|
||||
|
||||
vpx_memset(input, 0, 32);
|
||||
|
||||
for (r = 0; r < 4; r++)
|
||||
{
|
||||
for (c = 0; c < 4; c++)
|
||||
{
|
||||
int a = diff_ptr[c] + pred[c];
|
||||
|
||||
if (a < 0)
|
||||
a = 0;
|
||||
|
||||
if (a > 255)
|
||||
a = 255;
|
||||
|
||||
dest[c] = (unsigned char) a;
|
||||
}
|
||||
|
||||
dest += stride;
|
||||
diff_ptr += 4;
|
||||
pred += pitch;
|
||||
}
|
||||
}
|
||||
|
@ -18,28 +18,28 @@
|
||||
|
||||
#define prototype_dequant_idct_add(sym) \
|
||||
void sym(short *input, short *dq, \
|
||||
unsigned char *pred, unsigned char *output, \
|
||||
int pitch, int stride)
|
||||
unsigned char *output, \
|
||||
int stride)
|
||||
|
||||
#define prototype_dequant_dc_idct_add(sym) \
|
||||
void sym(short *input, short *dq, \
|
||||
unsigned char *pred, unsigned char *output, \
|
||||
int pitch, int stride, \
|
||||
unsigned char *dst, \
|
||||
int stride, \
|
||||
int dc)
|
||||
|
||||
#define prototype_dequant_dc_idct_add_y_block(sym) \
|
||||
void sym(short *q, short *dq, \
|
||||
unsigned char *pre, unsigned char *dst, \
|
||||
unsigned char *dst, \
|
||||
int stride, char *eobs, short *dc)
|
||||
|
||||
#define prototype_dequant_idct_add_y_block(sym) \
|
||||
void sym(short *q, short *dq, \
|
||||
unsigned char *pre, unsigned char *dst, \
|
||||
unsigned char *dst, \
|
||||
int stride, char *eobs)
|
||||
|
||||
#define prototype_dequant_idct_add_uv_block(sym) \
|
||||
void sym(short *q, short *dq, \
|
||||
unsigned char *pre, unsigned char *dst_u, \
|
||||
unsigned char *dst_u, \
|
||||
unsigned char *dst_v, int stride, char *eobs)
|
||||
|
||||
#if ARCH_X86 || ARCH_X86_64
|
||||
|
@ -621,9 +621,8 @@ void vp8_conceal_corrupt_mb(MACROBLOCKD *xd)
|
||||
{
|
||||
/* This macroblock has corrupt residual, use the motion compensated
|
||||
image (predictor) for concealment */
|
||||
vp8_recon_copy16x16(xd->predictor, 16, xd->dst.y_buffer, xd->dst.y_stride);
|
||||
vp8_recon_copy8x8(xd->predictor + 256, 8,
|
||||
xd->dst.u_buffer, xd->dst.uv_stride);
|
||||
vp8_recon_copy8x8(xd->predictor + 320, 8,
|
||||
xd->dst.v_buffer, xd->dst.uv_stride);
|
||||
|
||||
/* The build predictor functions now output directly into the dst buffer,
|
||||
* so the copies are no longer necessary */
|
||||
|
||||
}
|
||||
|
@ -12,16 +12,17 @@
|
||||
#include "vp8/common/idct.h"
|
||||
#include "dequantize.h"
|
||||
|
||||
void vp8_dequant_dc_idct_add_c(short *input, short *dq, unsigned char *pred,
|
||||
unsigned char *dest, int pitch, int stride,
|
||||
void vp8_dequant_dc_idct_add_c(short *input, short *dq,
|
||||
unsigned char *dest, int stride,
|
||||
int Dc);
|
||||
void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *pred,
|
||||
unsigned char *dest, int pitch, int stride);
|
||||
void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr,
|
||||
unsigned char *dst_ptr, int pitch, int stride);
|
||||
void vp8_dequant_idct_add_c(short *input, short *dq,
|
||||
unsigned char *dest, int stride);
|
||||
void vp8_dc_only_idct_add_c(short input_dc, unsigned char * pred,
|
||||
int pred_stride, unsigned char *dst_ptr,
|
||||
int dst_stride);
|
||||
|
||||
void vp8_dequant_dc_idct_add_y_block_c
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs, short *dc)
|
||||
{
|
||||
int i, j;
|
||||
@ -31,23 +32,21 @@ void vp8_dequant_dc_idct_add_y_block_c
|
||||
for (j = 0; j < 4; j++)
|
||||
{
|
||||
if (*eobs++ > 1)
|
||||
vp8_dequant_dc_idct_add_c (q, dq, pre, dst, 16, stride, dc[0]);
|
||||
vp8_dequant_dc_idct_add_c (q, dq, dst, stride, dc[0]);
|
||||
else
|
||||
vp8_dc_only_idct_add_c (dc[0], pre, dst, 16, stride);
|
||||
vp8_dc_only_idct_add_c (dc[0], dst, stride, dst, stride);
|
||||
|
||||
q += 16;
|
||||
pre += 4;
|
||||
dst += 4;
|
||||
dc ++;
|
||||
}
|
||||
|
||||
pre += 64 - 16;
|
||||
dst += 4*stride - 16;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_c
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs)
|
||||
{
|
||||
int i, j;
|
||||
@ -57,25 +56,23 @@ void vp8_dequant_idct_add_y_block_c
|
||||
for (j = 0; j < 4; j++)
|
||||
{
|
||||
if (*eobs++ > 1)
|
||||
vp8_dequant_idct_add_c (q, dq, pre, dst, 16, stride);
|
||||
vp8_dequant_idct_add_c (q, dq, dst, stride);
|
||||
else
|
||||
{
|
||||
vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dst, 16, stride);
|
||||
vp8_dc_only_idct_add_c (q[0]*dq[0], dst, stride, dst, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
q += 16;
|
||||
pre += 4;
|
||||
dst += 4;
|
||||
}
|
||||
|
||||
pre += 64 - 16;
|
||||
dst += 4*stride - 16;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_uv_block_c
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
(short *q, short *dq,
|
||||
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
|
||||
{
|
||||
int i, j;
|
||||
@ -85,19 +82,17 @@ void vp8_dequant_idct_add_uv_block_c
|
||||
for (j = 0; j < 2; j++)
|
||||
{
|
||||
if (*eobs++ > 1)
|
||||
vp8_dequant_idct_add_c (q, dq, pre, dstu, 8, stride);
|
||||
vp8_dequant_idct_add_c (q, dq, dstu, stride);
|
||||
else
|
||||
{
|
||||
vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstu, 8, stride);
|
||||
vp8_dc_only_idct_add_c (q[0]*dq[0], dstu, stride, dstu, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
q += 16;
|
||||
pre += 4;
|
||||
dstu += 4;
|
||||
}
|
||||
|
||||
pre += 32 - 8;
|
||||
dstu += 4*stride - 8;
|
||||
}
|
||||
|
||||
@ -106,19 +101,17 @@ void vp8_dequant_idct_add_uv_block_c
|
||||
for (j = 0; j < 2; j++)
|
||||
{
|
||||
if (*eobs++ > 1)
|
||||
vp8_dequant_idct_add_c (q, dq, pre, dstv, 8, stride);
|
||||
vp8_dequant_idct_add_c (q, dq, dstv, stride);
|
||||
else
|
||||
{
|
||||
vp8_dc_only_idct_add_c (q[0]*dq[0], pre, dstv, 8, stride);
|
||||
vp8_dc_only_idct_add_c (q[0]*dq[0], dstv, stride, dstv, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
q += 16;
|
||||
pre += 4;
|
||||
dstv += 4;
|
||||
}
|
||||
|
||||
pre += 32 - 8;
|
||||
dstv += 4*stride - 8;
|
||||
}
|
||||
}
|
||||
|
@ -606,6 +606,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
|
||||
MACROBLOCKD *xd,
|
||||
int b_mode,
|
||||
unsigned char *predictor,
|
||||
int stride,
|
||||
int mb_row,
|
||||
int mb_col,
|
||||
int num)
|
||||
@ -662,7 +663,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
|
||||
predictor[c] = expected_dc;
|
||||
}
|
||||
|
||||
predictor += 16;
|
||||
predictor += stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -684,7 +685,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
|
||||
predictor[c] = pred;
|
||||
}
|
||||
|
||||
predictor += 16;
|
||||
predictor += stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -706,7 +707,7 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
|
||||
predictor[c] = ap[c];
|
||||
}
|
||||
|
||||
predictor += 16;
|
||||
predictor += stride;
|
||||
}
|
||||
|
||||
}
|
||||
@ -729,29 +730,29 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
|
||||
predictor[c] = lp[r];
|
||||
}
|
||||
|
||||
predictor += 16;
|
||||
predictor += stride;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case B_LD_PRED:
|
||||
{
|
||||
unsigned char *ptr = Above;
|
||||
predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
|
||||
predictor[0 * 16 + 1] =
|
||||
predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
|
||||
predictor[0 * 16 + 2] =
|
||||
predictor[1 * 16 + 1] =
|
||||
predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
|
||||
predictor[0 * 16 + 3] =
|
||||
predictor[1 * 16 + 2] =
|
||||
predictor[2 * 16 + 1] =
|
||||
predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
|
||||
predictor[1 * 16 + 3] =
|
||||
predictor[2 * 16 + 2] =
|
||||
predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
|
||||
predictor[2 * 16 + 3] =
|
||||
predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
|
||||
predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
|
||||
predictor[0 * stride + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
|
||||
predictor[0 * stride + 1] =
|
||||
predictor[1 * stride + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
|
||||
predictor[0 * stride + 2] =
|
||||
predictor[1 * stride + 1] =
|
||||
predictor[2 * stride + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
|
||||
predictor[0 * stride + 3] =
|
||||
predictor[1 * stride + 2] =
|
||||
predictor[2 * stride + 1] =
|
||||
predictor[3 * stride + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
|
||||
predictor[1 * stride + 3] =
|
||||
predictor[2 * stride + 2] =
|
||||
predictor[3 * stride + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
|
||||
predictor[2 * stride + 3] =
|
||||
predictor[3 * stride + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
|
||||
predictor[3 * stride + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
|
||||
|
||||
}
|
||||
break;
|
||||
@ -770,22 +771,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
|
||||
pp[7] = Above[2];
|
||||
pp[8] = Above[3];
|
||||
|
||||
predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[3 * 16 + 1] =
|
||||
predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[3 * 16 + 2] =
|
||||
predictor[2 * 16 + 1] =
|
||||
predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[3 * 16 + 3] =
|
||||
predictor[2 * 16 + 2] =
|
||||
predictor[1 * 16 + 1] =
|
||||
predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * 16 + 3] =
|
||||
predictor[1 * 16 + 2] =
|
||||
predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[1 * 16 + 3] =
|
||||
predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
|
||||
predictor[3 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[3 * stride + 1] =
|
||||
predictor[2 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[3 * stride + 2] =
|
||||
predictor[2 * stride + 1] =
|
||||
predictor[1 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[3 * stride + 3] =
|
||||
predictor[2 * stride + 2] =
|
||||
predictor[1 * stride + 1] =
|
||||
predictor[0 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * stride + 3] =
|
||||
predictor[1 * stride + 2] =
|
||||
predictor[0 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[1 * stride + 3] =
|
||||
predictor[0 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[0 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
|
||||
|
||||
}
|
||||
break;
|
||||
@ -805,22 +806,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
|
||||
pp[8] = Above[3];
|
||||
|
||||
|
||||
predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[3 * 16 + 1] =
|
||||
predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * 16 + 1] =
|
||||
predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
|
||||
predictor[3 * 16 + 2] =
|
||||
predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[2 * 16 + 2] =
|
||||
predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
|
||||
predictor[3 * 16 + 3] =
|
||||
predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[2 * 16 + 3] =
|
||||
predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
|
||||
predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
|
||||
predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
|
||||
predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * stride + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[3 * stride + 1] =
|
||||
predictor[1 * stride + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * stride + 1] =
|
||||
predictor[0 * stride + 0] = (pp[4] + pp[5] + 1) >> 1;
|
||||
predictor[3 * stride + 2] =
|
||||
predictor[1 * stride + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[2 * stride + 2] =
|
||||
predictor[0 * stride + 1] = (pp[5] + pp[6] + 1) >> 1;
|
||||
predictor[3 * stride + 3] =
|
||||
predictor[1 * stride + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[2 * stride + 3] =
|
||||
predictor[0 * stride + 2] = (pp[6] + pp[7] + 1) >> 1;
|
||||
predictor[1 * stride + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
|
||||
predictor[0 * stride + 3] = (pp[7] + pp[8] + 1) >> 1;
|
||||
|
||||
}
|
||||
break;
|
||||
@ -829,22 +830,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
|
||||
|
||||
unsigned char *pp = Above;
|
||||
|
||||
predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[2 * 16 + 0] =
|
||||
predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[1 * 16 + 1] =
|
||||
predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * 16 + 1] =
|
||||
predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[3 * 16 + 1] =
|
||||
predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[0 * 16 + 3] =
|
||||
predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
|
||||
predictor[1 * 16 + 3] =
|
||||
predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[1 * stride + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[2 * stride + 0] =
|
||||
predictor[0 * stride + 1] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[1 * stride + 1] =
|
||||
predictor[3 * stride + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * stride + 1] =
|
||||
predictor[0 * stride + 2] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[3 * stride + 1] =
|
||||
predictor[1 * stride + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[0 * stride + 3] =
|
||||
predictor[2 * stride + 2] = (pp[3] + pp[4] + 1) >> 1;
|
||||
predictor[1 * stride + 3] =
|
||||
predictor[3 * stride + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[2 * stride + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[3 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
}
|
||||
break;
|
||||
|
||||
@ -862,22 +863,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
|
||||
pp[8] = Above[3];
|
||||
|
||||
|
||||
predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[2 * 16 + 0] =
|
||||
predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[2 * 16 + 1] =
|
||||
predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * 16 + 2] =
|
||||
predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[2 * 16 + 3] =
|
||||
predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[1 * 16 + 2] =
|
||||
predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
|
||||
predictor[1 * 16 + 3] =
|
||||
predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
predictor[3 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[3 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[2 * stride + 0] =
|
||||
predictor[3 * stride + 2] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[2 * stride + 1] =
|
||||
predictor[3 * stride + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * stride + 2] =
|
||||
predictor[1 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[2 * stride + 3] =
|
||||
predictor[1 * stride + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
|
||||
predictor[1 * stride + 2] =
|
||||
predictor[0 * stride + 0] = (pp[3] + pp[4] + 1) >> 1;
|
||||
predictor[1 * stride + 3] =
|
||||
predictor[0 * stride + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
|
||||
predictor[0 * stride + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
|
||||
predictor[0 * stride + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
|
||||
}
|
||||
break;
|
||||
|
||||
@ -885,22 +886,22 @@ void vp8mt_predict_intra4x4(VP8D_COMP *pbi,
|
||||
case B_HU_PRED:
|
||||
{
|
||||
unsigned char *pp = Left;
|
||||
predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[0 * 16 + 2] =
|
||||
predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[0 * 16 + 3] =
|
||||
predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[1 * 16 + 2] =
|
||||
predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[1 * 16 + 3] =
|
||||
predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * 16 + 2] =
|
||||
predictor[2 * 16 + 3] =
|
||||
predictor[3 * 16 + 0] =
|
||||
predictor[3 * 16 + 1] =
|
||||
predictor[3 * 16 + 2] =
|
||||
predictor[3 * 16 + 3] = pp[3];
|
||||
predictor[0 * stride + 0] = (pp[0] + pp[1] + 1) >> 1;
|
||||
predictor[0 * stride + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
|
||||
predictor[0 * stride + 2] =
|
||||
predictor[1 * stride + 0] = (pp[1] + pp[2] + 1) >> 1;
|
||||
predictor[0 * stride + 3] =
|
||||
predictor[1 * stride + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[1 * stride + 2] =
|
||||
predictor[2 * stride + 0] = (pp[2] + pp[3] + 1) >> 1;
|
||||
predictor[1 * stride + 3] =
|
||||
predictor[2 * stride + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
|
||||
predictor[2 * stride + 2] =
|
||||
predictor[2 * stride + 3] =
|
||||
predictor[3 * stride + 0] =
|
||||
predictor[3 * stride + 1] =
|
||||
predictor[3 * stride + 2] =
|
||||
predictor[3 * stride + 3] = pp[3];
|
||||
}
|
||||
break;
|
||||
|
||||
|
@ -19,7 +19,7 @@ extern void vp8mt_build_intra_predictors_mby_s(VP8D_COMP *pbi, MACROBLOCKD *x, i
|
||||
extern void vp8mt_build_intra_predictors_mbuv(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
|
||||
extern void vp8mt_build_intra_predictors_mbuv_s(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
|
||||
|
||||
extern void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *x, int b_mode, unsigned char *predictor, int mb_row, int mb_col, int num);
|
||||
extern void vp8mt_predict_intra4x4(VP8D_COMP *pbi, MACROBLOCKD *x, int b_mode, unsigned char *predictor, int stride, int mb_row, int mb_col, int num);
|
||||
extern void vp8mt_intra_prediction_down_copy(VP8D_COMP *pbi, MACROBLOCKD *x, int mb_row, int mb_col);
|
||||
#endif
|
||||
|
||||
|
@ -138,11 +138,11 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
|
||||
/* do prediction */
|
||||
if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME)
|
||||
{
|
||||
vp8mt_build_intra_predictors_mbuv(pbi, xd, mb_row, mb_col);
|
||||
vp8mt_build_intra_predictors_mbuv_s(pbi, xd, mb_row, mb_col);
|
||||
|
||||
if (xd->mode_info_context->mbmi.mode != B_PRED)
|
||||
{
|
||||
vp8mt_build_intra_predictors_mby(pbi, xd, mb_row, mb_col);
|
||||
vp8mt_build_intra_predictors_mby_s(pbi, xd, mb_row, mb_col);
|
||||
} else {
|
||||
vp8mt_intra_prediction_down_copy(pbi, xd, mb_row, mb_col);
|
||||
}
|
||||
@ -201,7 +201,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
|
||||
|
||||
DEQUANT_INVOKE (&pbi->dequant, dc_idct_add_y_block)
|
||||
(xd->qcoeff, xd->block[0].dequant,
|
||||
xd->predictor, xd->dst.y_buffer,
|
||||
xd->dst.y_buffer,
|
||||
xd->dst.y_stride, xd->eobs, xd->block[24].diff);
|
||||
}
|
||||
else if (xd->mode_info_context->mbmi.mode == B_PRED)
|
||||
@ -211,19 +211,21 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
|
||||
BLOCKD *b = &xd->block[i];
|
||||
int b_mode = xd->mode_info_context->bmi[i].as_mode;
|
||||
|
||||
vp8mt_predict_intra4x4(pbi, xd, b_mode, b->predictor, mb_row, mb_col, i);
|
||||
vp8mt_predict_intra4x4(pbi, xd, b_mode, *(b->base_dst) + b->dst,
|
||||
b->dst_stride, mb_row, mb_col, i);
|
||||
|
||||
if (xd->eobs[i] > 1)
|
||||
{
|
||||
DEQUANT_INVOKE(&pbi->dequant, idct_add)
|
||||
(b->qcoeff, b->dequant, b->predictor,
|
||||
*(b->base_dst) + b->dst, 16, b->dst_stride);
|
||||
(b->qcoeff, b->dequant,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(RTCD_VTABLE(idct), idct1_scalar_add)
|
||||
(b->qcoeff[0] * b->dequant[0], b->predictor,
|
||||
*(b->base_dst) + b->dst, 16, b->dst_stride);
|
||||
(b->qcoeff[0] * b->dequant[0],
|
||||
*(b->base_dst) + b->dst, b->dst_stride,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
((int *)b->qcoeff)[0] = 0;
|
||||
}
|
||||
}
|
||||
@ -232,13 +234,13 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m
|
||||
{
|
||||
DEQUANT_INVOKE (&pbi->dequant, idct_add_y_block)
|
||||
(xd->qcoeff, xd->block[0].dequant,
|
||||
xd->predictor, xd->dst.y_buffer,
|
||||
xd->dst.y_buffer,
|
||||
xd->dst.y_stride, xd->eobs);
|
||||
}
|
||||
|
||||
DEQUANT_INVOKE (&pbi->dequant, idct_add_uv_block)
|
||||
(xd->qcoeff+16*16, xd->block[16].dequant,
|
||||
xd->predictor+16*16, xd->dst.u_buffer, xd->dst.v_buffer,
|
||||
xd->dst.u_buffer, xd->dst.v_buffer,
|
||||
xd->dst.uv_stride, xd->eobs+16);
|
||||
}
|
||||
|
||||
|
@ -50,14 +50,17 @@ sym(vp8_dequantize_b_impl_mmx):
|
||||
ret
|
||||
|
||||
|
||||
;void dequant_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride)
|
||||
;void dequant_idct_add_mmx(
|
||||
;short *input, 0
|
||||
;short *dq, 1
|
||||
;unsigned char *dest, 2
|
||||
;int stride) 3
|
||||
global sym(vp8_dequant_idct_add_mmx)
|
||||
sym(vp8_dequant_idct_add_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 6
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
@ -77,8 +80,8 @@ sym(vp8_dequant_idct_add_mmx):
|
||||
movq mm3, [rax+24]
|
||||
pmullw mm3, [rdx+24]
|
||||
|
||||
mov rdx, arg(3) ;dest
|
||||
mov rsi, arg(2) ;pred
|
||||
mov rdx, arg(2) ;dest
|
||||
|
||||
pxor mm7, mm7
|
||||
|
||||
|
||||
@ -89,8 +92,7 @@ sym(vp8_dequant_idct_add_mmx):
|
||||
movq [rax+24],mm7
|
||||
|
||||
|
||||
movsxd rax, dword ptr arg(4) ;pitch
|
||||
movsxd rdi, dword ptr arg(5) ;stride
|
||||
movsxd rdi, dword ptr arg(3) ;stride
|
||||
|
||||
psubw mm0, mm2 ; b1= 0-2
|
||||
paddw mm2, mm2 ;
|
||||
@ -211,28 +213,27 @@ sym(vp8_dequant_idct_add_mmx):
|
||||
|
||||
pxor mm7, mm7
|
||||
|
||||
movd mm4, [rsi]
|
||||
movd mm4, [rdx]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm0, mm4
|
||||
packuswb mm0, mm7
|
||||
movd [rdx], mm0
|
||||
|
||||
movd mm4, [rsi+rax]
|
||||
movd mm4, [rdx+rdi]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm1, mm4
|
||||
packuswb mm1, mm7
|
||||
movd [rdx+rdi], mm1
|
||||
|
||||
movd mm4, [rsi+2*rax]
|
||||
movd mm4, [rdx+2*rdi]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm2, mm4
|
||||
packuswb mm2, mm7
|
||||
movd [rdx+rdi*2], mm2
|
||||
|
||||
add rdx, rdi
|
||||
add rsi, rax
|
||||
|
||||
movd mm4, [rsi+2*rax]
|
||||
movd mm4, [rdx+2*rdi]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm5, mm4
|
||||
packuswb mm5, mm7
|
||||
@ -240,22 +241,24 @@ sym(vp8_dequant_idct_add_mmx):
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void dequant_dc_idct_add_mmx(short *input, short *dq, unsigned char *pred, unsigned char *dest, int pitch, int stride, int Dc)
|
||||
;void dequant_dc_idct_add_mmx(
|
||||
;short *input, 0
|
||||
;short *dq, 1
|
||||
;unsigned char *dest, 2
|
||||
;int stride, 3
|
||||
;int Dc) 4
|
||||
global sym(vp8_dequant_dc_idct_add_mmx)
|
||||
sym(vp8_dequant_dc_idct_add_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rax, arg(0) ;input
|
||||
@ -273,8 +276,7 @@ sym(vp8_dequant_dc_idct_add_mmx):
|
||||
movq mm3, [rax+24]
|
||||
pmullw mm3, [rdx+24]
|
||||
|
||||
mov rdx, arg(3) ;dest
|
||||
mov rsi, arg(2) ;pred
|
||||
mov rdx, arg(2) ;pred
|
||||
pxor mm7, mm7
|
||||
|
||||
|
||||
@ -286,13 +288,12 @@ sym(vp8_dequant_dc_idct_add_mmx):
|
||||
|
||||
; move lower word of Dc to lower word of mm0
|
||||
psrlq mm0, 16
|
||||
movzx rcx, word ptr arg(6) ;Dc
|
||||
movzx rcx, word ptr arg(4) ;Dc
|
||||
psllq mm0, 16
|
||||
movq mm7, rcx
|
||||
por mm0, mm7
|
||||
|
||||
movsxd rax, dword ptr arg(4) ;pitch
|
||||
movsxd rdi, dword ptr arg(5) ;stride
|
||||
movsxd rax, dword ptr arg(3) ;stride
|
||||
|
||||
psubw mm0, mm2 ; b1= 0-2
|
||||
paddw mm2, mm2 ;
|
||||
@ -413,36 +414,33 @@ sym(vp8_dequant_dc_idct_add_mmx):
|
||||
|
||||
pxor mm7, mm7
|
||||
|
||||
movd mm4, [rsi]
|
||||
movd mm4, [rdx]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm0, mm4
|
||||
packuswb mm0, mm7
|
||||
movd [rdx], mm0
|
||||
|
||||
movd mm4, [rsi+rax]
|
||||
movd mm4, [rdx+rax]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm1, mm4
|
||||
packuswb mm1, mm7
|
||||
movd [rdx+rdi], mm1
|
||||
movd [rdx+rax], mm1
|
||||
|
||||
movd mm4, [rsi+2*rax]
|
||||
movd mm4, [rdx+2*rax]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm2, mm4
|
||||
packuswb mm2, mm7
|
||||
movd [rdx+rdi*2], mm2
|
||||
movd [rdx+rax*2], mm2
|
||||
|
||||
add rdx, rdi
|
||||
add rsi, rax
|
||||
add rdx, rax
|
||||
|
||||
movd mm4, [rsi+2*rax]
|
||||
movd mm4, [rdx+2*rax]
|
||||
punpcklbw mm4, mm7
|
||||
paddsw mm5, mm4
|
||||
packuswb mm5, mm7
|
||||
movd [rdx+rdi*2], mm5
|
||||
movd [rdx+rax*2], mm5
|
||||
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
|
@ -13,7 +13,7 @@
|
||||
#include "vp8/decoder/dequantize.h"
|
||||
|
||||
void vp8_dequant_dc_idct_add_y_block_mmx
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs, short *dc)
|
||||
{
|
||||
int i;
|
||||
@ -21,35 +21,34 @@ void vp8_dequant_dc_idct_add_y_block_mmx
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_dc_idct_add_mmx (q, dq, pre, dst, 16, stride, dc[0]);
|
||||
else
|
||||
vp8_dc_only_idct_add_mmx (dc[0], pre, dst, 16, stride);
|
||||
vp8_dequant_dc_idct_add_mmx (q, dq, dst, stride, dc[0]);
|
||||
else if (eobs[0] == 1)
|
||||
vp8_dc_only_idct_add_mmx (dc[0], dst, stride, dst, stride);
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_dc_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride, dc[1]);
|
||||
else
|
||||
vp8_dc_only_idct_add_mmx (dc[1], pre+4, dst+4, 16, stride);
|
||||
vp8_dequant_dc_idct_add_mmx (q+16, dq, dst+4, stride, dc[1]);
|
||||
else if (eobs[1] == 1)
|
||||
vp8_dc_only_idct_add_mmx (dc[1], dst+4, stride, dst+4, stride);
|
||||
|
||||
if (eobs[2] > 1)
|
||||
vp8_dequant_dc_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride, dc[2]);
|
||||
else
|
||||
vp8_dc_only_idct_add_mmx (dc[2], pre+8, dst+8, 16, stride);
|
||||
vp8_dequant_dc_idct_add_mmx (q+32, dq, dst+8, stride, dc[2]);
|
||||
else if (eobs[2] == 1)
|
||||
vp8_dc_only_idct_add_mmx (dc[2], dst+8, stride, dst+8, stride);
|
||||
|
||||
if (eobs[3] > 1)
|
||||
vp8_dequant_dc_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride, dc[3]);
|
||||
else
|
||||
vp8_dc_only_idct_add_mmx (dc[3], pre+12, dst+12, 16, stride);
|
||||
vp8_dequant_dc_idct_add_mmx (q+48, dq, dst+12, stride, dc[3]);
|
||||
else if (eobs[3] == 1)
|
||||
vp8_dc_only_idct_add_mmx (dc[3], dst+12, stride, dst+12, stride);
|
||||
|
||||
q += 64;
|
||||
dc += 4;
|
||||
pre += 64;
|
||||
dst += 4*stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_mmx
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs)
|
||||
{
|
||||
int i;
|
||||
@ -57,46 +56,48 @@ void vp8_dequant_idct_add_y_block_mmx
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_idct_add_mmx (q, dq, pre, dst, 16, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_mmx (q, dq, dst, stride);
|
||||
else if (eobs[0] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dst, 16, stride);
|
||||
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dst, stride, dst, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dst+4, 16, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_mmx (q+16, dq, dst+4, stride);
|
||||
else if (eobs[1] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dst+4, 16, stride);
|
||||
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dst+4, stride,
|
||||
dst+4, stride);
|
||||
((int *)(q+16))[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[2] > 1)
|
||||
vp8_dequant_idct_add_mmx (q+32, dq, pre+8, dst+8, 16, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_mmx (q+32, dq, dst+8, stride);
|
||||
else if (eobs[2] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_mmx (q[32]*dq[0], pre+8, dst+8, 16, stride);
|
||||
vp8_dc_only_idct_add_mmx (q[32]*dq[0], dst+8, stride,
|
||||
dst+8, stride);
|
||||
((int *)(q+32))[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[3] > 1)
|
||||
vp8_dequant_idct_add_mmx (q+48, dq, pre+12, dst+12, 16, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_mmx (q+48, dq, dst+12, stride);
|
||||
else if (eobs[3] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_mmx (q[48]*dq[0], pre+12, dst+12, 16, stride);
|
||||
vp8_dc_only_idct_add_mmx (q[48]*dq[0], dst+12, stride,
|
||||
dst+12, stride);
|
||||
((int *)(q+48))[0] = 0;
|
||||
}
|
||||
|
||||
q += 64;
|
||||
pre += 64;
|
||||
dst += 4*stride;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_uv_block_mmx
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
(short *q, short *dq,
|
||||
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
|
||||
{
|
||||
int i;
|
||||
@ -104,23 +105,23 @@ void vp8_dequant_idct_add_uv_block_mmx
|
||||
for (i = 0; i < 2; i++)
|
||||
{
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_idct_add_mmx (q, dq, pre, dstu, 8, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_mmx (q, dq, dstu, stride);
|
||||
else if (eobs[0] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstu, 8, stride);
|
||||
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstu, stride, dstu, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstu+4, 8, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_mmx (q+16, dq, dstu+4, stride);
|
||||
else if (eobs[1] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstu+4, 8, stride);
|
||||
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstu+4, stride,
|
||||
dstu+4, stride);
|
||||
((int *)(q+16))[0] = 0;
|
||||
}
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstu += 4*stride;
|
||||
eobs += 2;
|
||||
}
|
||||
@ -128,23 +129,23 @@ void vp8_dequant_idct_add_uv_block_mmx
|
||||
for (i = 0; i < 2; i++)
|
||||
{
|
||||
if (eobs[0] > 1)
|
||||
vp8_dequant_idct_add_mmx (q, dq, pre, dstv, 8, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_mmx (q, dq, dstv, stride);
|
||||
else if (eobs[0] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_mmx (q[0]*dq[0], pre, dstv, 8, stride);
|
||||
vp8_dc_only_idct_add_mmx (q[0]*dq[0], dstv, stride, dstv, stride);
|
||||
((int *)q)[0] = 0;
|
||||
}
|
||||
|
||||
if (eobs[1] > 1)
|
||||
vp8_dequant_idct_add_mmx (q+16, dq, pre+4, dstv+4, 8, stride);
|
||||
else
|
||||
vp8_dequant_idct_add_mmx (q+16, dq, dstv+4, stride);
|
||||
else if (eobs[1] == 1)
|
||||
{
|
||||
vp8_dc_only_idct_add_mmx (q[16]*dq[0], pre+4, dstv+4, 8, stride);
|
||||
vp8_dc_only_idct_add_mmx (q[16]*dq[0], dstv+4, stride,
|
||||
dstv+4, stride);
|
||||
((int *)(q+16))[0] = 0;
|
||||
}
|
||||
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstv += 4*stride;
|
||||
eobs += 2;
|
||||
}
|
||||
|
@ -13,102 +13,115 @@
|
||||
#include "vp8/decoder/dequantize.h"
|
||||
|
||||
void vp8_idct_dequant_dc_0_2x_sse2
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int dst_stride, short *dc);
|
||||
void vp8_idct_dequant_dc_full_2x_sse2
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int dst_stride, short *dc);
|
||||
|
||||
void vp8_idct_dequant_0_2x_sse2
|
||||
(short *q, short *dq ,unsigned char *pre,
|
||||
unsigned char *dst, int dst_stride, int blk_stride);
|
||||
(short *q, short *dq ,
|
||||
unsigned char *dst, int dst_stride);
|
||||
void vp8_idct_dequant_full_2x_sse2
|
||||
(short *q, short *dq ,unsigned char *pre,
|
||||
unsigned char *dst, int dst_stride, int blk_stride);
|
||||
(short *q, short *dq ,
|
||||
unsigned char *dst, int dst_stride);
|
||||
|
||||
void vp8_dequant_dc_idct_add_y_block_sse2
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs, short *dc)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (((short *)(eobs))[0] & 0xfefe)
|
||||
vp8_idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc);
|
||||
else
|
||||
vp8_idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc);
|
||||
|
||||
if (((short *)(eobs))[1] & 0xfefe)
|
||||
vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
|
||||
else
|
||||
vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
|
||||
if (((short *)(eobs))[0])
|
||||
{
|
||||
if (((short *)(eobs))[0] & 0xfefe)
|
||||
vp8_idct_dequant_dc_full_2x_sse2 (q, dq, dst, stride, dc);
|
||||
else
|
||||
vp8_idct_dequant_dc_0_2x_sse2 (q, dq, dst, stride, dc);
|
||||
}
|
||||
|
||||
if (((short *)(eobs))[1])
|
||||
{
|
||||
if (((short *)(eobs))[1] & 0xfefe)
|
||||
vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
|
||||
else
|
||||
vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, dst+8, stride, dc+2);
|
||||
}
|
||||
q += 64;
|
||||
dc += 4;
|
||||
pre += 64;
|
||||
dst += stride*4;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_y_block_sse2
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
(short *q, short *dq,
|
||||
unsigned char *dst, int stride, char *eobs)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
if (((short *)(eobs))[0] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16);
|
||||
|
||||
if (((short *)(eobs))[1] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
|
||||
|
||||
if (((short *)(eobs))[0])
|
||||
{
|
||||
if (((short *)(eobs))[0] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q, dq, dst, stride);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q, dq, dst, stride);
|
||||
}
|
||||
if (((short *)(eobs))[1])
|
||||
{
|
||||
if (((short *)(eobs))[1] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q+32, dq, dst+8, stride);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q+32, dq, dst+8, stride);
|
||||
}
|
||||
q += 64;
|
||||
pre += 64;
|
||||
dst += stride*4;
|
||||
eobs += 4;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_dequant_idct_add_uv_block_sse2
|
||||
(short *q, short *dq, unsigned char *pre,
|
||||
(short *q, short *dq,
|
||||
unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
|
||||
{
|
||||
if (((short *)(eobs))[0] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
|
||||
|
||||
if (((short *)(eobs))[0])
|
||||
{
|
||||
if (((short *)(eobs))[0] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
|
||||
}
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstu += stride*4;
|
||||
|
||||
if (((short *)(eobs))[1] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
|
||||
|
||||
if (((short *)(eobs))[1])
|
||||
{
|
||||
if (((short *)(eobs))[1] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q, dq, dstu, stride);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q, dq, dstu, stride);
|
||||
}
|
||||
q += 32;
|
||||
pre += 32;
|
||||
|
||||
if (((short *)(eobs))[2] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
|
||||
|
||||
if (((short *)(eobs))[2])
|
||||
{
|
||||
if (((short *)(eobs))[2] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
|
||||
}
|
||||
q += 32;
|
||||
pre += 32;
|
||||
dstv += stride*4;
|
||||
|
||||
if (((short *)(eobs))[3] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
|
||||
if (((short *)(eobs))[3])
|
||||
{
|
||||
if (((short *)(eobs))[3] & 0xfefe)
|
||||
vp8_idct_dequant_full_2x_sse2 (q, dq, dstv, stride);
|
||||
else
|
||||
vp8_idct_dequant_0_2x_sse2 (q, dq, dstv, stride);
|
||||
}
|
||||
}
|
||||
|
@ -64,7 +64,7 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
|
||||
BLOCK *be = &x->block[ib];
|
||||
|
||||
RECON_INVOKE(&rtcd->common->recon, intra4x4_predict)
|
||||
(b, b->bmi.as_mode, b->predictor);
|
||||
(b, b->bmi.as_mode, b->predictor, 16);
|
||||
|
||||
ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
|
||||
|
||||
@ -72,9 +72,8 @@ void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd,
|
||||
|
||||
x->quantize_b(be, b);
|
||||
|
||||
vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32);
|
||||
vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 16);
|
||||
|
||||
RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
|
||||
}
|
||||
|
||||
void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
|
||||
@ -106,9 +105,6 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
|
||||
|
||||
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
|
||||
|
||||
RECON_INVOKE(&rtcd->common->recon, recon_mby)
|
||||
(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
|
||||
|
||||
}
|
||||
|
||||
void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
|
||||
@ -126,5 +122,4 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
|
||||
|
||||
vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
|
||||
|
||||
vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
|
||||
}
|
||||
|
@ -577,9 +577,70 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
|
||||
}
|
||||
}
|
||||
|
||||
static void recon_dcblock(MACROBLOCKD *x)
|
||||
{
|
||||
BLOCKD *b = &x->block[24];
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 16; i++)
|
||||
{
|
||||
x->block[i].dqcoeff[0] = b->diff[i];
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
static void inverse_transform_mb(const vp8_idct_rtcd_vtable_t *rtcd,
|
||||
MACROBLOCKD *x)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (x->mode_info_context->mbmi.mode != B_PRED &&
|
||||
x->mode_info_context->mbmi.mode != SPLITMV)
|
||||
{
|
||||
/* do 2nd order transform on the dc block */
|
||||
|
||||
IDCT_INVOKE(rtcd, iwalsh16)(&x->block[24].dqcoeff[0], x->block[24].diff);
|
||||
recon_dcblock(x);
|
||||
}
|
||||
|
||||
for (i = 0; i < 16; i++)
|
||||
{
|
||||
BLOCKD *b = &x->block[i];
|
||||
|
||||
if (b->eob > 1)
|
||||
{
|
||||
IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, 16,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, 16,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for (i = 16; i < 24; i++)
|
||||
{
|
||||
BLOCKD *b = &x->block[i];
|
||||
|
||||
if (b->eob > 1)
|
||||
{
|
||||
IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->predictor, 8,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
}
|
||||
else
|
||||
{
|
||||
IDCT_INVOKE(rtcd, idct1_scalar_add)(b->dqcoeff[0], b->predictor, 8,
|
||||
*(b->base_dst) + b->dst, b->dst_stride);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
|
||||
{
|
||||
vp8_build_inter_predictors_mb(&x->e_mbd);
|
||||
vp8_build_inter_predictors_mb_e(&x->e_mbd);
|
||||
|
||||
vp8_subtract_mb(rtcd, x);
|
||||
|
||||
@ -590,10 +651,8 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
|
||||
if (x->optimize)
|
||||
optimize_mb(x, rtcd);
|
||||
|
||||
vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
|
||||
inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
|
||||
|
||||
RECON_INVOKE(&rtcd->common->recon, recon_mb)
|
||||
(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
|
||||
}
|
||||
|
||||
|
||||
@ -612,6 +671,4 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
|
||||
|
||||
vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
|
||||
|
||||
RECON_INVOKE(&rtcd->common->recon, recon_mby)
|
||||
(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
|
||||
}
|
||||
|
@ -157,7 +157,7 @@ static int pick_intra4x4block(
|
||||
|
||||
rate = mode_costs[mode];
|
||||
RECON_INVOKE(&rtcd->common->recon, intra4x4_predict)
|
||||
(b, mode, b->predictor);
|
||||
(b, mode, b->predictor, 16);
|
||||
distortion = get_prediction_error(be, b, &rtcd->variance);
|
||||
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
|
||||
|
||||
|
@ -631,7 +631,7 @@ static int rd_pick_intra4x4block(
|
||||
rate = bmode_costs[mode];
|
||||
|
||||
RECON_INVOKE(&cpi->rtcd.common->recon, intra4x4_predict)
|
||||
(b, mode, b->predictor);
|
||||
(b, mode, b->predictor, 16);
|
||||
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16);
|
||||
x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
|
||||
x->quantize_b(be, b);
|
||||
@ -660,8 +660,8 @@ static int rd_pick_intra4x4block(
|
||||
}
|
||||
b->bmi.as_mode = (B_PREDICTION_MODE)(*best_mode);
|
||||
|
||||
IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff, b->diff, 32);
|
||||
RECON_INVOKE(IF_RTCD(&cpi->rtcd.common->recon), recon)(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
|
||||
IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(best_dqcoeff,
|
||||
best_predictor, 16, *(b->base_dst) + b->dst, b->dst_stride);
|
||||
|
||||
return best_rd;
|
||||
}
|
||||
|
@ -64,7 +64,6 @@ VP8_COMMON_SRCS-yes += common/mbpitch.c
|
||||
VP8_COMMON_SRCS-yes += common/modecont.c
|
||||
VP8_COMMON_SRCS-yes += common/modecontext.c
|
||||
VP8_COMMON_SRCS-yes += common/quant_common.c
|
||||
VP8_COMMON_SRCS-yes += common/recon.c
|
||||
VP8_COMMON_SRCS-yes += common/reconinter.c
|
||||
VP8_COMMON_SRCS-yes += common/reconintra.c
|
||||
VP8_COMMON_SRCS-yes += common/reconintra4x4.c
|
||||
@ -125,7 +124,6 @@ VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/iwalsh_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/filter_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/idct_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/loopfilter_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/recon_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/simpleloopfilter_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV6) += common/arm/armv6/sixtappredict8x4_v6$(ASM)
|
||||
|
||||
@ -143,16 +141,10 @@ VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfilter_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/mbloopfilter_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon2b_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon4b_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/reconb_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_1_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/shortidct4x4llm_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict4x4_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x4_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict8x8_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/sixtappredict16x16_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon16x16mb_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/save_neon_reg$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_ARMV7) += common/arm/neon/recon_neon.c
|
||||
|
Loading…
x
Reference in New Issue
Block a user