Added predictor stride argument(s) to subtract functions
Patch set 2: 64 bit build fix. Patch set 3: 64 bit crash fix [Tero]. Patch set 4: Updated ARMv6 and NEON assembly; also added minor NEON optimizations to subtract functions. Patch set 5: x86 stride bug fix. Change-Id: I1fcca93e90c89b89ddc204e1c18f208682675c15
This commit is contained in:
parent
2a6daa72f0
commit
edd98b7310
@ -72,22 +72,23 @@ loop_block
|
||||
; r0 short *diff
|
||||
; r1 unsigned char *usrc
|
||||
; r2 unsigned char *vsrc
|
||||
; r3 unsigned char *pred
|
||||
; stack int stride
|
||||
; r3 int src_stride
|
||||
; sp unsigned char *upred
|
||||
; sp unsigned char *vpred
|
||||
; sp int pred_stride
|
||||
|vp8_subtract_mbuv_armv6| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
stmfd sp!, {r4-r11}
|
||||
|
||||
add r0, r0, #512 ; set *diff point to Cb
|
||||
add r3, r3, #256 ; set *pred point to Cb
|
||||
|
||||
mov r4, #8 ; loop count
|
||||
ldr r5, [sp, #40] ; stride
|
||||
ldr r5, [sp, #32] ; upred
|
||||
ldr r12, [sp, #40] ; pred_stride
|
||||
|
||||
; Subtract U block
|
||||
loop_u
|
||||
ldr r6, [r1] ; src (A)
|
||||
ldr r7, [r3], #4 ; pred (A)
|
||||
ldr r6, [r1] ; usrc (A)
|
||||
ldr r7, [r5] ; upred (A)
|
||||
|
||||
uxtb16 r8, r6 ; [s2 | s0] (A)
|
||||
uxtb16 r9, r7 ; [p2 | p0] (A)
|
||||
@ -97,8 +98,8 @@ loop_u
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (A)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (A)
|
||||
|
||||
ldr r10, [r1, #4] ; src (B)
|
||||
ldr r11, [r3], #4 ; pred (B)
|
||||
ldr r10, [r1, #4] ; usrc (B)
|
||||
ldr r11, [r5, #4] ; upred (B)
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
|
||||
@ -114,7 +115,8 @@ loop_u
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (B)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (B)
|
||||
|
||||
add r1, r1, r5 ; update usrc pointer
|
||||
add r1, r1, r3 ; update usrc pointer
|
||||
add r5, r5, r12 ; update upred pointer
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
|
||||
@ -125,12 +127,13 @@ loop_u
|
||||
|
||||
bne loop_u
|
||||
|
||||
ldr r5, [sp, #36] ; vpred
|
||||
mov r4, #8 ; loop count
|
||||
|
||||
; Subtract V block
|
||||
loop_v
|
||||
ldr r6, [r2] ; src (A)
|
||||
ldr r7, [r3], #4 ; pred (A)
|
||||
ldr r6, [r2] ; vsrc (A)
|
||||
ldr r7, [r5] ; vpred (A)
|
||||
|
||||
uxtb16 r8, r6 ; [s2 | s0] (A)
|
||||
uxtb16 r9, r7 ; [p2 | p0] (A)
|
||||
@ -140,8 +143,8 @@ loop_v
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (A)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (A)
|
||||
|
||||
ldr r10, [r2, #4] ; src (B)
|
||||
ldr r11, [r3], #4 ; pred (B)
|
||||
ldr r10, [r2, #4] ; vsrc (B)
|
||||
ldr r11, [r5, #4] ; vpred (B)
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
|
||||
@ -157,7 +160,8 @@ loop_v
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (B)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (B)
|
||||
|
||||
add r2, r2, r5 ; update vsrc pointer
|
||||
add r2, r2, r3 ; update vsrc pointer
|
||||
add r5, r5, r12 ; update vpred pointer
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
|
||||
@ -168,23 +172,25 @@ loop_v
|
||||
|
||||
bne loop_v
|
||||
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
ldmfd sp!, {r4-r11}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
|
||||
; r0 short *diff
|
||||
; r1 unsigned char *src
|
||||
; r2 unsigned char *pred
|
||||
; r3 int stride
|
||||
; r2 int src_stride
|
||||
; r3 unsigned char *pred
|
||||
; sp int pred_stride
|
||||
|vp8_subtract_mby_armv6| PROC
|
||||
|
||||
stmfd sp!, {r4-r11}
|
||||
|
||||
ldr r12, [sp, #32] ; pred_stride
|
||||
mov r4, #16
|
||||
loop
|
||||
ldr r6, [r1] ; src (A)
|
||||
ldr r7, [r2], #4 ; pred (A)
|
||||
ldr r7, [r3] ; pred (A)
|
||||
|
||||
uxtb16 r8, r6 ; [s2 | s0] (A)
|
||||
uxtb16 r9, r7 ; [p2 | p0] (A)
|
||||
@ -195,7 +201,7 @@ loop
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (A)
|
||||
|
||||
ldr r10, [r1, #4] ; src (B)
|
||||
ldr r11, [r2], #4 ; pred (B)
|
||||
ldr r11, [r3, #4] ; pred (B)
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (A)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (A)
|
||||
@ -212,7 +218,7 @@ loop
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (B)
|
||||
|
||||
ldr r10, [r1, #8] ; src (C)
|
||||
ldr r11, [r2], #4 ; pred (C)
|
||||
ldr r11, [r3, #8] ; pred (C)
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (B)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (B)
|
||||
@ -229,10 +235,10 @@ loop
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (C)
|
||||
|
||||
ldr r10, [r1, #12] ; src (D)
|
||||
ldr r11, [r2], #4 ; pred (D)
|
||||
ldr r11, [r3, #12] ; pred (D)
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (C)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (C)
|
||||
|
||||
str r8, [r0], #4 ; diff (C)
|
||||
uxtb16 r8, r10 ; [s2 | s0] (D)
|
||||
@ -245,7 +251,8 @@ loop
|
||||
usub16 r6, r8, r9 ; [d2 | d0] (D)
|
||||
usub16 r7, r10, r11 ; [d3 | d1] (D)
|
||||
|
||||
add r1, r1, r3 ; update src pointer
|
||||
add r1, r1, r2 ; update src pointer
|
||||
add r3, r3, r12 ; update pred pointer
|
||||
|
||||
pkhbt r8, r6, r7, lsl #16 ; [d1 | d0] (D)
|
||||
pkhtb r9, r7, r6, asr #16 ; [d3 | d2] (D)
|
||||
@ -257,7 +264,7 @@ loop
|
||||
bne loop
|
||||
|
||||
ldmfd sp!, {r4-r11}
|
||||
mov pc, lr
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
|
@ -61,19 +61,24 @@
|
||||
|
||||
|
||||
;==========================================
|
||||
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
|
||||
;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride,
|
||||
; unsigned char *pred, int pred_stride)
|
||||
|vp8_subtract_mby_neon| PROC
|
||||
push {r4-r7}
|
||||
mov r12, #4
|
||||
ldr r4, [sp, #16] ; pred_stride
|
||||
mov r6, #32 ; "diff" stride x2
|
||||
add r5, r0, #16 ; second diff pointer
|
||||
|
||||
subtract_mby_loop
|
||||
vld1.8 {q0}, [r1], r3 ;load src
|
||||
vld1.8 {q1}, [r2]! ;load pred
|
||||
vld1.8 {q2}, [r1], r3
|
||||
vld1.8 {q3}, [r2]!
|
||||
vld1.8 {q4}, [r1], r3
|
||||
vld1.8 {q5}, [r2]!
|
||||
vld1.8 {q6}, [r1], r3
|
||||
vld1.8 {q7}, [r2]!
|
||||
vld1.8 {q0}, [r1], r2 ;load src
|
||||
vld1.8 {q1}, [r3], r4 ;load pred
|
||||
vld1.8 {q2}, [r1], r2
|
||||
vld1.8 {q3}, [r3], r4
|
||||
vld1.8 {q4}, [r1], r2
|
||||
vld1.8 {q5}, [r3], r4
|
||||
vld1.8 {q6}, [r1], r2
|
||||
vld1.8 {q7}, [r3], r4
|
||||
|
||||
vsubl.u8 q8, d0, d2
|
||||
vsubl.u8 q9, d1, d3
|
||||
@ -84,46 +89,53 @@ subtract_mby_loop
|
||||
vsubl.u8 q14, d12, d14
|
||||
vsubl.u8 q15, d13, d15
|
||||
|
||||
vst1.16 {q8}, [r0]! ;store diff
|
||||
vst1.16 {q9}, [r0]!
|
||||
vst1.16 {q10}, [r0]!
|
||||
vst1.16 {q11}, [r0]!
|
||||
vst1.16 {q12}, [r0]!
|
||||
vst1.16 {q13}, [r0]!
|
||||
vst1.16 {q14}, [r0]!
|
||||
vst1.16 {q15}, [r0]!
|
||||
vst1.16 {q8}, [r0], r6 ;store diff
|
||||
vst1.16 {q9}, [r5], r6
|
||||
vst1.16 {q10}, [r0], r6
|
||||
vst1.16 {q11}, [r5], r6
|
||||
vst1.16 {q12}, [r0], r6
|
||||
vst1.16 {q13}, [r5], r6
|
||||
vst1.16 {q14}, [r0], r6
|
||||
vst1.16 {q15}, [r5], r6
|
||||
|
||||
subs r12, r12, #1
|
||||
bne subtract_mby_loop
|
||||
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
ENDP
|
||||
|
||||
;=================================
|
||||
;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
|
||||
;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc,
|
||||
; int src_stride, unsigned char *upred,
|
||||
; unsigned char *vpred, int pred_stride)
|
||||
|
||||
|vp8_subtract_mbuv_neon| PROC
|
||||
ldr r12, [sp]
|
||||
push {r4-r7}
|
||||
ldr r4, [sp, #16] ; upred
|
||||
ldr r5, [sp, #20] ; vpred
|
||||
ldr r6, [sp, #24] ; pred_stride
|
||||
add r0, r0, #512 ; short *udiff = diff + 256;
|
||||
mov r12, #32 ; "diff" stride x2
|
||||
add r7, r0, #16 ; second diff pointer
|
||||
|
||||
;u
|
||||
add r0, r0, #512 ; short *udiff = diff + 256;
|
||||
add r3, r3, #256 ; unsigned char *upred = pred + 256;
|
||||
|
||||
vld1.8 {d0}, [r1], r12 ;load src
|
||||
vld1.8 {d1}, [r3]! ;load pred
|
||||
vld1.8 {d2}, [r1], r12
|
||||
vld1.8 {d3}, [r3]!
|
||||
vld1.8 {d4}, [r1], r12
|
||||
vld1.8 {d5}, [r3]!
|
||||
vld1.8 {d6}, [r1], r12
|
||||
vld1.8 {d7}, [r3]!
|
||||
vld1.8 {d8}, [r1], r12
|
||||
vld1.8 {d9}, [r3]!
|
||||
vld1.8 {d10}, [r1], r12
|
||||
vld1.8 {d11}, [r3]!
|
||||
vld1.8 {d12}, [r1], r12
|
||||
vld1.8 {d13}, [r3]!
|
||||
vld1.8 {d14}, [r1], r12
|
||||
vld1.8 {d15}, [r3]!
|
||||
vld1.8 {d0}, [r1], r3 ;load usrc
|
||||
vld1.8 {d1}, [r4], r6 ;load upred
|
||||
vld1.8 {d2}, [r1], r3
|
||||
vld1.8 {d3}, [r4], r6
|
||||
vld1.8 {d4}, [r1], r3
|
||||
vld1.8 {d5}, [r4], r6
|
||||
vld1.8 {d6}, [r1], r3
|
||||
vld1.8 {d7}, [r4], r6
|
||||
vld1.8 {d8}, [r1], r3
|
||||
vld1.8 {d9}, [r4], r6
|
||||
vld1.8 {d10}, [r1], r3
|
||||
vld1.8 {d11}, [r4], r6
|
||||
vld1.8 {d12}, [r1], r3
|
||||
vld1.8 {d13}, [r4], r6
|
||||
vld1.8 {d14}, [r1], r3
|
||||
vld1.8 {d15}, [r4], r6
|
||||
|
||||
vsubl.u8 q8, d0, d1
|
||||
vsubl.u8 q9, d2, d3
|
||||
@ -134,32 +146,32 @@ subtract_mby_loop
|
||||
vsubl.u8 q14, d12, d13
|
||||
vsubl.u8 q15, d14, d15
|
||||
|
||||
vst1.16 {q8}, [r0]! ;store diff
|
||||
vst1.16 {q9}, [r0]!
|
||||
vst1.16 {q10}, [r0]!
|
||||
vst1.16 {q11}, [r0]!
|
||||
vst1.16 {q12}, [r0]!
|
||||
vst1.16 {q13}, [r0]!
|
||||
vst1.16 {q14}, [r0]!
|
||||
vst1.16 {q15}, [r0]!
|
||||
vst1.16 {q8}, [r0], r12 ;store diff
|
||||
vst1.16 {q9}, [r7], r12
|
||||
vst1.16 {q10}, [r0], r12
|
||||
vst1.16 {q11}, [r7], r12
|
||||
vst1.16 {q12}, [r0], r12
|
||||
vst1.16 {q13}, [r7], r12
|
||||
vst1.16 {q14}, [r0], r12
|
||||
vst1.16 {q15}, [r7], r12
|
||||
|
||||
;v
|
||||
vld1.8 {d0}, [r2], r12 ;load src
|
||||
vld1.8 {d1}, [r3]! ;load pred
|
||||
vld1.8 {d2}, [r2], r12
|
||||
vld1.8 {d3}, [r3]!
|
||||
vld1.8 {d4}, [r2], r12
|
||||
vld1.8 {d5}, [r3]!
|
||||
vld1.8 {d6}, [r2], r12
|
||||
vld1.8 {d7}, [r3]!
|
||||
vld1.8 {d8}, [r2], r12
|
||||
vld1.8 {d9}, [r3]!
|
||||
vld1.8 {d10}, [r2], r12
|
||||
vld1.8 {d11}, [r3]!
|
||||
vld1.8 {d12}, [r2], r12
|
||||
vld1.8 {d13}, [r3]!
|
||||
vld1.8 {d14}, [r2], r12
|
||||
vld1.8 {d15}, [r3]!
|
||||
vld1.8 {d0}, [r2], r3 ;load vsrc
|
||||
vld1.8 {d1}, [r5], r6 ;load vpred
|
||||
vld1.8 {d2}, [r2], r3
|
||||
vld1.8 {d3}, [r5], r6
|
||||
vld1.8 {d4}, [r2], r3
|
||||
vld1.8 {d5}, [r5], r6
|
||||
vld1.8 {d6}, [r2], r3
|
||||
vld1.8 {d7}, [r5], r6
|
||||
vld1.8 {d8}, [r2], r3
|
||||
vld1.8 {d9}, [r5], r6
|
||||
vld1.8 {d10}, [r2], r3
|
||||
vld1.8 {d11}, [r5], r6
|
||||
vld1.8 {d12}, [r2], r3
|
||||
vld1.8 {d13}, [r5], r6
|
||||
vld1.8 {d14}, [r2], r3
|
||||
vld1.8 {d15}, [r5], r6
|
||||
|
||||
vsubl.u8 q8, d0, d1
|
||||
vsubl.u8 q9, d2, d3
|
||||
@ -170,16 +182,18 @@ subtract_mby_loop
|
||||
vsubl.u8 q14, d12, d13
|
||||
vsubl.u8 q15, d14, d15
|
||||
|
||||
vst1.16 {q8}, [r0]! ;store diff
|
||||
vst1.16 {q9}, [r0]!
|
||||
vst1.16 {q10}, [r0]!
|
||||
vst1.16 {q11}, [r0]!
|
||||
vst1.16 {q12}, [r0]!
|
||||
vst1.16 {q13}, [r0]!
|
||||
vst1.16 {q14}, [r0]!
|
||||
vst1.16 {q15}, [r0]!
|
||||
vst1.16 {q8}, [r0], r12 ;store diff
|
||||
vst1.16 {q9}, [r7], r12
|
||||
vst1.16 {q10}, [r0], r12
|
||||
vst1.16 {q11}, [r7], r12
|
||||
vst1.16 {q12}, [r0], r12
|
||||
vst1.16 {q13}, [r7], r12
|
||||
vst1.16 {q14}, [r0], r12
|
||||
vst1.16 {q15}, [r7], r12
|
||||
|
||||
pop {r4-r7}
|
||||
bx lr
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
||||
|
@ -100,7 +100,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
|
||||
RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);
|
||||
|
||||
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
|
||||
x->e_mbd.predictor, b->src_stride);
|
||||
b->src_stride, x->e_mbd.predictor, 16);
|
||||
|
||||
vp8_transform_intra_mby(x);
|
||||
|
||||
@ -115,7 +115,9 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
|
||||
{
|
||||
RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd);
|
||||
|
||||
ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
|
||||
ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
|
||||
x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],
|
||||
&x->e_mbd.predictor[320], 8);
|
||||
|
||||
vp8_transform_mbuv(x);
|
||||
|
||||
|
@ -48,12 +48,12 @@ void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
|
||||
void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
|
||||
int src_stride, unsigned char *upred,
|
||||
unsigned char *vpred, int pred_stride)
|
||||
{
|
||||
short *udiff = diff + 256;
|
||||
short *vdiff = diff + 320;
|
||||
unsigned char *upred = pred + 256;
|
||||
unsigned char *vpred = pred + 320;
|
||||
|
||||
int r, c;
|
||||
|
||||
@ -65,8 +65,8 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
|
||||
}
|
||||
|
||||
udiff += 8;
|
||||
upred += 8;
|
||||
usrc += stride;
|
||||
upred += pred_stride;
|
||||
usrc += src_stride;
|
||||
}
|
||||
|
||||
for (r = 0; r < 8; r++)
|
||||
@ -77,12 +77,13 @@ void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
|
||||
}
|
||||
|
||||
vdiff += 8;
|
||||
vpred += 8;
|
||||
vsrc += stride;
|
||||
vpred += pred_stride;
|
||||
vsrc += src_stride;
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
|
||||
void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
|
||||
unsigned char *pred, int pred_stride)
|
||||
{
|
||||
int r, c;
|
||||
|
||||
@ -94,8 +95,8 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, in
|
||||
}
|
||||
|
||||
diff += 16;
|
||||
pred += 16;
|
||||
src += stride;
|
||||
pred += pred_stride;
|
||||
src += src_stride;
|
||||
}
|
||||
}
|
||||
|
||||
@ -103,8 +104,11 @@ static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
|
||||
{
|
||||
BLOCK *b = &x->block[0];
|
||||
|
||||
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
|
||||
ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
|
||||
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
|
||||
b->src_stride, x->e_mbd.predictor, 16);
|
||||
ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
|
||||
x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],
|
||||
&x->e_mbd.predictor[320], 8);
|
||||
}
|
||||
|
||||
static void build_dcblock(MACROBLOCK *x)
|
||||
@ -641,7 +645,8 @@ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
|
||||
|
||||
vp8_build_inter16x16_predictors_mby(&x->e_mbd);
|
||||
|
||||
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
|
||||
ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
|
||||
b->src_stride, x->e_mbd.predictor, 16);
|
||||
|
||||
transform_mby(x);
|
||||
|
||||
|
@ -28,11 +28,13 @@
|
||||
void (sym)(BLOCK *be,BLOCKD *bd, int pitch)
|
||||
|
||||
#define prototype_submby(sym) \
|
||||
void (sym)(short *diff, unsigned char *src, unsigned char *pred, int stride)
|
||||
void (sym)(short *diff, unsigned char *src, int src_stride, \
|
||||
unsigned char *pred, int pred_stride)
|
||||
|
||||
#define prototype_submbuv(sym) \
|
||||
void (sym)(short *diff, unsigned char *usrc, unsigned char *vsrc,\
|
||||
unsigned char *pred, int stride)
|
||||
int src_stride, unsigned char *upred, unsigned char *vpred,\
|
||||
int pred_stride)
|
||||
|
||||
#if ARCH_X86 || ARCH_X86_64
|
||||
#include "x86/encodemb_x86.h"
|
||||
|
@ -552,7 +552,7 @@ static void macro_block_yrd( MACROBLOCK *mb,
|
||||
int d;
|
||||
|
||||
ENCODEMB_INVOKE(rtcd, submby)( mb->src_diff, *(mb->block[0].base_src),
|
||||
mb->e_mbd.predictor, mb->block[0].src_stride );
|
||||
mb->block[0].src_stride, mb->e_mbd.predictor, 16);
|
||||
|
||||
// Fdct and building the 2nd order block
|
||||
for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
|
||||
@ -800,7 +800,8 @@ static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
|
||||
{
|
||||
vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
|
||||
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
|
||||
x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
|
||||
x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
|
||||
&x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
|
||||
|
||||
vp8_transform_mbuv(x);
|
||||
vp8_quantize_mbuv(x);
|
||||
@ -816,7 +817,8 @@ static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
|
||||
{
|
||||
vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
|
||||
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
|
||||
x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
|
||||
x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
|
||||
&x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
|
||||
|
||||
vp8_transform_mbuv(x);
|
||||
vp8_quantize_mbuv(x);
|
||||
@ -845,8 +847,8 @@ static void rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int
|
||||
RECON_INVOKE(&cpi->rtcd.common->recon, build_intra_predictors_mbuv)
|
||||
(&x->e_mbd);
|
||||
ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
|
||||
x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor,
|
||||
x->src.uv_stride);
|
||||
x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
|
||||
&x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
|
||||
vp8_transform_mbuv(x);
|
||||
vp8_quantize_mbuv(x);
|
||||
|
||||
|
@ -73,85 +73,10 @@ sym(vp8_subtract_b_mmx_impl):
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
|
||||
;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
|
||||
;unsigned char *pred, int pred_stride)
|
||||
global sym(vp8_subtract_mby_mmx)
|
||||
sym(vp8_subtract_mby_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
|
||||
mov rsi, arg(1) ;src
|
||||
mov rdi, arg(0) ;diff
|
||||
|
||||
mov rax, arg(2) ;pred
|
||||
movsxd rdx, dword ptr arg(3) ;stride
|
||||
|
||||
mov rcx, 16
|
||||
pxor mm0, mm0
|
||||
|
||||
.submby_loop:
|
||||
|
||||
movq mm1, [rsi]
|
||||
movq mm3, [rax]
|
||||
|
||||
movq mm2, mm1
|
||||
movq mm4, mm3
|
||||
|
||||
punpcklbw mm1, mm0
|
||||
punpcklbw mm3, mm0
|
||||
|
||||
punpckhbw mm2, mm0
|
||||
punpckhbw mm4, mm0
|
||||
|
||||
psubw mm1, mm3
|
||||
psubw mm2, mm4
|
||||
|
||||
movq [rdi], mm1
|
||||
movq [rdi+8], mm2
|
||||
|
||||
|
||||
movq mm1, [rsi+8]
|
||||
movq mm3, [rax+8]
|
||||
|
||||
movq mm2, mm1
|
||||
movq mm4, mm3
|
||||
|
||||
punpcklbw mm1, mm0
|
||||
punpcklbw mm3, mm0
|
||||
|
||||
punpckhbw mm2, mm0
|
||||
punpckhbw mm4, mm0
|
||||
|
||||
psubw mm1, mm3
|
||||
psubw mm2, mm4
|
||||
|
||||
movq [rdi+16], mm1
|
||||
movq [rdi+24], mm2
|
||||
|
||||
|
||||
add rdi, 32
|
||||
add rax, 16
|
||||
|
||||
lea rsi, [rsi+rdx]
|
||||
|
||||
sub rcx, 1
|
||||
jnz .submby_loop
|
||||
|
||||
pop rdi
|
||||
pop rsi
|
||||
; begin epilog
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
|
||||
global sym(vp8_subtract_mbuv_mmx)
|
||||
sym(vp8_subtract_mbuv_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
@ -159,271 +84,137 @@ sym(vp8_subtract_mbuv_mmx):
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
;short *udiff = diff + 256;
|
||||
;short *vdiff = diff + 320;
|
||||
;unsigned char *upred = pred + 256;
|
||||
;unsigned char *vpred = pred + 320;
|
||||
mov rdi, arg(0) ;diff
|
||||
mov rsi, arg(1) ;src
|
||||
movsxd rdx, dword ptr arg(2);src_stride
|
||||
mov rax, arg(3) ;pred
|
||||
push rbx
|
||||
movsxd rbx, dword ptr arg(4);pred_stride
|
||||
|
||||
;unsigned char *z = usrc;
|
||||
;unsigned short *diff = udiff;
|
||||
;unsigned char *Predictor= upred;
|
||||
|
||||
mov rdi, arg(0) ;diff
|
||||
mov rax, arg(3) ;pred
|
||||
mov rsi, arg(1) ;z = usrc
|
||||
add rdi, 256*2 ;diff = diff + 256 (shorts)
|
||||
add rax, 256 ;Predictor = pred + 256
|
||||
movsxd rdx, dword ptr arg(4) ;stride;
|
||||
pxor mm7, mm7
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm1, [rax]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
pxor mm0, mm0
|
||||
mov rcx, 16
|
||||
|
||||
|
||||
movq mm0, [rsi+rdx]
|
||||
movq mm1, [rax+8]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi+16], mm0
|
||||
movq [rdi+24], mm3
|
||||
.submby_loop:
|
||||
movq mm1, [rsi]
|
||||
movq mm3, [rax]
|
||||
|
||||
movq mm0, [rsi+rdx*2]
|
||||
movq mm1, [rax+16]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi+32], mm0
|
||||
movq [rdi+40], mm3
|
||||
lea rsi, [rsi+rdx*2]
|
||||
movq mm2, mm1
|
||||
movq mm4, mm3
|
||||
|
||||
punpcklbw mm1, mm0
|
||||
punpcklbw mm3, mm0
|
||||
|
||||
punpckhbw mm2, mm0
|
||||
punpckhbw mm4, mm0
|
||||
|
||||
psubw mm1, mm3
|
||||
psubw mm2, mm4
|
||||
|
||||
movq [rdi], mm1
|
||||
movq [rdi+8], mm2
|
||||
|
||||
movq mm1, [rsi+8]
|
||||
movq mm3, [rax+8]
|
||||
|
||||
movq mm2, mm1
|
||||
movq mm4, mm3
|
||||
|
||||
punpcklbw mm1, mm0
|
||||
punpcklbw mm3, mm0
|
||||
|
||||
punpckhbw mm2, mm0
|
||||
punpckhbw mm4, mm0
|
||||
|
||||
psubw mm1, mm3
|
||||
psubw mm2, mm4
|
||||
|
||||
movq [rdi+16], mm1
|
||||
movq [rdi+24], mm2
|
||||
add rdi, 32
|
||||
lea rax, [rax+rbx]
|
||||
lea rsi, [rsi+rdx]
|
||||
dec rcx
|
||||
jnz .submby_loop
|
||||
|
||||
pop rbx
|
||||
pop rdi
|
||||
pop rsi
|
||||
; begin epilog
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
movq mm0, [rsi+rdx]
|
||||
movq mm1, [rax+24]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
|
||||
; int src_stride, unsigned char *upred,
|
||||
; unsigned char *vpred, int pred_stride)
|
||||
|
||||
movq [rdi+48], mm0
|
||||
movq [rdi+56], mm3
|
||||
global sym(vp8_subtract_mbuv_mmx)
|
||||
sym(vp8_subtract_mbuv_mmx):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rdi, arg(0) ;diff
|
||||
mov rsi, arg(1) ;usrc
|
||||
movsxd rdx, dword ptr arg(3);src_stride;
|
||||
mov rax, arg(4) ;upred
|
||||
add rdi, 256*2 ;diff = diff + 256 (shorts)
|
||||
mov rcx, 8
|
||||
push rbx
|
||||
movsxd rbx, dword ptr arg(6);pred_stride
|
||||
|
||||
add rdi, 64
|
||||
add rax, 32
|
||||
lea rsi, [rsi+rdx*2]
|
||||
pxor mm7, mm7
|
||||
|
||||
.submbu_loop:
|
||||
movq mm0, [rsi]
|
||||
movq mm1, [rax]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
add rdi, 16
|
||||
add rsi, rdx
|
||||
add rax, rbx
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm1, [rax]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
dec rcx
|
||||
jnz .submbu_loop
|
||||
|
||||
mov rsi, arg(2) ;vsrc
|
||||
mov rax, arg(5) ;vpred
|
||||
mov rcx, 8
|
||||
|
||||
movq mm0, [rsi+rdx]
|
||||
movq mm1, [rax+8]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi+16], mm0
|
||||
movq [rdi+24], mm3
|
||||
.submbv_loop:
|
||||
movq mm0, [rsi]
|
||||
movq mm1, [rax]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
add rdi, 16
|
||||
add rsi, rdx
|
||||
add rax, rbx
|
||||
|
||||
movq mm0, [rsi+rdx*2]
|
||||
movq mm1, [rax+16]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi+32], mm0
|
||||
movq [rdi+40], mm3
|
||||
lea rsi, [rsi+rdx*2]
|
||||
|
||||
|
||||
movq mm0, [rsi+rdx]
|
||||
movq mm1, [rax+24]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
|
||||
movq [rdi+48], mm0
|
||||
movq [rdi+56], mm3
|
||||
|
||||
;unsigned char *z = vsrc;
|
||||
;unsigned short *diff = vdiff;
|
||||
;unsigned char *Predictor= vpred;
|
||||
|
||||
mov rdi, arg(0) ;diff
|
||||
mov rax, arg(3) ;pred
|
||||
mov rsi, arg(2) ;z = usrc
|
||||
add rdi, 320*2 ;diff = diff + 320 (shorts)
|
||||
add rax, 320 ;Predictor = pred + 320
|
||||
movsxd rdx, dword ptr arg(4) ;stride;
|
||||
pxor mm7, mm7
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm1, [rax]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
|
||||
movq mm0, [rsi+rdx]
|
||||
movq mm1, [rax+8]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi+16], mm0
|
||||
movq [rdi+24], mm3
|
||||
|
||||
movq mm0, [rsi+rdx*2]
|
||||
movq mm1, [rax+16]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi+32], mm0
|
||||
movq [rdi+40], mm3
|
||||
lea rsi, [rsi+rdx*2]
|
||||
|
||||
|
||||
movq mm0, [rsi+rdx]
|
||||
movq mm1, [rax+24]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
|
||||
movq [rdi+48], mm0
|
||||
movq [rdi+56], mm3
|
||||
|
||||
|
||||
add rdi, 64
|
||||
add rax, 32
|
||||
lea rsi, [rsi+rdx*2]
|
||||
|
||||
|
||||
movq mm0, [rsi]
|
||||
movq mm1, [rax]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi], mm0
|
||||
movq [rdi+8], mm3
|
||||
|
||||
|
||||
movq mm0, [rsi+rdx]
|
||||
movq mm1, [rax+8]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi+16], mm0
|
||||
movq [rdi+24], mm3
|
||||
|
||||
movq mm0, [rsi+rdx*2]
|
||||
movq mm1, [rax+16]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
movq [rdi+32], mm0
|
||||
movq [rdi+40], mm3
|
||||
lea rsi, [rsi+rdx*2]
|
||||
|
||||
|
||||
movq mm0, [rsi+rdx]
|
||||
movq mm1, [rax+24]
|
||||
movq mm3, mm0
|
||||
movq mm4, mm1
|
||||
punpcklbw mm0, mm7
|
||||
punpcklbw mm1, mm7
|
||||
punpckhbw mm3, mm7
|
||||
punpckhbw mm4, mm7
|
||||
psubw mm0, mm1
|
||||
psubw mm3, mm4
|
||||
|
||||
movq [rdi+48], mm0
|
||||
movq [rdi+56], mm3
|
||||
dec rcx
|
||||
jnz .submbv_loop
|
||||
|
||||
pop rbx
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
|
@ -71,83 +71,10 @@ sym(vp8_subtract_b_sse2_impl):
|
||||
ret
|
||||
|
||||
|
||||
;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
|
||||
;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
|
||||
;unsigned char *pred, int pred_stride)
|
||||
global sym(vp8_subtract_mby_sse2)
|
||||
sym(vp8_subtract_mby_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 4
|
||||
SAVE_XMM 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rsi, arg(1) ;src
|
||||
mov rdi, arg(0) ;diff
|
||||
|
||||
mov rax, arg(2) ;pred
|
||||
movsxd rdx, dword ptr arg(3) ;stride
|
||||
|
||||
mov rcx, 8 ; do two lines at one time
|
||||
|
||||
.submby_loop:
|
||||
movdqa xmm0, XMMWORD PTR [rsi] ; src
|
||||
movdqa xmm1, XMMWORD PTR [rax] ; pred
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1
|
||||
|
||||
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
|
||||
pxor xmm2, [GLOBAL(t80)]
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm3 ; put sign back to subtraction
|
||||
|
||||
movdqa XMMWORD PTR [rdi], xmm0
|
||||
movdqa XMMWORD PTR [rdi +16], xmm2
|
||||
|
||||
movdqa xmm4, XMMWORD PTR [rsi + rdx]
|
||||
movdqa xmm5, XMMWORD PTR [rax + 16]
|
||||
|
||||
movdqa xmm6, xmm4
|
||||
psubb xmm4, xmm5
|
||||
|
||||
pxor xmm5, [GLOBAL(t80)] ;convert to signed values
|
||||
pxor xmm6, [GLOBAL(t80)]
|
||||
pcmpgtb xmm5, xmm6 ; obtain sign information
|
||||
|
||||
movdqa xmm6, xmm4
|
||||
movdqa xmm7, xmm5
|
||||
punpcklbw xmm4, xmm5 ; put sign back to subtraction
|
||||
punpckhbw xmm6, xmm7 ; put sign back to subtraction
|
||||
|
||||
movdqa XMMWORD PTR [rdi +32], xmm4
|
||||
movdqa XMMWORD PTR [rdi +48], xmm6
|
||||
|
||||
add rdi, 64
|
||||
add rax, 32
|
||||
lea rsi, [rsi+rdx*2]
|
||||
|
||||
sub rcx, 1
|
||||
jnz .submby_loop
|
||||
|
||||
pop rdi
|
||||
pop rsi
|
||||
; begin epilog
|
||||
RESTORE_GOT
|
||||
RESTORE_XMM
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
|
||||
;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
|
||||
global sym(vp8_subtract_mbuv_sse2)
|
||||
sym(vp8_subtract_mbuv_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 5
|
||||
@ -156,192 +83,154 @@ sym(vp8_subtract_mbuv_sse2):
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
mov rdi, arg(0) ;diff
|
||||
mov rax, arg(3) ;pred
|
||||
mov rsi, arg(1) ;z = usrc
|
||||
add rdi, 256*2 ;diff = diff + 256 (shorts)
|
||||
add rax, 256 ;Predictor = pred + 256
|
||||
movsxd rdx, dword ptr arg(4) ;stride;
|
||||
lea rcx, [rdx + rdx*2]
|
||||
mov rdi, arg(0) ;diff
|
||||
mov rsi, arg(1) ;src
|
||||
movsxd rdx, dword ptr arg(2);src_stride
|
||||
mov rax, arg(3) ;pred
|
||||
movdqa xmm4, [GLOBAL(t80)]
|
||||
push rbx
|
||||
mov rcx, 8 ; do two lines at one time
|
||||
movsxd rbx, dword ptr arg(4);pred_stride
|
||||
|
||||
;u
|
||||
;line 0 1
|
||||
movq xmm0, MMWORD PTR [rsi] ; src
|
||||
movq xmm2, MMWORD PTR [rsi+rdx]
|
||||
movdqa xmm1, XMMWORD PTR [rax] ; pred
|
||||
punpcklqdq xmm0, xmm2
|
||||
.submby_loop:
|
||||
movdqa xmm0, [rsi] ; src
|
||||
movdqa xmm1, [rax] ; pred
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1 ; subtraction with sign missed
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1
|
||||
|
||||
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
|
||||
pxor xmm2, [GLOBAL(t80)]
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
pxor xmm1, xmm4 ;convert to signed values
|
||||
pxor xmm2, xmm4
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm3 ; put sign back to subtraction
|
||||
movdqa xmm2, xmm0
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm1 ; put sign back to subtraction
|
||||
|
||||
movdqa XMMWORD PTR [rdi], xmm0
|
||||
movdqa XMMWORD PTR [rdi +16], xmm2
|
||||
movdqa xmm3, [rsi + rdx]
|
||||
movdqa xmm5, [rax + rbx]
|
||||
|
||||
;line 2 3
|
||||
movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
|
||||
movq xmm2, MMWORD PTR [rsi+rcx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+16] ; pred
|
||||
punpcklqdq xmm0, xmm2
|
||||
lea rsi, [rsi+rdx*2]
|
||||
lea rax, [rax+rbx*2]
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1 ; subtraction with sign missed
|
||||
movdqa [rdi], xmm0
|
||||
movdqa [rdi +16], xmm2
|
||||
|
||||
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
|
||||
pxor xmm2, [GLOBAL(t80)]
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
movdqa xmm1, xmm3
|
||||
psubb xmm3, xmm5
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm3 ; put sign back to subtraction
|
||||
pxor xmm5, xmm4 ;convert to signed values
|
||||
pxor xmm1, xmm4
|
||||
pcmpgtb xmm5, xmm1 ; obtain sign information
|
||||
|
||||
movdqa XMMWORD PTR [rdi + 32], xmm0
|
||||
movdqa XMMWORD PTR [rdi + 48], xmm2
|
||||
movdqa xmm1, xmm3
|
||||
punpcklbw xmm3, xmm5 ; put sign back to subtraction
|
||||
punpckhbw xmm1, xmm5 ; put sign back to subtraction
|
||||
|
||||
;line 4 5
|
||||
lea rsi, [rsi + rdx*4]
|
||||
movdqa [rdi +32], xmm3
|
||||
movdqa [rdi +48], xmm1
|
||||
|
||||
movq xmm0, MMWORD PTR [rsi] ; src
|
||||
movq xmm2, MMWORD PTR [rsi+rdx]
|
||||
movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
|
||||
punpcklqdq xmm0, xmm2
|
||||
add rdi, 64
|
||||
dec rcx
|
||||
jnz .submby_loop
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1 ; subtraction with sign missed
|
||||
pop rbx
|
||||
pop rdi
|
||||
pop rsi
|
||||
; begin epilog
|
||||
RESTORE_GOT
|
||||
UNSHADOW_ARGS
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
|
||||
pxor xmm2, [GLOBAL(t80)]
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
|
||||
; int src_stride, unsigned char *upred,
|
||||
; unsigned char *vpred, int pred_stride)
|
||||
global sym(vp8_subtract_mbuv_sse2)
|
||||
sym(vp8_subtract_mbuv_sse2):
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
SHADOW_ARGS_TO_STACK 7
|
||||
GET_GOT rbx
|
||||
push rsi
|
||||
push rdi
|
||||
; end prolog
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm3 ; put sign back to subtraction
|
||||
movdqa xmm4, [GLOBAL(t80)]
|
||||
mov rdi, arg(0) ;diff
|
||||
mov rsi, arg(1) ;usrc
|
||||
movsxd rdx, dword ptr arg(3);src_stride;
|
||||
mov rax, arg(4) ;upred
|
||||
add rdi, 256*2 ;diff = diff + 256 (shorts)
|
||||
mov rcx, 4
|
||||
push rbx
|
||||
movsxd rbx, dword ptr arg(6);pred_stride
|
||||
|
||||
movdqa XMMWORD PTR [rdi + 64], xmm0
|
||||
movdqa XMMWORD PTR [rdi + 80], xmm2
|
||||
;u
|
||||
.submbu_loop:
|
||||
movq xmm0, [rsi] ; src
|
||||
movq xmm2, [rsi+rdx] ; src -- next line
|
||||
movq xmm1, [rax] ; pred
|
||||
movq xmm3, [rax+rbx] ; pred -- next line
|
||||
lea rsi, [rsi + rdx*2]
|
||||
lea rax, [rax + rbx*2]
|
||||
|
||||
;line 6 7
|
||||
movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
|
||||
movq xmm2, MMWORD PTR [rsi+rcx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
|
||||
punpcklqdq xmm0, xmm2
|
||||
punpcklqdq xmm0, xmm2
|
||||
punpcklqdq xmm1, xmm3
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1 ; subtraction with sign missed
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1 ; subtraction with sign missed
|
||||
|
||||
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
|
||||
pxor xmm2, [GLOBAL(t80)]
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
pxor xmm1, xmm4 ;convert to signed values
|
||||
pxor xmm2, xmm4
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm3 ; put sign back to subtraction
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm3 ; put sign back to subtraction
|
||||
|
||||
movdqa XMMWORD PTR [rdi + 96], xmm0
|
||||
movdqa XMMWORD PTR [rdi + 112], xmm2
|
||||
movdqa [rdi], xmm0 ; store difference
|
||||
movdqa [rdi +16], xmm2 ; store difference
|
||||
add rdi, 32
|
||||
sub rcx, 1
|
||||
jnz .submbu_loop
|
||||
|
||||
;v
|
||||
mov rsi, arg(2) ;z = vsrc
|
||||
add rdi, 64*2 ;diff = diff + 320 (shorts)
|
||||
add rax, 64 ;Predictor = pred + 320
|
||||
mov rsi, arg(2) ;vsrc
|
||||
mov rax, arg(5) ;vpred
|
||||
mov rcx, 4
|
||||
|
||||
;line 0 1
|
||||
movq xmm0, MMWORD PTR [rsi] ; src
|
||||
movq xmm2, MMWORD PTR [rsi+rdx]
|
||||
movdqa xmm1, XMMWORD PTR [rax] ; pred
|
||||
punpcklqdq xmm0, xmm2
|
||||
;v
|
||||
.submbv_loop:
|
||||
movq xmm0, [rsi] ; src
|
||||
movq xmm2, [rsi+rdx] ; src -- next line
|
||||
movq xmm1, [rax] ; pred
|
||||
movq xmm3, [rax+rbx] ; pred -- next line
|
||||
lea rsi, [rsi + rdx*2]
|
||||
lea rax, [rax + rbx*2]
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1 ; subtraction with sign missed
|
||||
punpcklqdq xmm0, xmm2
|
||||
punpcklqdq xmm1, xmm3
|
||||
|
||||
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
|
||||
pxor xmm2, [GLOBAL(t80)]
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1 ; subtraction with sign missed
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm3 ; put sign back to subtraction
|
||||
pxor xmm1, xmm4 ;convert to signed values
|
||||
pxor xmm2, xmm4
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
|
||||
movdqa XMMWORD PTR [rdi], xmm0
|
||||
movdqa XMMWORD PTR [rdi +16], xmm2
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm3 ; put sign back to subtraction
|
||||
|
||||
;line 2 3
|
||||
movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
|
||||
movq xmm2, MMWORD PTR [rsi+rcx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+16] ; pred
|
||||
punpcklqdq xmm0, xmm2
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1 ; subtraction with sign missed
|
||||
|
||||
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
|
||||
pxor xmm2, [GLOBAL(t80)]
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm3 ; put sign back to subtraction
|
||||
|
||||
movdqa XMMWORD PTR [rdi + 32], xmm0
|
||||
movdqa XMMWORD PTR [rdi + 48], xmm2
|
||||
|
||||
;line 4 5
|
||||
lea rsi, [rsi + rdx*4]
|
||||
|
||||
movq xmm0, MMWORD PTR [rsi] ; src
|
||||
movq xmm2, MMWORD PTR [rsi+rdx]
|
||||
movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
|
||||
punpcklqdq xmm0, xmm2
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1 ; subtraction with sign missed
|
||||
|
||||
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
|
||||
pxor xmm2, [GLOBAL(t80)]
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm3 ; put sign back to subtraction
|
||||
|
||||
movdqa XMMWORD PTR [rdi + 64], xmm0
|
||||
movdqa XMMWORD PTR [rdi + 80], xmm2
|
||||
|
||||
;line 6 7
|
||||
movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
|
||||
movq xmm2, MMWORD PTR [rsi+rcx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
|
||||
punpcklqdq xmm0, xmm2
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
psubb xmm0, xmm1 ; subtraction with sign missed
|
||||
|
||||
pxor xmm1, [GLOBAL(t80)] ;convert to signed values
|
||||
pxor xmm2, [GLOBAL(t80)]
|
||||
pcmpgtb xmm1, xmm2 ; obtain sign information
|
||||
|
||||
movdqa xmm2, xmm0
|
||||
movdqa xmm3, xmm1
|
||||
punpcklbw xmm0, xmm1 ; put sign back to subtraction
|
||||
punpckhbw xmm2, xmm3 ; put sign back to subtraction
|
||||
|
||||
movdqa XMMWORD PTR [rdi + 96], xmm0
|
||||
movdqa XMMWORD PTR [rdi + 112], xmm2
|
||||
movdqa [rdi], xmm0 ; store difference
|
||||
movdqa [rdi +16], xmm2 ; store difference
|
||||
add rdi, 32
|
||||
sub rcx, 1
|
||||
jnz .submbv_loop
|
||||
|
||||
pop rbx
|
||||
; begin epilog
|
||||
pop rdi
|
||||
pop rsi
|
||||
|
Loading…
x
Reference in New Issue
Block a user