Move variance functions to vpx_dsp
Subpel variance functions will be moved in another patch.

Change-Id: Idb2e049bad0b9b32ac42cc7731cd6903de2826ce
This commit is contained in:
parent 976f7f42c1
commit c3bdffb0a5
File diff suppressed because it is too large
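For orientation: every function touched by this commit computes the same whole-pixel block variance, the sum of squared pixel differences (sse) minus the squared pixel sum divided by the block area. Below is a minimal C reference sketch of that computation; it is not code from the diff, and the helper name block_variance_ref is made up. The final division corresponds to the ">> 8" (16x16), ">> 7" (16x8 / 8x16) and ">> 6" (8x8) shifts seen in the assembly and C code that follow.

#include <stdint.h>

/* Illustrative sketch only (hypothetical helper, not part of this commit):
 * plain-C whole-pixel variance over a w x h block.
 * Returns sse - (sum * sum) / (w * h); when w * h is a power of two this
 * division is the right shift used by the optimized versions below. */
static unsigned int block_variance_ref(const unsigned char *src, int src_stride,
                                       const unsigned char *ref, int ref_stride,
                                       int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];   /* signed pixel difference */
      sum += diff;                        /* accumulate sum of differences */
      sse64 += (unsigned int)(diff * diff); /* accumulate sum of squares */
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sse64;
  return (unsigned int)(sse64 - (uint64_t)((sum * sum) / (w * h)));
}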
@@ -1,154 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_variance16x16_armv6|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_armv6| PROC

    stmfd   sp!, {r4-r12, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r8, #0              ; initialize sum = 0
    mov     r11, #0             ; initialize sse = 0
    mov     r12, #16            ; set loop counter to 16 (=block height)

loop
    ; 1st 4 pixels
    ldr     r4, [r0, #0]        ; load 4 src pixels
    ldr     r5, [r2, #0]        ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    usub8   r6, r4, r5          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels
    ; calculate total sum
    adds    r8, r8, r4          ; add positive differences to sum
    subs    r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r4, [r0, #4]        ; load 4 src pixels
    ldr     r5, [r2, #4]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 3rd 4 pixels
    ldr     r4, [r0, #8]        ; load 4 src pixels
    ldr     r5, [r2, #8]        ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)

    ; 4th 4 pixels
    ldr     r4, [r0, #12]       ; load 4 src pixels
    ldr     r5, [r2, #12]       ; load 4 ref pixels
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)

    usub8   r6, r4, r5          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r7, r6, lr          ; select bytes with positive difference
    usub8   r9, r5, r4          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r6, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r4, r7, lr          ; calculate sum of positive differences
    usad8   r5, r6, lr          ; calculate sum of negative differences
    orr     r6, r6, r7          ; differences of all 4 pixels

    ; calculate total sum
    add     r8, r8, r4          ; add positive differences to sum
    sub     r8, r8, r5          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r5, r6              ; byte (two pixels) to halfwords
    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)


    subs    r12, r12, #1

    bne     loop

    ; return stuff
    ldr     r6, [sp, #40]       ; get address of sse
    mul     r0, r8, r8          ; sum * sum
    str     r11, [r6]           ; store sse
    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))

    ldmfd   sp!, {r4-r12, pc}

    ENDP

    END
@@ -1,101 +0,0 @@
;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_variance8x8_armv6|

    ARM

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_armv6| PROC

    push    {r4-r10, lr}

    pld     [r0, r1, lsl #0]
    pld     [r2, r3, lsl #0]

    mov     r12, #8             ; set loop counter to 8 (=block height)
    mov     r4, #0              ; initialize sum = 0
    mov     r5, #0              ; initialize sse = 0

loop
    ; 1st 4 pixels
    ldr     r6, [r0, #0x0]      ; load 4 src pixels
    ldr     r7, [r2, #0x0]      ; load 4 ref pixels

    mov     lr, #0              ; constant zero

    usub8   r8, r6, r7          ; calculate difference
    pld     [r0, r1, lsl #1]
    sel     r10, r8, lr         ; select bytes with positive difference
    usub8   r9, r7, r6          ; calculate difference with reversed operands
    pld     [r2, r3, lsl #1]
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; differences of all 4 pixels
    ; calculate total sum
    add     r4, r4, r6          ; add positive differences to sum
    sub     r4, r4, r7          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8              ; byte (two pixels) to halfwords
    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)

    ; 2nd 4 pixels
    ldr     r6, [r0, #0x4]      ; load 4 src pixels
    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)

    usub8   r8, r6, r7          ; calculate difference
    add     r0, r0, r1          ; set src_ptr to next row
    sel     r10, r8, lr         ; select bytes with positive difference
    usub8   r9, r7, r6          ; calculate difference with reversed operands
    add     r2, r2, r3          ; set dst_ptr to next row
    sel     r8, r9, lr          ; select bytes with negative difference

    ; calculate partial sums
    usad8   r6, r10, lr         ; calculate sum of positive differences
    usad8   r7, r8, lr          ; calculate sum of negative differences
    orr     r8, r8, r10         ; differences of all 4 pixels

    ; calculate total sum
    add     r4, r4, r6          ; add positive differences to sum
    sub     r4, r4, r7          ; subtract negative differences from sum

    ; calculate sse
    uxtb16  r7, r8              ; byte (two pixels) to halfwords
    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
    subs    r12, r12, #1        ; next row
    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)

    bne     loop

    ; return stuff
    ldr     r8, [sp, #32]       ; get address of sse
    mul     r1, r4, r4          ; sum * sum
    str     r5, [r8]            ; store sse
    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))

    pop     {r4-r10, pc}

    ENDP

    END
@ -1,320 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Use of this source code is governed by a BSD-style license
|
|
||||||
* that can be found in the LICENSE file in the root of the source
|
|
||||||
* tree. An additional intellectual property rights grant can be found
|
|
||||||
* in the file PATENTS. All contributing project authors may
|
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <arm_neon.h>
|
|
||||||
#include "vpx_ports/mem.h"
|
|
||||||
|
|
||||||
unsigned int vp8_variance16x16_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int i;
|
|
||||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
|
||||||
uint32x2_t d0u32, d10u32;
|
|
||||||
int64x1_t d0s64, d1s64;
|
|
||||||
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
|
||||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
|
||||||
int32x4_t q8s32, q9s32, q10s32;
|
|
||||||
int64x2_t q0s64, q1s64, q5s64;
|
|
||||||
|
|
||||||
q8s32 = vdupq_n_s32(0);
|
|
||||||
q9s32 = vdupq_n_s32(0);
|
|
||||||
q10s32 = vdupq_n_s32(0);
|
|
||||||
|
|
||||||
for (i = 0; i < 8; i++) {
|
|
||||||
q0u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
q1u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
__builtin_prefetch(src_ptr);
|
|
||||||
|
|
||||||
q2u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
q3u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
__builtin_prefetch(ref_ptr);
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
|
||||||
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
|
||||||
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
|
||||||
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
|
||||||
|
|
||||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
|
||||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
|
||||||
|
|
||||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
|
||||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
|
||||||
|
|
||||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
|
||||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
|
|
||||||
|
|
||||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
|
||||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
|
||||||
}
|
|
||||||
|
|
||||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
|
||||||
q0s64 = vpaddlq_s32(q8s32);
|
|
||||||
q1s64 = vpaddlq_s32(q10s32);
|
|
||||||
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
|
||||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
|
||||||
vreinterpret_s32_s64(d0s64));
|
|
||||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
|
||||||
|
|
||||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
|
|
||||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
|
||||||
|
|
||||||
return vget_lane_u32(d0u32, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp8_variance16x8_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int i;
|
|
||||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
|
||||||
uint32x2_t d0u32, d10u32;
|
|
||||||
int64x1_t d0s64, d1s64;
|
|
||||||
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
|
||||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
|
||||||
int32x4_t q8s32, q9s32, q10s32;
|
|
||||||
int64x2_t q0s64, q1s64, q5s64;
|
|
||||||
|
|
||||||
q8s32 = vdupq_n_s32(0);
|
|
||||||
q9s32 = vdupq_n_s32(0);
|
|
||||||
q10s32 = vdupq_n_s32(0);
|
|
||||||
|
|
||||||
for (i = 0; i < 4; i++) { // variance16x8_neon_loop
|
|
||||||
q0u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
q1u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
__builtin_prefetch(src_ptr);
|
|
||||||
|
|
||||||
q2u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
q3u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
__builtin_prefetch(ref_ptr);
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
|
||||||
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
|
||||||
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
|
||||||
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
|
||||||
|
|
||||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
|
||||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
|
||||||
|
|
||||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
|
||||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
|
||||||
|
|
||||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
|
||||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
|
|
||||||
|
|
||||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
|
||||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
|
||||||
}
|
|
||||||
|
|
||||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
|
||||||
q0s64 = vpaddlq_s32(q8s32);
|
|
||||||
q1s64 = vpaddlq_s32(q10s32);
|
|
||||||
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
|
||||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
|
||||||
vreinterpret_s32_s64(d0s64));
|
|
||||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
|
||||||
|
|
||||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
|
|
||||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
|
||||||
|
|
||||||
return vget_lane_u32(d0u32, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp8_variance8x16_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int i;
|
|
||||||
uint8x8_t d0u8, d2u8, d4u8, d6u8;
|
|
||||||
int16x4_t d22s16, d23s16, d24s16, d25s16;
|
|
||||||
uint32x2_t d0u32, d10u32;
|
|
||||||
int64x1_t d0s64, d1s64;
|
|
||||||
uint16x8_t q11u16, q12u16;
|
|
||||||
int32x4_t q8s32, q9s32, q10s32;
|
|
||||||
int64x2_t q0s64, q1s64, q5s64;
|
|
||||||
|
|
||||||
q8s32 = vdupq_n_s32(0);
|
|
||||||
q9s32 = vdupq_n_s32(0);
|
|
||||||
q10s32 = vdupq_n_s32(0);
|
|
||||||
|
|
||||||
for (i = 0; i < 8; i++) { // variance8x16_neon_loop
|
|
||||||
d0u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d2u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
__builtin_prefetch(src_ptr);
|
|
||||||
|
|
||||||
d4u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d6u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
__builtin_prefetch(ref_ptr);
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(d0u8, d4u8);
|
|
||||||
q12u16 = vsubl_u8(d2u8, d6u8);
|
|
||||||
|
|
||||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
|
||||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
|
||||||
|
|
||||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
|
||||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
|
||||||
}
|
|
||||||
|
|
||||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
|
||||||
q0s64 = vpaddlq_s32(q8s32);
|
|
||||||
q1s64 = vpaddlq_s32(q10s32);
|
|
||||||
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
|
||||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
|
||||||
vreinterpret_s32_s64(d0s64));
|
|
||||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
|
||||||
|
|
||||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
|
|
||||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
|
||||||
|
|
||||||
return vget_lane_u32(d0u32, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp8_variance8x8_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int i;
|
|
||||||
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
|
||||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
|
||||||
uint32x2_t d0u32, d10u32;
|
|
||||||
int64x1_t d0s64, d1s64;
|
|
||||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
|
||||||
int32x4_t q8s32, q9s32, q10s32;
|
|
||||||
int64x2_t q0s64, q1s64, q5s64;
|
|
||||||
|
|
||||||
q8s32 = vdupq_n_s32(0);
|
|
||||||
q9s32 = vdupq_n_s32(0);
|
|
||||||
q10s32 = vdupq_n_s32(0);
|
|
||||||
|
|
||||||
for (i = 0; i < 2; i++) { // variance8x8_neon_loop
|
|
||||||
d0u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d1u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d2u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d3u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
|
|
||||||
d4u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d5u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d6u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d7u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(d0u8, d4u8);
|
|
||||||
q12u16 = vsubl_u8(d1u8, d5u8);
|
|
||||||
q13u16 = vsubl_u8(d2u8, d6u8);
|
|
||||||
q14u16 = vsubl_u8(d3u8, d7u8);
|
|
||||||
|
|
||||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
|
||||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
|
||||||
|
|
||||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
|
||||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
|
||||||
|
|
||||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
|
||||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
|
|
||||||
|
|
||||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
|
||||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
|
||||||
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
|
||||||
}
|
|
||||||
|
|
||||||
q10s32 = vaddq_s32(q10s32, q9s32);
|
|
||||||
q0s64 = vpaddlq_s32(q8s32);
|
|
||||||
q1s64 = vpaddlq_s32(q10s32);
|
|
||||||
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
|
||||||
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
|
||||||
vreinterpret_s32_s64(d0s64));
|
|
||||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
|
||||||
|
|
||||||
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
|
|
||||||
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
|
||||||
|
|
||||||
return vget_lane_u32(d0u32, 0);
|
|
||||||
}
|
|
@@ -9,10 +9,14 @@
 */

#include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "vp8/common/variance.h"
#include "vp8/common/filter.h"

+// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
+#if CONFIG_VP8_ENCODER
+
#if HAVE_MEDIA
#include "vp8/common/arm/bilinearfilter_arm.h"

@@ -40,8 +44,8 @@ unsigned int vp8_sub_pixel_variance8x8_armv6
    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                             8, 8, 8, VFilter);

-    return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
+    return vpx_variance8x8_media(second_pass, 8, dst_ptr,
                                  dst_pixels_per_line, sse);
}

unsigned int vp8_sub_pixel_variance16x16_armv6
@@ -86,13 +90,13 @@ unsigned int vp8_sub_pixel_variance16x16_armv6
        vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                                 16, 16, 16, VFilter);

-        var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
+        var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
                                       dst_pixels_per_line, sse);
    }
    return var;
}

-#endif /* HAVE_MEDIA */
+#endif  // HAVE_MEDIA


#if HAVE_NEON
@@ -129,4 +133,5 @@ unsigned int vp8_sub_pixel_variance16x16_neon
    return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
}

-#endif
+#endif  // HAVE_NEON
+#endif  // CONFIG_VP8_ENCODER
@@ -151,14 +151,14 @@ static void multiframe_quality_enhance_block

    if (blksize == 16)
    {
-        actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
-        act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
#ifdef USE_SSD
-        vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
+        vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
        sad = (sse + 128)>>8;
-        vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
+        vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
        usad = (sse + 32)>>6;
-        vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
+        vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
        vsad = (sse + 32)>>6;
#else
        sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
@@ -168,14 +168,14 @@ static void multiframe_quality_enhance_block
    }
    else /* if (blksize == 8) */
    {
-        actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
-        act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
#ifdef USE_SSD
-        vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
+        vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
        sad = (sse + 32)>>6;
-        vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
+        vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
        usad = (sse + 8)>>4;
-        vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
+        vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
        vsad = (sse + 8)>>4;
#else
        sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
@@ -236,31 +236,6 @@ add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch,
specialize qw/vp8_bilinear_predict4x4 mmx media neon/;
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;

-#
-# Whole-pixel Variance
-#
-add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance4x4 mmx sse2/;
-$vp8_variance4x4_sse2=vp8_variance4x4_wmt;
-
-add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x8 mmx sse2 media neon/;
-$vp8_variance8x8_sse2=vp8_variance8x8_wmt;
-$vp8_variance8x8_media=vp8_variance8x8_armv6;
-
-add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x16 mmx sse2 neon/;
-$vp8_variance8x16_sse2=vp8_variance8x16_wmt;
-
-add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x8 mmx sse2 neon/;
-$vp8_variance16x8_sse2=vp8_variance16x8_wmt;
-
-add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x16 mmx sse2 media neon/;
-$vp8_variance16x16_sse2=vp8_variance16x16_wmt;
-$vp8_variance16x16_media=vp8_variance16x16_armv6;
-
#
# Sub-pixel Variance
#
@@ -308,12 +283,6 @@ $vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
#
if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {

-#
-# Sum of squares (vector)
-#
-add_proto qw/unsigned int vp8_get_mb_ss/, "const short *";
-specialize qw/vp8_get_mb_ss mmx sse2/;
-
#
# SSE (Sum Squared Error)
#
@@ -321,14 +290,6 @@ add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char *src_pt
specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/;
$vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;

-add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_mse16x16 mmx sse2 media neon/;
-$vp8_mse16x16_sse2=vp8_mse16x16_wmt;
-$vp8_mse16x16_media=vp8_mse16x16_armv6;
-
-add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
-specialize qw/vp8_get4x4sse_cs mmx neon/;
-
#
# Block copy
#
@@ -39,6 +39,7 @@ typedef void (*vpx_sad_multi_fn_t)(
    const unsigned char *ref_array,
    int ref_stride,
    unsigned int *sad_array);

typedef void (*vpx_sad_multi_d_fn_t)
(
    const unsigned char *src_ptr,
@@ -48,7 +49,7 @@ typedef void (*vpx_sad_multi_d_fn_t)
    unsigned int *sad_array
);

-typedef unsigned int (*vp8_variance_fn_t)
+typedef unsigned int (*vpx_variance_fn_t)
(
    const unsigned char *src_ptr,
    int source_stride,
@@ -68,37 +69,14 @@ typedef unsigned int (*vp8_subpixvariance_fn_t)
    unsigned int *sse
);

-typedef void (*vp8_ssimpf_fn_t)
-(
-    unsigned char *s,
-    int sp,
-    unsigned char *r,
-    int rp,
-    unsigned long *sum_s,
-    unsigned long *sum_r,
-    unsigned long *sum_sq_s,
-    unsigned long *sum_sq_r,
-    unsigned long *sum_sxr
-);
-
-typedef unsigned int (*vp8_getmbss_fn_t)(const short *);
-
-typedef unsigned int (*vp8_get16x16prederror_fn_t)
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int ref_stride
-);
-
typedef struct variance_vtable
{
    vpx_sad_fn_t sdf;
-    vp8_variance_fn_t vf;
+    vpx_variance_fn_t vf;
    vp8_subpixvariance_fn_t svf;
-    vp8_variance_fn_t svf_halfpix_h;
-    vp8_variance_fn_t svf_halfpix_v;
-    vp8_variance_fn_t svf_halfpix_hv;
+    vpx_variance_fn_t svf_halfpix_h;
+    vpx_variance_fn_t svf_halfpix_v;
+    vpx_variance_fn_t svf_halfpix_hv;
    vpx_sad_multi_fn_t sdx3f;
    vpx_sad_multi_fn_t sdx8f;
    vpx_sad_multi_d_fn_t sdx4df;
@@ -8,44 +8,34 @@
 * be found in the AUTHORS file in the root of the source tree.
 */


#include "./vp8_rtcd.h"
#include "filter.h"
#include "variance.h"

-unsigned int vp8_get_mb_ss_c
-(
-    const short *src_ptr
-)
-{
-    unsigned int i = 0, sum = 0;
-
-    do
-    {
-        sum += (src_ptr[i] * src_ptr[i]);
-        i++;
-    }
-    while (i < 256);
-
-    return sum;
+/* This is a bad idea.
+ * ctz = count trailing zeros */
+static int ctz(int a) {
+  int b = 0;
+  while (a != 1) {
+    a >>= 1;
+    b++;
+  }
+  return b;
}

-static void variance(
+static unsigned int variance(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    int w,
    int h,
-    unsigned int *sse,
-    int *sum)
+    unsigned int *sse)
{
    int i, j;
-    int diff;
+    int diff, sum;

-    *sum = 0;
+    sum = 0;
    *sse = 0;

    for (i = 0; i < h; i++)
@@ -53,114 +43,17 @@ static void variance(
        for (j = 0; j < w; j++)
        {
            diff = src_ptr[j] - ref_ptr[j];
-            *sum += diff;
+            sum += diff;
            *sse += diff * diff;
        }

        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

+    return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
}

-
-unsigned int vp8_variance16x16_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp8_variance8x16_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-unsigned int vp8_variance16x8_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-
-unsigned int vp8_variance8x8_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 6));
-}
-
-unsigned int vp8_variance4x4_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-}
-
-
-unsigned int vp8_mse16x16_c(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-    *sse = var;
-    return var;
-}
-
-
/****************************************************************************
 *
 *  ROUTINE       : filter_block2d_bil_first_pass
@@ -304,7 +197,7 @@ unsigned int vp8_sub_pixel_variance4x4_c
    /* Now filter Verticaly */
    var_filter_block2d_bil_second_pass(FData3, temp2, 4, 4, 4, 4, VFilter);

-    return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
}


@@ -329,7 +222,7 @@ unsigned int vp8_sub_pixel_variance8x8_c
    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
    var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);

-    return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
}

unsigned int vp8_sub_pixel_variance16x16_c
@@ -353,7 +246,7 @@ unsigned int vp8_sub_pixel_variance16x16_c
    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
    var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);

-    return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
}


@@ -429,7 +322,7 @@ unsigned int vp8_sub_pixel_variance16x8_c
    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
    var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);

-    return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
}

unsigned int vp8_sub_pixel_variance8x16_c
@@ -455,5 +348,5 @@ unsigned int vp8_sub_pixel_variance8x16_c
    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
    var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);

-    return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
}
@ -11,504 +11,6 @@
|
|||||||
|
|
||||||
%include "vpx_ports/x86_abi_support.asm"
|
%include "vpx_ports/x86_abi_support.asm"
|
||||||
|
|
||||||
;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
|
|
||||||
global sym(vp8_get_mb_ss_mmx) PRIVATE
|
|
||||||
sym(vp8_get_mb_ss_mmx):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 7
|
|
||||||
GET_GOT rbx
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
sub rsp, 8
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
mov rax, arg(0) ;src_ptr
|
|
||||||
mov rcx, 16
|
|
||||||
pxor mm4, mm4
|
|
||||||
|
|
||||||
.NEXTROW:
|
|
||||||
movq mm0, [rax]
|
|
||||||
movq mm1, [rax+8]
|
|
||||||
movq mm2, [rax+16]
|
|
||||||
movq mm3, [rax+24]
|
|
||||||
pmaddwd mm0, mm0
|
|
||||||
pmaddwd mm1, mm1
|
|
||||||
pmaddwd mm2, mm2
|
|
||||||
pmaddwd mm3, mm3
|
|
||||||
|
|
||||||
paddd mm4, mm0
|
|
||||||
paddd mm4, mm1
|
|
||||||
paddd mm4, mm2
|
|
||||||
paddd mm4, mm3
|
|
||||||
|
|
||||||
add rax, 32
|
|
||||||
dec rcx
|
|
||||||
ja .NEXTROW
|
|
||||||
movq QWORD PTR [rsp], mm4
|
|
||||||
|
|
||||||
;return sum[0]+sum[1];
|
|
||||||
movsxd rax, dword ptr [rsp]
|
|
||||||
movsxd rcx, dword ptr [rsp+4]
|
|
||||||
add rax, rcx
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 8
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
RESTORE_GOT
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
|
|
||||||
;unsigned int vp8_get8x8var_mmx
|
|
||||||
;(
|
|
||||||
; unsigned char *src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char *ref_ptr,
|
|
||||||
; int recon_stride,
|
|
||||||
; unsigned int *SSE,
|
|
||||||
; int *Sum
|
|
||||||
;)
|
|
||||||
global sym(vp8_get8x8var_mmx) PRIVATE
|
|
||||||
sym(vp8_get8x8var_mmx):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 6
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push rbx
|
|
||||||
sub rsp, 16
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
|
|
||||||
pxor mm5, mm5 ; Blank mmx6
|
|
||||||
pxor mm6, mm6 ; Blank mmx7
|
|
||||||
pxor mm7, mm7 ; Blank mmx7
|
|
||||||
|
|
||||||
mov rax, arg(0) ;[src_ptr] ; Load base addresses
|
|
||||||
mov rbx, arg(2) ;[ref_ptr]
|
|
||||||
movsxd rcx, dword ptr arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, dword ptr arg(3) ;[recon_stride]
|
|
||||||
|
|
||||||
; Row 1
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
|
|
||||||
; Row 2
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 3
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 4
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 5
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
; movq mm4, [rbx + rdx]
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 6
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 7
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movq mm1, [rbx] ; Copy eight bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 8
|
|
||||||
movq mm0, [rax] ; Copy eight bytes to mm0
|
|
||||||
movq mm2, mm0 ; Take copies
|
|
||||||
movq mm3, mm1 ; Take copies
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpckhbw mm2, mm6 ; unpack to higher prrcision
|
|
||||||
punpckhbw mm3, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
psubsw mm2, mm3 ; A-B (high order) to MM2
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
paddw mm5, mm2 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
pmaddwd mm2, mm2 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
paddd mm7, mm2 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Now accumulate the final results.
|
|
||||||
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
|
|
||||||
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
|
|
||||||
movsx rdx, WORD PTR [rsp+8]
|
|
||||||
movsx rcx, WORD PTR [rsp+10]
|
|
||||||
movsx rbx, WORD PTR [rsp+12]
|
|
||||||
movsx rax, WORD PTR [rsp+14]
|
|
||||||
add rdx, rcx
|
|
||||||
add rbx, rax
|
|
||||||
add rdx, rbx ;XSum
|
|
||||||
movsxd rax, DWORD PTR [rsp]
|
|
||||||
movsxd rcx, DWORD PTR [rsp+4]
|
|
||||||
add rax, rcx ;XXSum
|
|
||||||
mov rsi, arg(4) ;SSE
|
|
||||||
mov rdi, arg(5) ;Sum
|
|
||||||
mov dword ptr [rsi], eax
|
|
||||||
mov dword ptr [rdi], edx
|
|
||||||
xor rax, rax ; return 0
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 16
|
|
||||||
pop rbx
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
;unsigned int
|
|
||||||
;vp8_get4x4var_mmx
|
|
||||||
;(
|
|
||||||
; unsigned char *src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char *ref_ptr,
|
|
||||||
; int recon_stride,
|
|
||||||
; unsigned int *SSE,
|
|
||||||
; int *Sum
|
|
||||||
;)
|
|
||||||
global sym(vp8_get4x4var_mmx) PRIVATE
|
|
||||||
sym(vp8_get4x4var_mmx):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 6
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push rbx
|
|
||||||
sub rsp, 16
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
|
|
||||||
pxor mm5, mm5 ; Blank mmx6
|
|
||||||
pxor mm6, mm6 ; Blank mmx7
|
|
||||||
pxor mm7, mm7 ; Blank mmx7
|
|
||||||
|
|
||||||
mov rax, arg(0) ;[src_ptr] ; Load base addresses
|
|
||||||
mov rbx, arg(2) ;[ref_ptr]
|
|
||||||
movsxd rcx, dword ptr arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, dword ptr arg(3) ;[recon_stride]
|
|
||||||
|
|
||||||
; Row 1
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
|
|
||||||
; Row 2
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher prrcision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 3
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 4
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
|
|
||||||
paddw mm5, mm0 ; accumulate differences in mm5
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
|
|
||||||
; Now accumulate the final results.
|
|
||||||
movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
|
|
||||||
movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory
|
|
||||||
movsx rdx, WORD PTR [rsp+8]
|
|
||||||
movsx rcx, WORD PTR [rsp+10]
|
|
||||||
movsx rbx, WORD PTR [rsp+12]
|
|
||||||
movsx rax, WORD PTR [rsp+14]
|
|
||||||
add rdx, rcx
|
|
||||||
add rbx, rax
|
|
||||||
add rdx, rbx ;XSum
|
|
||||||
movsxd rax, DWORD PTR [rsp]
|
|
||||||
movsxd rcx, DWORD PTR [rsp+4]
|
|
||||||
add rax, rcx ;XXSum
|
|
||||||
mov rsi, arg(4) ;SSE
|
|
||||||
mov rdi, arg(5) ;Sum
|
|
||||||
mov dword ptr [rsi], eax
|
|
||||||
mov dword ptr [rdi], edx
|
|
||||||
xor rax, rax ; return 0
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 16
|
|
||||||
pop rbx
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
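
For reference, the MMX routine above walks the 4x4 block one row at a time and accumulates two values: the plain sum of the src/ref differences (written to *Sum) and the sum of their squares (written to *SSE), then returns 0. A minimal plain-C sketch of the same computation; the helper name get4x4var_ref is illustrative, not part of this change:

/* Plain-C sketch of what the 4x4 MMX variance helper computes.
 * The name get4x4var_ref is illustrative only. */
static int get4x4var_ref(const unsigned char *src_ptr, int source_stride,
                         const unsigned char *ref_ptr, int recon_stride,
                         unsigned int *SSE, int *Sum)
{
    unsigned int sse = 0;
    int sum = 0;
    int r, c;

    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            const int diff = src_ptr[c] - ref_ptr[c];
            sum += diff;           /* running sum of differences   */
            sse += diff * diff;    /* running sum of squared diffs */
        }
        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    *SSE = sse;
    *Sum = sum;
    return 0;
}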
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
;unsigned int
|
|
||||||
;vp8_get4x4sse_cs_mmx
|
|
||||||
;(
|
|
||||||
; unsigned char *src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char *ref_ptr,
|
|
||||||
; int recon_stride
|
|
||||||
;)
|
|
||||||
global sym(vp8_get4x4sse_cs_mmx) PRIVATE
|
|
||||||
sym(vp8_get4x4sse_cs_mmx):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 4
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
push rbx
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
|
|
||||||
pxor mm6, mm6 ; Blank mm6
|
|
||||||
pxor mm7, mm7 ; Blank mm7
|
|
||||||
|
|
||||||
mov rax, arg(0) ;[src_ptr] ; Load base addresses
|
|
||||||
mov rbx, arg(2) ;[ref_ptr]
|
|
||||||
movsxd rcx, dword ptr arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, dword ptr arg(3) ;[recon_stride]
|
|
||||||
; Row 1
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 2
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 3
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
add rbx,rdx ; Inc pointer into ref data
|
|
||||||
add rax,rcx ; Inc pointer into the new data
|
|
||||||
movd mm1, [rbx] ; Copy four bytes to mm1
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
; Row 4
|
|
||||||
movd mm0, [rax] ; Copy four bytes to mm0
|
|
||||||
punpcklbw mm0, mm6 ; unpack to higher precision
|
|
||||||
punpcklbw mm1, mm6
|
|
||||||
psubsw mm0, mm1 ; A-B (low order) to MM0
|
|
||||||
pmaddwd mm0, mm0 ; square and accumulate
|
|
||||||
paddd mm7, mm0 ; accumulate in mm7
|
|
||||||
|
|
||||||
movq mm0, mm7 ;
|
|
||||||
psrlq mm7, 32
|
|
||||||
|
|
||||||
paddd mm0, mm7
|
|
||||||
movq rax, mm0
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
pop rbx
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
%define mmx_filter_shift 7

;void vp8_filter_block2d_bil4x4_var_mmx
@ -13,393 +13,6 @@
|
|||||||
|
|
||||||
%define xmm_filter_shift 7
|
%define xmm_filter_shift 7
|
||||||
|
|
||||||
;unsigned int vp8_get_mb_ss_sse2
|
|
||||||
;(
|
|
||||||
; short *src_ptr
|
|
||||||
;)
|
|
||||||
global sym(vp8_get_mb_ss_sse2) PRIVATE
|
|
||||||
sym(vp8_get_mb_ss_sse2):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 1
|
|
||||||
GET_GOT rbx
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
sub rsp, 16
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
|
|
||||||
mov rax, arg(0) ;[src_ptr]
|
|
||||||
mov rcx, 8
|
|
||||||
pxor xmm4, xmm4
|
|
||||||
|
|
||||||
.NEXTROW:
|
|
||||||
movdqa xmm0, [rax]
|
|
||||||
movdqa xmm1, [rax+16]
|
|
||||||
movdqa xmm2, [rax+32]
|
|
||||||
movdqa xmm3, [rax+48]
|
|
||||||
pmaddwd xmm0, xmm0
|
|
||||||
pmaddwd xmm1, xmm1
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
pmaddwd xmm3, xmm3
|
|
||||||
|
|
||||||
paddd xmm0, xmm1
|
|
||||||
paddd xmm2, xmm3
|
|
||||||
paddd xmm4, xmm0
|
|
||||||
paddd xmm4, xmm2
|
|
||||||
|
|
||||||
add rax, 0x40
|
|
||||||
dec rcx
|
|
||||||
ja .NEXTROW
|
|
||||||
|
|
||||||
movdqa xmm3,xmm4
|
|
||||||
psrldq xmm4,8
|
|
||||||
paddd xmm4,xmm3
|
|
||||||
movdqa xmm3,xmm4
|
|
||||||
psrldq xmm4,4
|
|
||||||
paddd xmm4,xmm3
|
|
||||||
movq rax,xmm4
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 16
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
RESTORE_GOT
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
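
The loop above runs eight iterations of 64 bytes over the 16-bit source, squaring with pmaddwd and accumulating, i.e. it produces the sum of squares of the 256 short values in a 16x16 block. A plain-C sketch of that quantity; the name mb_ss_ref is illustrative only:

/* Plain-C sketch of the macroblock sum-of-squares computed above.
 * The name mb_ss_ref is illustrative only. */
static unsigned int mb_ss_ref(const short *src_ptr)
{
    unsigned int i, sum = 0;

    for (i = 0; i < 256; i++)          /* 16x16 block of 16-bit values */
    {
        sum += src_ptr[i] * src_ptr[i];
    }

    return sum;
}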
|
|
||||||
|
|
||||||
|
|
||||||
;unsigned int vp8_get16x16var_sse2
|
|
||||||
;(
|
|
||||||
; unsigned char * src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char * ref_ptr,
|
|
||||||
; int recon_stride,
|
|
||||||
; unsigned int * SSE,
|
|
||||||
; int * Sum
|
|
||||||
;)
|
|
||||||
global sym(vp8_get16x16var_sse2) PRIVATE
|
|
||||||
sym(vp8_get16x16var_sse2):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 6
|
|
||||||
SAVE_XMM 7
|
|
||||||
push rbx
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
mov rsi, arg(0) ;[src_ptr]
|
|
||||||
mov rdi, arg(2) ;[ref_ptr]
|
|
||||||
|
|
||||||
movsxd rax, DWORD PTR arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
|
|
||||||
|
|
||||||
; Prefetch data
|
|
||||||
lea rcx, [rax+rax*2]
|
|
||||||
prefetcht0 [rsi]
|
|
||||||
prefetcht0 [rsi+rax]
|
|
||||||
prefetcht0 [rsi+rax*2]
|
|
||||||
prefetcht0 [rsi+rcx]
|
|
||||||
lea rbx, [rsi+rax*4]
|
|
||||||
prefetcht0 [rbx]
|
|
||||||
prefetcht0 [rbx+rax]
|
|
||||||
prefetcht0 [rbx+rax*2]
|
|
||||||
prefetcht0 [rbx+rcx]
|
|
||||||
|
|
||||||
lea rcx, [rdx+rdx*2]
|
|
||||||
prefetcht0 [rdi]
|
|
||||||
prefetcht0 [rdi+rdx]
|
|
||||||
prefetcht0 [rdi+rdx*2]
|
|
||||||
prefetcht0 [rdi+rcx]
|
|
||||||
lea rbx, [rdi+rdx*4]
|
|
||||||
prefetcht0 [rbx]
|
|
||||||
prefetcht0 [rbx+rdx]
|
|
||||||
prefetcht0 [rbx+rdx*2]
|
|
||||||
prefetcht0 [rbx+rcx]
|
|
||||||
|
|
||||||
pxor xmm0, xmm0 ; clear xmm0 for unpack
|
|
||||||
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
|
|
||||||
|
|
||||||
pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
|
|
||||||
mov rcx, 16
|
|
||||||
|
|
||||||
.var16loop:
|
|
||||||
movdqu xmm1, XMMWORD PTR [rsi]
|
|
||||||
movdqu xmm2, XMMWORD PTR [rdi]
|
|
||||||
|
|
||||||
prefetcht0 [rsi+rax*8]
|
|
||||||
prefetcht0 [rdi+rdx*8]
|
|
||||||
|
|
||||||
movdqa xmm3, xmm1
|
|
||||||
movdqa xmm4, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
punpcklbw xmm1, xmm0
|
|
||||||
punpckhbw xmm3, xmm0
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpckhbw xmm4, xmm0
|
|
||||||
|
|
||||||
|
|
||||||
psubw xmm1, xmm2
|
|
||||||
psubw xmm3, xmm4
|
|
||||||
|
|
||||||
paddw xmm7, xmm1
|
|
||||||
pmaddwd xmm1, xmm1
|
|
||||||
|
|
||||||
paddw xmm7, xmm3
|
|
||||||
pmaddwd xmm3, xmm3
|
|
||||||
|
|
||||||
paddd xmm6, xmm1
|
|
||||||
paddd xmm6, xmm3
|
|
||||||
|
|
||||||
add rsi, rax
|
|
||||||
add rdi, rdx
|
|
||||||
|
|
||||||
sub rcx, 1
|
|
||||||
jnz .var16loop
|
|
||||||
|
|
||||||
|
|
||||||
movdqa xmm1, xmm6
|
|
||||||
pxor xmm6, xmm6
|
|
||||||
|
|
||||||
pxor xmm5, xmm5
|
|
||||||
punpcklwd xmm6, xmm7
|
|
||||||
|
|
||||||
punpckhwd xmm5, xmm7
|
|
||||||
psrad xmm5, 16
|
|
||||||
|
|
||||||
psrad xmm6, 16
|
|
||||||
paddd xmm6, xmm5
|
|
||||||
|
|
||||||
movdqa xmm2, xmm1
|
|
||||||
punpckldq xmm1, xmm0
|
|
||||||
|
|
||||||
punpckhdq xmm2, xmm0
|
|
||||||
movdqa xmm7, xmm6
|
|
||||||
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
punpckldq xmm6, xmm0
|
|
||||||
|
|
||||||
punpckhdq xmm7, xmm0
|
|
||||||
paddd xmm6, xmm7
|
|
||||||
|
|
||||||
movdqa xmm2, xmm1
|
|
||||||
movdqa xmm7, xmm6
|
|
||||||
|
|
||||||
psrldq xmm1, 8
|
|
||||||
psrldq xmm6, 8
|
|
||||||
|
|
||||||
paddd xmm7, xmm6
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
mov rax, arg(5) ;[Sum]
|
|
||||||
mov rdi, arg(4) ;[SSE]
|
|
||||||
|
|
||||||
movd DWORD PTR [rax], xmm7
|
|
||||||
movd DWORD PTR [rdi], xmm1
|
|
||||||
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
pop rbx
|
|
||||||
RESTORE_XMM
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
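
The tail of the routine folds the eight signed 16-bit difference sums held in xmm7 and the four 32-bit squared-difference sums held in xmm6 down to single scalars before storing them through Sum and SSE. A scalar sketch of the net effect of that reduction; the lane arrays are illustrative stand-ins for the register contents:

/* Scalar equivalent of the final cross-lane reduction above.  lanes_sum[]
 * and lanes_sse[] are illustrative stand-ins for the eight 16-bit lanes of
 * xmm7 and the four 32-bit lanes of xmm6. */
static void reduce_var16x16_lanes(const short lanes_sum[8],
                                  const unsigned int lanes_sse[4],
                                  int *Sum, unsigned int *SSE)
{
    int i, sum = 0;
    unsigned int sse = 0;

    for (i = 0; i < 8; i++)
        sum += lanes_sum[i];   /* sign-extend each 16-bit partial sum and add */
    for (i = 0; i < 4; i++)
        sse += lanes_sse[i];   /* add the four partial squared-difference sums */

    *Sum = sum;
    *SSE = sse;
}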
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
;unsigned int vp8_get8x8var_sse2
|
|
||||||
;(
|
|
||||||
; unsigned char * src_ptr,
|
|
||||||
; int source_stride,
|
|
||||||
; unsigned char * ref_ptr,
|
|
||||||
; int recon_stride,
|
|
||||||
; unsigned int * SSE,
|
|
||||||
; int * Sum
|
|
||||||
;)
|
|
||||||
global sym(vp8_get8x8var_sse2) PRIVATE
|
|
||||||
sym(vp8_get8x8var_sse2):
|
|
||||||
push rbp
|
|
||||||
mov rbp, rsp
|
|
||||||
SHADOW_ARGS_TO_STACK 6
|
|
||||||
SAVE_XMM 7
|
|
||||||
GET_GOT rbx
|
|
||||||
push rsi
|
|
||||||
push rdi
|
|
||||||
sub rsp, 16
|
|
||||||
; end prolog
|
|
||||||
|
|
||||||
mov rsi, arg(0) ;[src_ptr]
|
|
||||||
mov rdi, arg(2) ;[ref_ptr]
|
|
||||||
|
|
||||||
movsxd rax, DWORD PTR arg(1) ;[source_stride]
|
|
||||||
movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
|
|
||||||
|
|
||||||
pxor xmm0, xmm0 ; clear xmm0 for unpack
|
|
||||||
pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
|
|
||||||
|
|
||||||
movq xmm1, QWORD PTR [rsi]
|
|
||||||
movq xmm2, QWORD PTR [rdi]
|
|
||||||
|
|
||||||
punpcklbw xmm1, xmm0
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
|
|
||||||
psubsw xmm1, xmm2
|
|
||||||
paddw xmm7, xmm1
|
|
||||||
|
|
||||||
pmaddwd xmm1, xmm1
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax * 2]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx * 2]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
lea rsi, [rsi + rax * 2]
|
|
||||||
lea rdi, [rdi + rdx * 2]
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax *2]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx *2]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
lea rsi, [rsi + rax * 2]
|
|
||||||
lea rdi, [rdi + rdx * 2]
|
|
||||||
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax *2]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx *2]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
lea rsi, [rsi + rax * 2]
|
|
||||||
lea rdi, [rdi + rdx * 2]
|
|
||||||
|
|
||||||
movq xmm2, QWORD PTR[rsi + rax]
|
|
||||||
movq xmm3, QWORD PTR[rdi + rdx]
|
|
||||||
|
|
||||||
punpcklbw xmm2, xmm0
|
|
||||||
punpcklbw xmm3, xmm0
|
|
||||||
|
|
||||||
psubsw xmm2, xmm3
|
|
||||||
paddw xmm7, xmm2
|
|
||||||
|
|
||||||
pmaddwd xmm2, xmm2
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
|
|
||||||
movdqa xmm6, xmm7
|
|
||||||
punpcklwd xmm6, xmm0
|
|
||||||
|
|
||||||
punpckhwd xmm7, xmm0
|
|
||||||
movdqa xmm2, xmm1
|
|
||||||
|
|
||||||
paddw xmm6, xmm7
|
|
||||||
punpckldq xmm1, xmm0
|
|
||||||
|
|
||||||
punpckhdq xmm2, xmm0
|
|
||||||
movdqa xmm7, xmm6
|
|
||||||
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
punpckldq xmm6, xmm0
|
|
||||||
|
|
||||||
punpckhdq xmm7, xmm0
|
|
||||||
paddw xmm6, xmm7
|
|
||||||
|
|
||||||
movdqa xmm2, xmm1
|
|
||||||
movdqa xmm7, xmm6
|
|
||||||
|
|
||||||
psrldq xmm1, 8
|
|
||||||
psrldq xmm6, 8
|
|
||||||
|
|
||||||
paddw xmm7, xmm6
|
|
||||||
paddd xmm1, xmm2
|
|
||||||
|
|
||||||
mov rax, arg(5) ;[Sum]
|
|
||||||
mov rdi, arg(4) ;[SSE]
|
|
||||||
|
|
||||||
movq rdx, xmm7
|
|
||||||
movsx rcx, dx
|
|
||||||
|
|
||||||
mov dword ptr [rax], ecx
|
|
||||||
movd DWORD PTR [rdi], xmm1
|
|
||||||
|
|
||||||
; begin epilog
|
|
||||||
add rsp, 16
|
|
||||||
pop rdi
|
|
||||||
pop rsi
|
|
||||||
RESTORE_GOT
|
|
||||||
RESTORE_XMM
|
|
||||||
UNSHADOW_ARGS
|
|
||||||
pop rbp
|
|
||||||
ret
|
|
||||||
|
|
||||||
;void vp8_filter_block2d_bil_var_sse2
;(
; unsigned char *ref_ptr,
@ -35,25 +35,6 @@ extern void filter_block1d_v6_mmx
|
|||||||
short *filter
|
short *filter
|
||||||
);
|
);
|
||||||
|
|
||||||
extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
|
|
||||||
extern unsigned int vp8_get8x8var_mmx
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
extern unsigned int vp8_get4x4var_mmx
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
extern void vp8_filter_block2d_bil4x4_var_mmx
|
extern void vp8_filter_block2d_bil4x4_var_mmx
|
||||||
(
|
(
|
||||||
const unsigned char *ref_ptr,
|
const unsigned char *ref_ptr,
|
||||||
@ -78,127 +59,6 @@ extern void vp8_filter_block2d_bil_var_mmx
|
|||||||
unsigned int *sumsquared
|
unsigned int *sumsquared
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
unsigned int vp8_variance4x4_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 4));
}

unsigned int vp8_variance8x8_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 6));
}

unsigned int vp8_mse16x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    *sse = var;
    return var;
}

unsigned int vp8_variance16x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 8));
}

unsigned int vp8_variance16x8_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 7));
}

unsigned int vp8_variance8x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 7));
}
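
All of the wrappers above finish with the same identity, variance = SSE - Sum*Sum / N, where N is the pixel count of the block, hence the >>4 (16 px), >>6 (64 px), >>7 (128 px) and >>8 (256 px) shifts. A generic form of that last step; the helper name is illustrative only:

/* Illustrative only: the final step shared by the wrappers above.
 * log2_pixels is 4 for 4x4, 6 for 8x8, 7 for 8x16/16x8 and 8 for 16x16. */
static unsigned int var_from_sse_sum(unsigned int sse, int sum, int log2_pixels)
{
    return (sse - (((unsigned int)sum * sum) >> log2_pixels));
}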
|
|
||||||
|
|
||||||
|
|
||||||
unsigned int vp8_sub_pixel_variance4x4_mmx
(
const unsigned char *src_ptr,
||||||
|
@ -31,38 +31,6 @@ extern void vp8_filter_block2d_bil4x4_var_mmx
|
|||||||
unsigned int *sumsquared
|
unsigned int *sumsquared
|
||||||
);
|
);
|
||||||
|
|
||||||
extern unsigned int vp8_get4x4var_mmx
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
|
|
||||||
unsigned int vp8_get_mb_ss_sse2
|
|
||||||
(
|
|
||||||
const short *src_ptr
|
|
||||||
);
|
|
||||||
unsigned int vp8_get16x16var_sse2
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
unsigned int vp8_get8x8var_sse2
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
void vp8_filter_block2d_bil_var_sse2
|
void vp8_filter_block2d_bil_var_sse2
|
||||||
(
|
(
|
||||||
const unsigned char *ref_ptr,
|
const unsigned char *ref_ptr,
|
||||||
@ -136,115 +104,6 @@ void vp8_half_vert_variance16x_h_sse2
|
|||||||
unsigned int *sumsquared
|
unsigned int *sumsquared
|
||||||
);
|
);
|
||||||
|
|
||||||
unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 4));
}

unsigned int vp8_variance8x8_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 6));
}

unsigned int vp8_variance16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;
    return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
}

unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;
    return sse0;
}

unsigned int vp8_variance16x8_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 7));
}

unsigned int vp8_variance8x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - (((unsigned int)avg * avg) >> 7));
}
|
|
||||||
|
|
||||||
unsigned int vp8_sub_pixel_variance4x4_wmt
(
const unsigned char *src_ptr,
|
||||||
|
@ -13,15 +13,6 @@
|
|||||||
#include "vp8/common/variance.h"
|
#include "vp8/common/variance.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
extern unsigned int vp8_get16x16var_sse2
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *SSE,
|
|
||||||
int *Sum
|
|
||||||
);
|
|
||||||
extern void vp8_half_horiz_vert_variance16x_h_sse2
|
extern void vp8_half_horiz_vert_variance16x_h_sse2
|
||||||
(
|
(
|
||||||
const unsigned char *ref_ptr,
|
const unsigned char *ref_ptr,
|
||||||
|
@ -1,138 +0,0 @@
|
|||||||
;
|
|
||||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
|
||||||
;
|
|
||||||
; Use of this source code is governed by a BSD-style license
|
|
||||||
; that can be found in the LICENSE file in the root of the source
|
|
||||||
; tree. An additional intellectual property rights grant can be found
|
|
||||||
; in the file PATENTS. All contributing project authors may
|
|
||||||
; be found in the AUTHORS file in the root of the source tree.
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
EXPORT |vp8_mse16x16_armv6|
|
|
||||||
|
|
||||||
ARM
|
|
||||||
|
|
||||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
|
||||||
|
|
||||||
; r0 unsigned char *src_ptr
|
|
||||||
; r1 int source_stride
|
|
||||||
; r2 unsigned char *ref_ptr
|
|
||||||
; r3 int recon_stride
|
|
||||||
; stack unsigned int *sse
|
|
||||||
;
|
|
||||||
;note: Based on vp8_variance16x16_armv6. In this function, sum is never used.
|
|
||||||
; So, we can remove this part of calculation.
|
|
||||||
|
|
||||||
|vp8_mse16x16_armv6| PROC
|
|
||||||
|
|
||||||
push {r4-r9, lr}
|
|
||||||
|
|
||||||
pld [r0, r1, lsl #0]
|
|
||||||
pld [r2, r3, lsl #0]
|
|
||||||
|
|
||||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
|
||||||
mov r4, #0 ; initialize sse = 0
|
|
||||||
|
|
||||||
loop
|
|
||||||
; 1st 4 pixels
|
|
||||||
ldr r5, [r0, #0x0] ; load 4 src pixels
|
|
||||||
ldr r6, [r2, #0x0] ; load 4 ref pixels
|
|
||||||
|
|
||||||
mov lr, #0 ; constant zero
|
|
||||||
|
|
||||||
usub8 r8, r5, r6 ; calculate difference
|
|
||||||
pld [r0, r1, lsl #1]
|
|
||||||
sel r7, r8, lr ; select bytes with positive difference
|
|
||||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
|
||||||
pld [r2, r3, lsl #1]
|
|
||||||
sel r8, r9, lr ; select bytes with negative difference
|
|
||||||
|
|
||||||
; calculate partial sums
|
|
||||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
|
||||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
|
||||||
orr r8, r8, r7 ; differences of all 4 pixels
|
|
||||||
|
|
||||||
ldr r5, [r0, #0x4] ; load 4 src pixels
|
|
||||||
|
|
||||||
; calculate sse
|
|
||||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
|
||||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
|
||||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
|
||||||
|
|
||||||
; 2nd 4 pixels
|
|
||||||
ldr r6, [r2, #0x4] ; load 4 ref pixels
|
|
||||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
|
||||||
|
|
||||||
usub8 r8, r5, r6 ; calculate difference
|
|
||||||
sel r7, r8, lr ; select bytes with positive difference
|
|
||||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
|
||||||
sel r8, r9, lr ; select bytes with negative difference
|
|
||||||
|
|
||||||
; calculate partial sums
|
|
||||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
|
||||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
|
||||||
orr r8, r8, r7 ; differences of all 4 pixels
|
|
||||||
ldr r5, [r0, #0x8] ; load 4 src pixels
|
|
||||||
; calculate sse
|
|
||||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
|
||||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
|
||||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
|
||||||
|
|
||||||
; 3rd 4 pixels
|
|
||||||
ldr r6, [r2, #0x8] ; load 4 ref pixels
|
|
||||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
|
||||||
|
|
||||||
usub8 r8, r5, r6 ; calculate difference
|
|
||||||
sel r7, r8, lr ; select bytes with positive difference
|
|
||||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
|
||||||
sel r8, r9, lr ; select bytes with negative difference
|
|
||||||
|
|
||||||
; calculate partial sums
|
|
||||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
|
||||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
|
||||||
orr r8, r8, r7 ; differences of all 4 pixels
|
|
||||||
|
|
||||||
ldr r5, [r0, #0xc] ; load 4 src pixels
|
|
||||||
|
|
||||||
; calculate sse
|
|
||||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
|
||||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
|
||||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
|
||||||
|
|
||||||
; 4th 4 pixels
|
|
||||||
ldr r6, [r2, #0xc] ; load 4 ref pixels
|
|
||||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
|
||||||
|
|
||||||
usub8 r8, r5, r6 ; calculate difference
|
|
||||||
add r0, r0, r1 ; set src_ptr to next row
|
|
||||||
sel r7, r8, lr ; select bytes with positive difference
|
|
||||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
|
||||||
add r2, r2, r3 ; set ref_ptr to next row
|
|
||||||
sel r8, r9, lr ; select bytes with negative difference
|
|
||||||
|
|
||||||
; calculate partial sums
|
|
||||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
|
||||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
|
||||||
orr r8, r8, r7 ; differences of all 4 pixels
|
|
||||||
|
|
||||||
subs r12, r12, #1 ; next row
|
|
||||||
|
|
||||||
; calculate sse
|
|
||||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
|
||||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
|
||||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
|
||||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
|
||||||
|
|
||||||
bne loop
|
|
||||||
|
|
||||||
; return stuff
|
|
||||||
ldr r1, [sp, #28] ; get address of sse
|
|
||||||
mov r0, r4 ; return sse
|
|
||||||
str r4, [r1] ; store sse
|
|
||||||
|
|
||||||
pop {r4-r9, pc}
|
|
||||||
|
|
||||||
ENDP
|
|
||||||
|
|
||||||
END
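
As the note at the top of this file points out, MSE is the variance accumulation with the sum term dropped, so the routine only keeps the running sum of squared differences, stores it through *sse and returns it. A plain-C sketch of the quantity computed; the helper name mse16x16_ref is illustrative only:

/* Plain-C sketch of the 16x16 MSE accumulation performed above.
 * The name mse16x16_ref is illustrative only. */
static unsigned int mse16x16_ref(const unsigned char *src_ptr, int source_stride,
                                 const unsigned char *ref_ptr, int recon_stride,
                                 unsigned int *sse)
{
    unsigned int total = 0;
    int r, c;

    for (r = 0; r < 16; r++)
    {
        for (c = 0; c < 16; c++)
        {
            const int diff = src_ptr[c] - ref_ptr[c];
            total += diff * diff;   /* only the squared differences are kept */
        }
        src_ptr += source_stride;
        ref_ptr += recon_stride;
    }

    *sse = total;
    return total;
}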
|
|
@ -1,131 +0,0 @@
|
|||||||
/*
|
|
||||||
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Use of this source code is governed by a BSD-style license
|
|
||||||
* that can be found in the LICENSE file in the root of the source
|
|
||||||
* tree. An additional intellectual property rights grant can be found
|
|
||||||
* in the file PATENTS. All contributing project authors may
|
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <arm_neon.h>
|
|
||||||
|
|
||||||
unsigned int vp8_mse16x16_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int i;
|
|
||||||
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
|
||||||
int64x1_t d0s64;
|
|
||||||
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
|
||||||
int32x4_t q7s32, q8s32, q9s32, q10s32;
|
|
||||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
|
||||||
int64x2_t q1s64;
|
|
||||||
|
|
||||||
q7s32 = vdupq_n_s32(0);
|
|
||||||
q8s32 = vdupq_n_s32(0);
|
|
||||||
q9s32 = vdupq_n_s32(0);
|
|
||||||
q10s32 = vdupq_n_s32(0);
|
|
||||||
|
|
||||||
for (i = 0; i < 8; i++) { // mse16x16_neon_loop
|
|
||||||
q0u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
q1u8 = vld1q_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
q2u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
q3u8 = vld1q_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
|
||||||
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
|
||||||
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
|
||||||
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
|
||||||
|
|
||||||
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
|
||||||
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
|
||||||
q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
|
|
||||||
q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
|
|
||||||
|
|
||||||
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
|
||||||
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
|
||||||
|
|
||||||
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
|
||||||
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
|
||||||
q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
|
|
||||||
q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
|
|
||||||
|
|
||||||
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
|
||||||
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
|
||||||
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
|
||||||
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
|
||||||
}
|
|
||||||
|
|
||||||
q7s32 = vaddq_s32(q7s32, q8s32);
|
|
||||||
q9s32 = vaddq_s32(q9s32, q10s32);
|
|
||||||
q10s32 = vaddq_s32(q7s32, q9s32);
|
|
||||||
|
|
||||||
q1s64 = vpaddlq_s32(q10s32);
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
|
|
||||||
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp8_get4x4sse_cs_neon(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride) {
|
|
||||||
int16x4_t d22s16, d24s16, d26s16, d28s16;
|
|
||||||
int64x1_t d0s64;
|
|
||||||
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
|
||||||
int32x4_t q7s32, q8s32, q9s32, q10s32;
|
|
||||||
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
|
||||||
int64x2_t q1s64;
|
|
||||||
|
|
||||||
d0u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d4u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d1u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d5u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d2u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d6u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
d3u8 = vld1_u8(src_ptr);
|
|
||||||
src_ptr += source_stride;
|
|
||||||
d7u8 = vld1_u8(ref_ptr);
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
|
|
||||||
q11u16 = vsubl_u8(d0u8, d4u8);
|
|
||||||
q12u16 = vsubl_u8(d1u8, d5u8);
|
|
||||||
q13u16 = vsubl_u8(d2u8, d6u8);
|
|
||||||
q14u16 = vsubl_u8(d3u8, d7u8);
|
|
||||||
|
|
||||||
d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
|
|
||||||
d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
|
|
||||||
d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
|
|
||||||
d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
|
|
||||||
|
|
||||||
q7s32 = vmull_s16(d22s16, d22s16);
|
|
||||||
q8s32 = vmull_s16(d24s16, d24s16);
|
|
||||||
q9s32 = vmull_s16(d26s16, d26s16);
|
|
||||||
q10s32 = vmull_s16(d28s16, d28s16);
|
|
||||||
|
|
||||||
q7s32 = vaddq_s32(q7s32, q8s32);
|
|
||||||
q9s32 = vaddq_s32(q9s32, q10s32);
|
|
||||||
q9s32 = vaddq_s32(q7s32, q9s32);
|
|
||||||
|
|
||||||
q1s64 = vpaddlq_s32(q9s32);
|
|
||||||
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
|
||||||
|
|
||||||
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
|
|
||||||
}
|
|
@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
#include "vpx_config.h"
|
#include "vpx_config.h"
|
||||||
#include "vp8_rtcd.h"
|
#include "vp8_rtcd.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "encodemb.h"
|
#include "encodemb.h"
|
||||||
#include "encodemv.h"
|
#include "encodemv.h"
|
||||||
#include "vp8/common/common.h"
|
#include "vp8/common/common.h"
|
||||||
@ -90,7 +91,7 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
|
|||||||
* lambda using a non-linear combination (e.g., the smallest, or second
|
* lambda using a non-linear combination (e.g., the smallest, or second
|
||||||
* smallest, etc.).
|
* smallest, etc.).
|
||||||
*/
|
*/
|
||||||
act = vp8_variance16x16(x->src.y_buffer,
|
act = vpx_variance16x16(x->src.y_buffer,
|
||||||
x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
|
x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
|
||||||
act = act<<4;
|
act = act<<4;
|
||||||
|
|
||||||
|
@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
#include "vpx_config.h"
|
#include "vpx_config.h"
|
||||||
#include "vp8_rtcd.h"
|
#include "vp8_rtcd.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "quantize.h"
|
#include "quantize.h"
|
||||||
#include "vp8/common/reconintra4x4.h"
|
#include "vp8/common/reconintra4x4.h"
|
||||||
#include "encodemb.h"
|
#include "encodemb.h"
|
||||||
@ -44,7 +45,7 @@ int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
intra_pred_var = vp8_get_mb_ss(x->src_diff);
|
intra_pred_var = vpx_get_mb_ss(x->src_diff);
|
||||||
|
|
||||||
return intra_pred_var;
|
return intra_pred_var;
|
||||||
}
|
}
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "./vpx_scale_rtcd.h"
|
#include "./vpx_scale_rtcd.h"
|
||||||
#include "block.h"
|
#include "block.h"
|
||||||
#include "onyx_int.h"
|
#include "onyx_int.h"
|
||||||
@ -422,14 +423,14 @@ static void zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x,
|
|||||||
/* Set up pointers for this macro block raw buffer */
|
/* Set up pointers for this macro block raw buffer */
|
||||||
raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
|
raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
|
||||||
+ d->offset);
|
+ d->offset);
|
||||||
vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
|
vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride,
|
||||||
(unsigned int *)(raw_motion_err));
|
(unsigned int *)(raw_motion_err));
|
||||||
|
|
||||||
/* Set up pointers for this macro block recon buffer */
|
/* Set up pointers for this macro block recon buffer */
|
||||||
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
|
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
|
||||||
ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
|
ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
|
||||||
vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
|
vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
|
||||||
(unsigned int *)(best_motion_err));
|
(unsigned int *)(best_motion_err));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
|
static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
|
||||||
@ -453,7 +454,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
|
|||||||
int new_mv_mode_penalty = 256;
|
int new_mv_mode_penalty = 256;
|
||||||
|
|
||||||
/* override the default variance function to use MSE */
|
/* override the default variance function to use MSE */
|
||||||
v_fn_ptr.vf = vp8_mse16x16;
|
v_fn_ptr.vf = vpx_mse16x16;
|
||||||
|
|
||||||
/* Set up pointers for this macro block recon buffer */
|
/* Set up pointers for this macro block recon buffer */
|
||||||
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
|
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
|
||||||
|
@ -2131,7 +2131,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
|
cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
|
||||||
cpi->fn_ptr[BLOCK_16X16].vf = vp8_variance16x16;
|
cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
|
||||||
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
|
cpi->fn_ptr[BLOCK_16X16].svf = vp8_sub_pixel_variance16x16;
|
||||||
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
|
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h = vp8_variance_halfpixvar16x16_h;
|
||||||
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
|
cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = vp8_variance_halfpixvar16x16_v;
|
||||||
@ -2141,7 +2141,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
|
|||||||
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
|
cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
|
cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
|
||||||
cpi->fn_ptr[BLOCK_16X8].vf = vp8_variance16x8;
|
cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
|
||||||
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
|
cpi->fn_ptr[BLOCK_16X8].svf = vp8_sub_pixel_variance16x8;
|
||||||
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
|
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h = NULL;
|
||||||
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL;
|
||||||
@ -2151,7 +2151,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
|
|||||||
cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
|
cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
|
cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
|
||||||
cpi->fn_ptr[BLOCK_8X16].vf = vp8_variance8x16;
|
cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
|
||||||
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
|
cpi->fn_ptr[BLOCK_8X16].svf = vp8_sub_pixel_variance8x16;
|
||||||
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
|
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h = NULL;
|
||||||
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL;
|
||||||
@ -2161,7 +2161,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
|
|||||||
cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
|
cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
|
cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
|
||||||
cpi->fn_ptr[BLOCK_8X8].vf = vp8_variance8x8;
|
cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
|
||||||
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
|
cpi->fn_ptr[BLOCK_8X8].svf = vp8_sub_pixel_variance8x8;
|
||||||
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
|
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h = NULL;
|
||||||
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL;
|
||||||
@ -2171,7 +2171,7 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
|
|||||||
cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
|
cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
|
||||||
|
|
||||||
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
|
cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
|
||||||
cpi->fn_ptr[BLOCK_4X4].vf = vp8_variance4x4;
|
cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
|
||||||
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
|
cpi->fn_ptr[BLOCK_4X4].svf = vp8_sub_pixel_variance4x4;
|
||||||
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
|
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h = NULL;
|
||||||
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
|
cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL;
|
||||||
@ -2558,7 +2558,7 @@ static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
|
|||||||
{
|
{
|
||||||
unsigned int sse;
|
unsigned int sse;
|
||||||
|
|
||||||
vp8_mse16x16(orig + col, orig_stride,
|
vpx_mse16x16(orig + col, orig_stride,
|
||||||
recon + col, recon_stride,
|
recon + col, recon_stride,
|
||||||
&sse);
|
&sse);
|
||||||
total_sse += sse;
|
total_sse += sse;
|
||||||
@ -3384,7 +3384,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source,
|
|||||||
int index = block_index_row + (j >> 4);
|
int index = block_index_row + (j >> 4);
|
||||||
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
|
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
|
||||||
unsigned int sse;
|
unsigned int sse;
|
||||||
Total += vp8_mse16x16(src + j,
|
Total += vpx_mse16x16(src + j,
|
||||||
source->y_stride,
|
source->y_stride,
|
||||||
dst + j, dest->y_stride,
|
dst + j, dest->y_stride,
|
||||||
&sse);
|
&sse);
|
||||||
@ -3448,7 +3448,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
|
|||||||
int index = block_index_row + (j >> 4);
|
int index = block_index_row + (j >> 4);
|
||||||
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
|
if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
|
||||||
unsigned int sse;
|
unsigned int sse;
|
||||||
const unsigned int var = vp8_variance16x16(src + j,
|
const unsigned int var = vpx_variance16x16(src + j,
|
||||||
ystride,
|
ystride,
|
||||||
dst + j,
|
dst + j,
|
||||||
ystride,
|
ystride,
|
||||||
@ -3458,7 +3458,7 @@ static void process_denoiser_mode_change(VP8_COMP *cpi) {
|
|||||||
// is small (to avoid effects from lighting change).
|
// is small (to avoid effects from lighting change).
|
||||||
if ((sse - var) < 128) {
|
if ((sse - var) < 128) {
|
||||||
unsigned int sse2;
|
unsigned int sse2;
|
||||||
const unsigned int act = vp8_variance16x16(src + j,
|
const unsigned int act = vpx_variance16x16(src + j,
|
||||||
ystride,
|
ystride,
|
||||||
const_source,
|
const_source,
|
||||||
0,
|
0,
|
||||||
@ -5993,7 +5993,8 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest)
|
|||||||
for (j = 0; j < source->y_width; j += 16)
|
for (j = 0; j < source->y_width; j += 16)
|
||||||
{
|
{
|
||||||
unsigned int sse;
|
unsigned int sse;
|
||||||
Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
|
Total += vpx_mse16x16(src + j, source->y_stride,
|
||||||
|
dst + j, dest->y_stride, &sse);
|
||||||
}
|
}
|
||||||
|
|
||||||
src += 16 * source->y_stride;
|
src += 16 * source->y_stride;
|
||||||
|
@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include "vpx_config.h"
|
#include "vpx_config.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "onyx_int.h"
|
#include "onyx_int.h"
|
||||||
#include "modecosts.h"
|
#include "modecosts.h"
|
||||||
#include "encodeintra.h"
|
#include "encodeintra.h"
|
||||||
@ -215,33 +216,6 @@ int vp8_get_inter_mbpred_error(MACROBLOCK *mb,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
unsigned int vp8_get4x4sse_cs_c
|
|
||||||
(
|
|
||||||
const unsigned char *src_ptr,
|
|
||||||
int source_stride,
|
|
||||||
const unsigned char *ref_ptr,
|
|
||||||
int recon_stride
|
|
||||||
)
|
|
||||||
{
|
|
||||||
int distortion = 0;
|
|
||||||
int r, c;
|
|
||||||
|
|
||||||
for (r = 0; r < 4; r++)
|
|
||||||
{
|
|
||||||
for (c = 0; c < 4; c++)
|
|
||||||
{
|
|
||||||
int diff = src_ptr[c] - ref_ptr[c];
|
|
||||||
distortion += diff * diff;
|
|
||||||
}
|
|
||||||
|
|
||||||
src_ptr += source_stride;
|
|
||||||
ref_ptr += recon_stride;
|
|
||||||
}
|
|
||||||
|
|
||||||
return distortion;
|
|
||||||
}
|
|
||||||
|
|
||||||
static int get_prediction_error(BLOCK *be, BLOCKD *b)
|
static int get_prediction_error(BLOCK *be, BLOCKD *b)
|
||||||
{
|
{
|
||||||
unsigned char *sptr;
|
unsigned char *sptr;
|
||||||
@ -249,7 +223,7 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b)
|
|||||||
sptr = (*(be->base_src) + be->src);
|
sptr = (*(be->base_src) + be->src);
|
||||||
dptr = b->predictor;
|
dptr = b->predictor;
|
||||||
|
|
||||||
return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
|
return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1037,7 +1011,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
|
|||||||
else
|
else
|
||||||
{
|
{
|
||||||
rate2 += rate;
|
rate2 += rate;
|
||||||
distortion2 = vp8_variance16x16(
|
distortion2 = vpx_variance16x16(
|
||||||
*(b->base_src), b->src_stride,
|
*(b->base_src), b->src_stride,
|
||||||
x->e_mbd.predictor, 16, &sse);
|
x->e_mbd.predictor, 16, &sse);
|
||||||
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
|
this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
|
||||||
@ -1066,7 +1040,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
|
|||||||
xd->dst.y_stride,
|
xd->dst.y_stride,
|
||||||
xd->predictor,
|
xd->predictor,
|
||||||
16);
|
16);
|
||||||
distortion2 = vp8_variance16x16
|
distortion2 = vpx_variance16x16
|
||||||
(*(b->base_src), b->src_stride,
|
(*(b->base_src), b->src_stride,
|
||||||
x->e_mbd.predictor, 16, &sse);
|
x->e_mbd.predictor, 16, &sse);
|
||||||
rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
|
rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
|
||||||
@ -1547,7 +1521,7 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_)
|
|||||||
xd->dst.y_stride,
|
xd->dst.y_stride,
|
||||||
xd->predictor,
|
xd->predictor,
|
||||||
16);
|
16);
|
||||||
distortion = vp8_variance16x16
|
distortion = vpx_variance16x16
|
||||||
(*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
|
(*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
|
||||||
rate = x->mbmode_cost[xd->frame_type][mode];
|
rate = x->mbmode_cost[xd->frame_type][mode];
|
||||||
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
|
this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "./vpx_scale_rtcd.h"
|
#include "./vpx_scale_rtcd.h"
|
||||||
#include "vp8/common/onyxc_int.h"
|
#include "vp8/common/onyxc_int.h"
|
||||||
#include "onyx_int.h"
|
#include "onyx_int.h"
|
||||||
@ -83,7 +84,7 @@ static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source,
|
|||||||
for (j = 0; j < source->y_width; j += 16)
|
for (j = 0; j < source->y_width; j += 16)
|
||||||
{
|
{
|
||||||
unsigned int sse;
|
unsigned int sse;
|
||||||
Total += vp8_mse16x16(src + j, source->y_stride,
|
Total += vpx_mse16x16(src + j, source->y_stride,
|
||||||
dst + j, dest->y_stride,
|
dst + j, dest->y_stride,
|
||||||
&sse);
|
&sse);
|
||||||
}
|
}
|
||||||
|
@ -15,6 +15,7 @@
|
|||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include "vpx_config.h"
|
#include "vpx_config.h"
|
||||||
#include "vp8_rtcd.h"
|
#include "vp8_rtcd.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "tokenize.h"
|
#include "tokenize.h"
|
||||||
#include "treewriter.h"
|
#include "treewriter.h"
|
||||||
#include "onyx_int.h"
|
#include "onyx_int.h"
|
||||||
@ -507,9 +508,9 @@ int VP8_UVSSE(MACROBLOCK *x)
|
|||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
vp8_variance8x8(uptr, pre_stride,
|
vpx_variance8x8(uptr, pre_stride,
|
||||||
upred_ptr, uv_stride, &sse2);
|
upred_ptr, uv_stride, &sse2);
|
||||||
vp8_variance8x8(vptr, pre_stride,
|
vpx_variance8x8(vptr, pre_stride,
|
||||||
vpred_ptr, uv_stride, &sse1);
|
vpred_ptr, uv_stride, &sse1);
|
||||||
sse2 += sse1;
|
sse2 += sse1;
|
||||||
}
|
}
|
||||||
@ -1783,7 +1784,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4],
|
|||||||
if(threshold < x->encode_breakout)
|
if(threshold < x->encode_breakout)
|
||||||
threshold = x->encode_breakout;
|
threshold = x->encode_breakout;
|
||||||
|
|
||||||
var = vp8_variance16x16
|
var = vpx_variance16x16
|
||||||
(*(b->base_src), b->src_stride,
|
(*(b->base_src), b->src_stride,
|
||||||
x->e_mbd.predictor, 16, &sse);
|
x->e_mbd.predictor, 16, &sse);
|
||||||
|
|
||||||
|
@ -145,8 +145,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/intra4x4_predict_v6$(ASM)
|
|||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
|
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
|
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
|
||||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
|
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
|
||||||
@ -168,7 +166,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c
|
|||||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
|
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/reconintra_neon.c
|
||||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
|
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c
|
||||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
|
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
|
||||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/variance_neon.c
|
|
||||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
|
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance_neon.c
|
||||||
|
|
||||||
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
|
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
|
||||||
|
@ -18,7 +18,6 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
|
|||||||
#File list for media
|
#File list for media
|
||||||
# encoder
|
# encoder
|
||||||
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
|
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
|
||||||
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
|
|
||||||
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
|
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
|
||||||
|
|
||||||
#File list for neon
|
#File list for neon
|
||||||
@ -27,5 +26,4 @@ VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
|
|||||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
|
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
|
||||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
|
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
|
||||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
|
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
|
||||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c
|
|
||||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
|
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
|
||||||
|
@ -171,13 +171,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
   get_thr(bs, qdiff, &sad_thr, &vdiff_thr);

   if (bs == BLOCK_16X16) {
-    vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+    vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
     sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
   } else if (bs == BLOCK_32X32) {
-    vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+    vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
     sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
   } else /* if (bs == BLOCK_64X64) */ {
-    vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+    vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
     sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
   }

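Note: the renamed calls above keep the same per-pixel normalization, rounding the block metric and dividing by the pixel count (256, 1024 or 4096). A minimal sketch of that pattern for the 16x16 case, using only the vpx_variance16x16/vpx_sad16x16 signatures visible in this diff; the helper name is hypothetical:

#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

/* Per-pixel difference metrics for a 16x16 block, mirroring mfqe_block():
 * (x + 128) >> 8 is round(x / 256), 256 being the number of pixels. */
static void mfqe_metrics_16x16(const uint8_t *y, int y_stride,
                               const uint8_t *yd, int yd_stride,
                               int *vdiff, int *sad) {
  unsigned int sse;
  *vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
  *sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
}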
@ -797,51 +797,6 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {


 # variance
-add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x4/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x4/, "$sse2_x86inc";
-
 add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";

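These prototypes now live in vpx_dsp, so encoder code calls the vpx_-prefixed entry points declared by ./vpx_dsp_rtcd.h (the include added elsewhere in this commit). A minimal usage sketch based only on the signature shown above; the wrapper name and buffers are placeholders:

#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

/* Variance of a 16x16 source block against a reference block; the raw
 * sum of squared differences is also returned through *sse. */
static unsigned int block_var_16x16(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  return vpx_variance16x16(src, src_stride, ref, ref_stride, &sse);
}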
@ -922,21 +877,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
 add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
 specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";

-add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
-specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
-
 add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
 specialize qw/vp9_avg_8x8 sse2 neon/;

@ -1141,142 +1081,6 @@ specialize qw/vp9_temporal_filter_apply sse2/;

 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

-# variance
-add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance8x4/;
-
-add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance4x8/;
-
-add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_variance4x4/;
-
-add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
-
-add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance8x4/;
-
-add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance4x8/;
-
-add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_variance4x4/;
-
-add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
-
-add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance8x4/;
-
-add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance4x8/;
-
-add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_variance4x4/;
-
-add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
-
-add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
-
 add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";

@ -1511,41 +1315,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
 specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;

-add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse8x16/;
-
-add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse16x8/;
-
-add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse8x16/;
-
-add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse16x8/;
-
-add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse8x16/;
-
-add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse16x8/;
-
-add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
-
 # ENCODEMB INVOKE

@ -10,6 +10,7 @@

 #include <arm_neon.h>
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"

 #include "vpx_ports/mem.h"
@ -20,82 +21,6 @@

 #include "vp9/encoder/vp9_variance.h"

-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
-  const int32x4_t a = vpaddlq_s16(v_16x8);
-  const int64x2_t b = vpaddlq_s32(a);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
-  const int64x2_t b = vpaddlq_s32(v_32x4);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride,
-                             const uint8_t *b, int b_stride,
-                             int w, int h, uint32_t *sse, int *sum) {
-  int i, j;
-  int16x8_t v_sum = vdupq_n_s16(0);
-  int32x4_t v_sse_lo = vdupq_n_s32(0);
-  int32x4_t v_sse_hi = vdupq_n_s32(0);
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 8) {
-      const uint8x8_t v_a = vld1_u8(&a[j]);
-      const uint8x8_t v_b = vld1_u8(&b[j]);
-      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
-      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
-      v_sum = vaddq_s16(v_sum, sv_diff);
-      v_sse_lo = vmlal_s16(v_sse_lo,
-                           vget_low_s16(sv_diff),
-                           vget_low_s16(sv_diff));
-      v_sse_hi = vmlal_s16(v_sse_hi,
-                           vget_high_s16(sv_diff),
-                           vget_high_s16(sv_diff));
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-
-  *sum = horizontal_add_s16x8(v_sum);
-  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
-                        const uint8_t *ref_ptr, int ref_stride,
-                        unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8,
-                   8, sse, sum);
-}
-
-unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
-                                  const uint8_t *b, int b_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 6);  // >> 6 = / 8 * 8
-}
-
-void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
-                          const uint8_t *ref_ptr, int ref_stride,
-                          unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16,
-                   16, sse, sum);
-}
-
-unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 8);  // >> 8 = / 16 * 16
-}
-
 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
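The NEON kernel removed above accumulates the pixel-difference sum and sum of squares in a single pass and then applies variance = sse - sum^2 / (w * h); the ">> 6" and ">> 8" shifts are that division for 8x8 and 16x16 blocks. A plain-C reference sketch of the same computation; the function name is illustrative, not part of the library:

#include <stdint.h>

/* Reference (non-SIMD) variance of a w x h block: one pass accumulating
 * sum and sum-of-squares of the differences, then var = sse - sum^2/(w*h). */
static unsigned int variance_ref(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride,
                                 int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sse_acc = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = a[j] - b[j];
      sum += diff;
      sse_acc += (uint64_t)(diff * diff);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (unsigned int)sse_acc;
  return (unsigned int)(sse_acc - (uint64_t)((sum * sum) / (w * h)));
}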
@ -162,7 +87,7 @@ unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
                             BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
                             8, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
 }

 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
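Each sub-pixel variance in this file follows the same two-pass pattern: a horizontal 2-tap bilinear filter into a scratch buffer, a vertical pass, then the full-pel variance of the filtered block against the reference. A rough scalar sketch of that flow for the 8x8 case; the real code looks the filter pair up via BILINEAR_FILTERS_2TAP, while here the pair (128 - off, off) is derived directly from a hypothetical 7-bit offset, and src is assumed to have at least 9x9 readable pixels:

#include <stdint.h>

static unsigned int subpel_variance8x8_sketch(const uint8_t *src, int src_stride,
                                              int xoff, int yoff,
                                              const uint8_t *ref, int ref_stride,
                                              unsigned int *sse) {
  uint8_t hbuf[9 * 8];  /* horizontal pass keeps one extra row for pass two */
  uint8_t vbuf[8 * 8];
  int64_t sum = 0;
  uint64_t sq = 0;
  int r, c;

  for (r = 0; r < 9; ++r)        /* pass 1: horizontal bilinear filter */
    for (c = 0; c < 8; ++c)
      hbuf[r * 8 + c] = (uint8_t)((src[r * src_stride + c] * (128 - xoff) +
                                   src[r * src_stride + c + 1] * xoff + 64) >> 7);
  for (r = 0; r < 8; ++r)        /* pass 2: vertical bilinear filter */
    for (c = 0; c < 8; ++c)
      vbuf[r * 8 + c] = (uint8_t)((hbuf[r * 8 + c] * (128 - yoff) +
                                   hbuf[(r + 1) * 8 + c] * yoff + 64) >> 7);
  for (r = 0; r < 8; ++r)        /* pass 3: full-pel variance vs. reference */
    for (c = 0; c < 8; ++c) {
      const int d = vbuf[r * 8 + c] - ref[r * ref_stride + c];
      sum += d;
      sq += (uint64_t)(d * d);
    }
  *sse = (unsigned int)sq;
  return (unsigned int)(sq - (uint64_t)((sum * sum) >> 6));  /* / 64 pixels */
}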
@ -180,77 +105,7 @@ unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
                              BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
                              16, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
+  return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
-}
-
-void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
-                          const uint8_t *ref_ptr, int ref_stride,
-                          unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32,
-                   32, sse, sum);
-}
-
-unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / 32 * 32
-}
-
-unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
-  variance_neon_w8(a + (32 * a_stride), a_stride,
-                   b + (32 * b_stride), b_stride, 32, 32,
-                   &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
-}
-
-unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride,
-                   b + (16 * b_stride), b_stride, 64, 16,
-                   &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
-}
-
-unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride,
-                   b + (16 * b_stride), b_stride, 64, 16,
-                   &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
-                   b + (16 * 2 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
-                   b + (16 * 3 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / 64 * 64
 }

 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
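The removed large-block kernels work around the int16 accumulator limit noted earlier (w * h must stay below 2048): a 64x64 block is processed as four 64x16 strips whose partial sse/sum results are combined before the final subtraction. A compact sketch of that strip accumulation, with variance_strip() as an illustrative stand-in for the per-strip NEON kernel:

#include <stdint.h>

/* Per-strip kernel: accumulates sum and sse over a w x h region. */
static void variance_strip(const uint8_t *a, int a_stride,
                           const uint8_t *b, int b_stride,
                           int w, int h, uint32_t *sse, int *sum) {
  int r, c;
  *sse = 0;
  *sum = 0;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int d = a[c] - b[c];
      *sum += d;
      *sse += (uint32_t)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }
}

/* 64x64 variance built from four 64x16 strips, mirroring the removed
 * vp9_variance64x64_neon(): combine partials, then var = sse - sum^2/4096. */
static unsigned int variance64x64_sketch(const uint8_t *a, int a_stride,
                                         const uint8_t *b, int b_stride,
                                         unsigned int *sse) {
  uint32_t sse_total = 0, sse_part;
  int sum_total = 0, sum_part;
  int strip;
  for (strip = 0; strip < 4; ++strip) {
    variance_strip(a + 16 * strip * a_stride, a_stride,
                   b + 16 * strip * b_stride, b_stride, 64, 16,
                   &sse_part, &sum_part);
    sse_total += sse_part;
    sum_total += sum_part;
  }
  *sse = sse_total;
  return (unsigned int)(*sse - (((int64_t)sum_total * sum_total) >> 12));
}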
@ -268,7 +123,7 @@ unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
                              BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
                              32, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+  return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
 }

 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
@ -286,5 +141,5 @@ unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
                              BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
                              64, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+  return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
 }
@ -98,9 +98,9 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
     int avg;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      highbd_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+      highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
                       CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
                       &sse, &avg);
       sse >>= 2 * (xd->bd - 8);
       avg >>= (xd->bd - 8);
     } else {
@ -13,6 +13,7 @@
 #include <stdio.h>

 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"

 #include "vpx_ports/mem.h"
@ -3672,15 +3673,15 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
       if (cm->use_highbitdepth) {
         switch (cm->bit_depth) {
           case VPX_BITS_8:
-            vp9_highbd_get16x16var(src, src_stride, last_src, last_stride,
+            vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
                                    &var16->sse, &var16->sum);
             break;
           case VPX_BITS_10:
-            vp9_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
+            vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
                                       &var16->sse, &var16->sum);
             break;
           case VPX_BITS_12:
-            vp9_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
+            vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
                                       &var16->sse, &var16->sum);
             break;
           default:
@ -3689,11 +3690,11 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
             return -1;
         }
       } else {
-        vp9_get16x16var(src, src_stride, last_src, last_stride,
+        vpx_get16x16var(src, src_stride, last_src, last_stride,
                         &var16->sse, &var16->sum);
       }
 #else
-      vp9_get16x16var(src, src_stride, last_src, last_stride,
+      vpx_get16x16var(src, src_stride, last_src, last_stride,
                       &var16->sse, &var16->sum);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       var16->var = var16->sse -
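The trailing context line derives the block's variance from the two accumulators returned by get16x16var. A one-line sketch of that relationship for a 16x16 block, with an illustrative helper name:

/* var = E[d^2] - E[d]^2, scaled by the pixel count (256 for 16x16). */
static unsigned int var_from_sse_sum_16x16(unsigned int sse, int sum) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> 8);  /* >> 8 == / 256 */
}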
@ -998,7 +998,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X16,
                    vpx_highbd_sad32x16_bits8,
                    vpx_highbd_sad32x16_avg_bits8,
-                   vp9_highbd_variance32x16,
+                   vpx_highbd_8_variance32x16,
                    vp9_highbd_sub_pixel_variance32x16,
                    vp9_highbd_sub_pixel_avg_variance32x16,
                    NULL,
@ -1008,7 +1008,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X32,
                    vpx_highbd_sad16x32_bits8,
                    vpx_highbd_sad16x32_avg_bits8,
-                   vp9_highbd_variance16x32,
+                   vpx_highbd_8_variance16x32,
                    vp9_highbd_sub_pixel_variance16x32,
                    vp9_highbd_sub_pixel_avg_variance16x32,
                    NULL,
@ -1018,7 +1018,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X32,
                    vpx_highbd_sad64x32_bits8,
                    vpx_highbd_sad64x32_avg_bits8,
-                   vp9_highbd_variance64x32,
+                   vpx_highbd_8_variance64x32,
                    vp9_highbd_sub_pixel_variance64x32,
                    vp9_highbd_sub_pixel_avg_variance64x32,
                    NULL,
@ -1028,7 +1028,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X64,
                    vpx_highbd_sad32x64_bits8,
                    vpx_highbd_sad32x64_avg_bits8,
-                   vp9_highbd_variance32x64,
+                   vpx_highbd_8_variance32x64,
                    vp9_highbd_sub_pixel_variance32x64,
                    vp9_highbd_sub_pixel_avg_variance32x64,
                    NULL,
@ -1038,7 +1038,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X32,
                    vpx_highbd_sad32x32_bits8,
                    vpx_highbd_sad32x32_avg_bits8,
-                   vp9_highbd_variance32x32,
+                   vpx_highbd_8_variance32x32,
                    vp9_highbd_sub_pixel_variance32x32,
                    vp9_highbd_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits8,
@ -1048,7 +1048,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X64,
                    vpx_highbd_sad64x64_bits8,
                    vpx_highbd_sad64x64_avg_bits8,
-                   vp9_highbd_variance64x64,
+                   vpx_highbd_8_variance64x64,
                    vp9_highbd_sub_pixel_variance64x64,
                    vp9_highbd_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits8,
@ -1058,7 +1058,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X16,
                    vpx_highbd_sad16x16_bits8,
                    vpx_highbd_sad16x16_avg_bits8,
-                   vp9_highbd_variance16x16,
+                   vpx_highbd_8_variance16x16,
                    vp9_highbd_sub_pixel_variance16x16,
                    vp9_highbd_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits8,
@ -1068,7 +1068,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X8,
                    vpx_highbd_sad16x8_bits8,
                    vpx_highbd_sad16x8_avg_bits8,
-                   vp9_highbd_variance16x8,
+                   vpx_highbd_8_variance16x8,
                    vp9_highbd_sub_pixel_variance16x8,
                    vp9_highbd_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits8,
@ -1078,7 +1078,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X16,
                    vpx_highbd_sad8x16_bits8,
                    vpx_highbd_sad8x16_avg_bits8,
-                   vp9_highbd_variance8x16,
+                   vpx_highbd_8_variance8x16,
                    vp9_highbd_sub_pixel_variance8x16,
                    vp9_highbd_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits8,
@ -1088,7 +1088,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X8,
                    vpx_highbd_sad8x8_bits8,
                    vpx_highbd_sad8x8_avg_bits8,
-                   vp9_highbd_variance8x8,
+                   vpx_highbd_8_variance8x8,
                    vp9_highbd_sub_pixel_variance8x8,
                    vp9_highbd_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits8,
@ -1098,7 +1098,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X4,
                    vpx_highbd_sad8x4_bits8,
                    vpx_highbd_sad8x4_avg_bits8,
-                   vp9_highbd_variance8x4,
+                   vpx_highbd_8_variance8x4,
                    vp9_highbd_sub_pixel_variance8x4,
                    vp9_highbd_sub_pixel_avg_variance8x4,
                    NULL,
@ -1108,7 +1108,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X8,
                    vpx_highbd_sad4x8_bits8,
                    vpx_highbd_sad4x8_avg_bits8,
-                   vp9_highbd_variance4x8,
+                   vpx_highbd_8_variance4x8,
                    vp9_highbd_sub_pixel_variance4x8,
                    vp9_highbd_sub_pixel_avg_variance4x8,
                    NULL,
@ -1118,7 +1118,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X4,
                    vpx_highbd_sad4x4_bits8,
                    vpx_highbd_sad4x4_avg_bits8,
-                   vp9_highbd_variance4x4,
+                   vpx_highbd_8_variance4x4,
                    vp9_highbd_sub_pixel_variance4x4,
                    vp9_highbd_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits8,
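Each HIGHBD_BFP entry populates the per-block-size function table with bit-depth-specific pointers; only the plain variance slot is repointed at the vpx_dsp implementation here. As a rough illustration of the dispatch-table pattern only, the struct and macro below are simplified stand-ins and not the encoder's actual definitions:

#include <stdint.h>

typedef unsigned int (*sad_fn_t)(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride);
typedef unsigned int (*var_fn_t)(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 unsigned int *sse);

typedef struct {
  sad_fn_t sdf;  /* SAD */
  var_fn_t vf;   /* variance: the slot switched to vpx_dsp in this commit */
} block_fn_ptrs;

/* One table entry per block size; BT would be an index such as BLOCK_32X16. */
#define SET_BFP(tbl, BT, SDF, VF) \
  do {                            \
    (tbl)[BT].sdf = (SDF);        \
    (tbl)[BT].vf = (VF);          \
  } while (0)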
@ -1130,7 +1130,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X16,
                    vpx_highbd_sad32x16_bits10,
                    vpx_highbd_sad32x16_avg_bits10,
-                   vp9_highbd_10_variance32x16,
+                   vpx_highbd_10_variance32x16,
                    vp9_highbd_10_sub_pixel_variance32x16,
                    vp9_highbd_10_sub_pixel_avg_variance32x16,
                    NULL,
@ -1140,7 +1140,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X32,
                    vpx_highbd_sad16x32_bits10,
                    vpx_highbd_sad16x32_avg_bits10,
-                   vp9_highbd_10_variance16x32,
+                   vpx_highbd_10_variance16x32,
                    vp9_highbd_10_sub_pixel_variance16x32,
                    vp9_highbd_10_sub_pixel_avg_variance16x32,
                    NULL,
@ -1150,7 +1150,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X32,
                    vpx_highbd_sad64x32_bits10,
                    vpx_highbd_sad64x32_avg_bits10,
-                   vp9_highbd_10_variance64x32,
+                   vpx_highbd_10_variance64x32,
                    vp9_highbd_10_sub_pixel_variance64x32,
                    vp9_highbd_10_sub_pixel_avg_variance64x32,
                    NULL,
@ -1160,7 +1160,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X64,
                    vpx_highbd_sad32x64_bits10,
                    vpx_highbd_sad32x64_avg_bits10,
-                   vp9_highbd_10_variance32x64,
+                   vpx_highbd_10_variance32x64,
                    vp9_highbd_10_sub_pixel_variance32x64,
                    vp9_highbd_10_sub_pixel_avg_variance32x64,
                    NULL,
@ -1170,7 +1170,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X32,
                    vpx_highbd_sad32x32_bits10,
                    vpx_highbd_sad32x32_avg_bits10,
-                   vp9_highbd_10_variance32x32,
+                   vpx_highbd_10_variance32x32,
                    vp9_highbd_10_sub_pixel_variance32x32,
                    vp9_highbd_10_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits10,
@ -1180,7 +1180,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X64,
                    vpx_highbd_sad64x64_bits10,
                    vpx_highbd_sad64x64_avg_bits10,
-                   vp9_highbd_10_variance64x64,
+                   vpx_highbd_10_variance64x64,
                    vp9_highbd_10_sub_pixel_variance64x64,
                    vp9_highbd_10_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits10,
@ -1190,7 +1190,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X16,
                    vpx_highbd_sad16x16_bits10,
                    vpx_highbd_sad16x16_avg_bits10,
-                   vp9_highbd_10_variance16x16,
+                   vpx_highbd_10_variance16x16,
                    vp9_highbd_10_sub_pixel_variance16x16,
                    vp9_highbd_10_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits10,
@ -1200,7 +1200,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X8,
                    vpx_highbd_sad16x8_bits10,
                    vpx_highbd_sad16x8_avg_bits10,
-                   vp9_highbd_10_variance16x8,
+                   vpx_highbd_10_variance16x8,
                    vp9_highbd_10_sub_pixel_variance16x8,
                    vp9_highbd_10_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits10,
@ -1210,7 +1210,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X16,
                    vpx_highbd_sad8x16_bits10,
                    vpx_highbd_sad8x16_avg_bits10,
-                   vp9_highbd_10_variance8x16,
+                   vpx_highbd_10_variance8x16,
                    vp9_highbd_10_sub_pixel_variance8x16,
                    vp9_highbd_10_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits10,
@ -1220,7 +1220,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X8,
                    vpx_highbd_sad8x8_bits10,
                    vpx_highbd_sad8x8_avg_bits10,
-                   vp9_highbd_10_variance8x8,
+                   vpx_highbd_10_variance8x8,
                    vp9_highbd_10_sub_pixel_variance8x8,
                    vp9_highbd_10_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits10,
@ -1230,7 +1230,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X4,
                    vpx_highbd_sad8x4_bits10,
                    vpx_highbd_sad8x4_avg_bits10,
-                   vp9_highbd_10_variance8x4,
+                   vpx_highbd_10_variance8x4,
                    vp9_highbd_10_sub_pixel_variance8x4,
                    vp9_highbd_10_sub_pixel_avg_variance8x4,
                    NULL,
@ -1240,7 +1240,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X8,
                    vpx_highbd_sad4x8_bits10,
                    vpx_highbd_sad4x8_avg_bits10,
-                   vp9_highbd_10_variance4x8,
+                   vpx_highbd_10_variance4x8,
                    vp9_highbd_10_sub_pixel_variance4x8,
                    vp9_highbd_10_sub_pixel_avg_variance4x8,
                    NULL,
@ -1250,7 +1250,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X4,
                    vpx_highbd_sad4x4_bits10,
                    vpx_highbd_sad4x4_avg_bits10,
-                   vp9_highbd_10_variance4x4,
+                   vpx_highbd_10_variance4x4,
                    vp9_highbd_10_sub_pixel_variance4x4,
                    vp9_highbd_10_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits10,
@ -1262,7 +1262,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X16,
                    vpx_highbd_sad32x16_bits12,
                    vpx_highbd_sad32x16_avg_bits12,
-                   vp9_highbd_12_variance32x16,
+                   vpx_highbd_12_variance32x16,
                    vp9_highbd_12_sub_pixel_variance32x16,
                    vp9_highbd_12_sub_pixel_avg_variance32x16,
                    NULL,
@ -1272,7 +1272,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X32,
                    vpx_highbd_sad16x32_bits12,
                    vpx_highbd_sad16x32_avg_bits12,
-                   vp9_highbd_12_variance16x32,
+                   vpx_highbd_12_variance16x32,
                    vp9_highbd_12_sub_pixel_variance16x32,
                    vp9_highbd_12_sub_pixel_avg_variance16x32,
                    NULL,
@ -1282,7 +1282,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X32,
                    vpx_highbd_sad64x32_bits12,
                    vpx_highbd_sad64x32_avg_bits12,
-                   vp9_highbd_12_variance64x32,
+                   vpx_highbd_12_variance64x32,
                    vp9_highbd_12_sub_pixel_variance64x32,
                    vp9_highbd_12_sub_pixel_avg_variance64x32,
                    NULL,
@ -1292,7 +1292,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X64,
                    vpx_highbd_sad32x64_bits12,
                    vpx_highbd_sad32x64_avg_bits12,
-                   vp9_highbd_12_variance32x64,
+                   vpx_highbd_12_variance32x64,
                    vp9_highbd_12_sub_pixel_variance32x64,
                    vp9_highbd_12_sub_pixel_avg_variance32x64,
                    NULL,
@ -1302,7 +1302,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_32X32,
                    vpx_highbd_sad32x32_bits12,
                    vpx_highbd_sad32x32_avg_bits12,
-                   vp9_highbd_12_variance32x32,
+                   vpx_highbd_12_variance32x32,
                    vp9_highbd_12_sub_pixel_variance32x32,
                    vp9_highbd_12_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits12,
@ -1312,7 +1312,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_64X64,
                    vpx_highbd_sad64x64_bits12,
                    vpx_highbd_sad64x64_avg_bits12,
-                   vp9_highbd_12_variance64x64,
+                   vpx_highbd_12_variance64x64,
                    vp9_highbd_12_sub_pixel_variance64x64,
                    vp9_highbd_12_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits12,
@ -1322,7 +1322,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X16,
                    vpx_highbd_sad16x16_bits12,
                    vpx_highbd_sad16x16_avg_bits12,
-                   vp9_highbd_12_variance16x16,
+                   vpx_highbd_12_variance16x16,
                    vp9_highbd_12_sub_pixel_variance16x16,
                    vp9_highbd_12_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits12,
@ -1332,7 +1332,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_16X8,
                    vpx_highbd_sad16x8_bits12,
                    vpx_highbd_sad16x8_avg_bits12,
-                   vp9_highbd_12_variance16x8,
+                   vpx_highbd_12_variance16x8,
                    vp9_highbd_12_sub_pixel_variance16x8,
                    vp9_highbd_12_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits12,
@ -1342,7 +1342,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X16,
                    vpx_highbd_sad8x16_bits12,
                    vpx_highbd_sad8x16_avg_bits12,
-                   vp9_highbd_12_variance8x16,
+                   vpx_highbd_12_variance8x16,
                    vp9_highbd_12_sub_pixel_variance8x16,
                    vp9_highbd_12_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits12,
@ -1352,7 +1352,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X8,
                    vpx_highbd_sad8x8_bits12,
                    vpx_highbd_sad8x8_avg_bits12,
-                   vp9_highbd_12_variance8x8,
+                   vpx_highbd_12_variance8x8,
                    vp9_highbd_12_sub_pixel_variance8x8,
                    vp9_highbd_12_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits12,
@ -1362,7 +1362,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_8X4,
                    vpx_highbd_sad8x4_bits12,
                    vpx_highbd_sad8x4_avg_bits12,
-                   vp9_highbd_12_variance8x4,
+                   vpx_highbd_12_variance8x4,
                    vp9_highbd_12_sub_pixel_variance8x4,
                    vp9_highbd_12_sub_pixel_avg_variance8x4,
                    NULL,
@ -1372,7 +1372,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X8,
                    vpx_highbd_sad4x8_bits12,
                    vpx_highbd_sad4x8_avg_bits12,
-                   vp9_highbd_12_variance4x8,
+                   vpx_highbd_12_variance4x8,
                    vp9_highbd_12_sub_pixel_variance4x8,
                    vp9_highbd_12_sub_pixel_avg_variance4x8,
                    NULL,
@ -1382,7 +1382,7 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
         HIGHBD_BFP(BLOCK_4X4,
                    vpx_highbd_sad4x4_bits12,
                    vpx_highbd_sad4x4_avg_bits12,
-                   vp9_highbd_12_variance4x4,
+                   vpx_highbd_12_variance4x4,
                    vp9_highbd_12_sub_pixel_variance4x4,
                    vp9_highbd_12_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits12,
@ -1805,61 +1805,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
|
|||||||
cpi->fn_ptr[BT].sdx4df = SDX4DF;
|
cpi->fn_ptr[BT].sdx4df = SDX4DF;
|
||||||
|
|
||||||
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
|
BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
|
||||||
vp9_variance32x16, vp9_sub_pixel_variance32x16,
|
vpx_variance32x16, vp9_sub_pixel_variance32x16,
|
||||||
vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
|
vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
|
||||||
|
|
||||||
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
|
BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
|
||||||
vp9_variance16x32, vp9_sub_pixel_variance16x32,
|
vpx_variance16x32, vp9_sub_pixel_variance16x32,
|
||||||
vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
|
vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
|
||||||
|
|
||||||
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
|
BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
|
||||||
vp9_variance64x32, vp9_sub_pixel_variance64x32,
|
vpx_variance64x32, vp9_sub_pixel_variance64x32,
|
||||||
vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
|
vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
|
||||||
|
|
||||||
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
|
BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
|
||||||
vp9_variance32x64, vp9_sub_pixel_variance32x64,
|
vpx_variance32x64, vp9_sub_pixel_variance32x64,
|
||||||
vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
|
vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
|
||||||
|
|
||||||
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
|
BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
|
||||||
vp9_variance32x32, vp9_sub_pixel_variance32x32,
|
vpx_variance32x32, vp9_sub_pixel_variance32x32,
|
||||||
vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
|
vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
|
||||||
vpx_sad32x32x4d)
|
vpx_sad32x32x4d)
|
||||||
|
|
||||||
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
|
BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
|
||||||
vp9_variance64x64, vp9_sub_pixel_variance64x64,
|
vpx_variance64x64, vp9_sub_pixel_variance64x64,
|
||||||
vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
|
vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
|
||||||
vpx_sad64x64x4d)
|
vpx_sad64x64x4d)
|
||||||
|
|
||||||
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
|
BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
|
||||||
vp9_variance16x16, vp9_sub_pixel_variance16x16,
|
vpx_variance16x16, vp9_sub_pixel_variance16x16,
|
||||||
vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
|
vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
|
||||||
vpx_sad16x16x4d)
|
vpx_sad16x16x4d)
|
||||||
|
|
||||||
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
|
BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
|
||||||
vp9_variance16x8, vp9_sub_pixel_variance16x8,
|
vpx_variance16x8, vp9_sub_pixel_variance16x8,
|
||||||
vp9_sub_pixel_avg_variance16x8,
|
vp9_sub_pixel_avg_variance16x8,
|
||||||
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
|
vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
|
||||||
|
|
||||||
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
|
BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
|
||||||
vp9_variance8x16, vp9_sub_pixel_variance8x16,
|
vpx_variance8x16, vp9_sub_pixel_variance8x16,
|
||||||
vp9_sub_pixel_avg_variance8x16,
|
vp9_sub_pixel_avg_variance8x16,
|
||||||
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
|
vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
|
||||||
|
|
||||||
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
|
BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
|
||||||
vp9_variance8x8, vp9_sub_pixel_variance8x8,
|
vpx_variance8x8, vp9_sub_pixel_variance8x8,
|
||||||
vp9_sub_pixel_avg_variance8x8,
|
vp9_sub_pixel_avg_variance8x8,
|
||||||
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
|
vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
|
||||||
|
|
||||||
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
|
BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
|
||||||
vp9_variance8x4, vp9_sub_pixel_variance8x4,
|
vpx_variance8x4, vp9_sub_pixel_variance8x4,
|
||||||
vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
|
vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
|
||||||
|
|
||||||
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
|
BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
|
||||||
vp9_variance4x8, vp9_sub_pixel_variance4x8,
|
vpx_variance4x8, vp9_sub_pixel_variance4x8,
|
||||||
vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
|
vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
|
||||||
|
|
||||||
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
|
BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
|
||||||
vp9_variance4x4, vp9_sub_pixel_variance4x4,
|
vpx_variance4x4, vp9_sub_pixel_variance4x4,
|
||||||
vp9_sub_pixel_avg_variance4x4,
|
vp9_sub_pixel_avg_variance4x4,
|
||||||
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
|
vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
|
||||||
|
|
||||||
@ -2079,7 +2079,7 @@ static int64_t get_sse(const uint8_t *a, int a_stride,
|
|||||||
const uint8_t *pa = a;
|
const uint8_t *pa = a;
|
||||||
const uint8_t *pb = b;
|
const uint8_t *pb = b;
|
||||||
for (x = 0; x < width / 16; ++x) {
|
for (x = 0; x < width / 16; ++x) {
|
||||||
vp9_mse16x16(pa, a_stride, pb, b_stride, &sse);
|
vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
|
||||||
total_sse += sse;
|
total_sse += sse;
|
||||||
|
|
||||||
pa += 16;
|
pa += 16;
|
||||||
@ -2124,21 +2124,21 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
|
|||||||
unsigned int sse = 0;
|
unsigned int sse = 0;
|
||||||
int sum = 0;
|
int sum = 0;
|
||||||
if (dw > 0) {
|
if (dw > 0) {
|
||||||
highbd_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
|
highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
|
||||||
dw, height, &sse, &sum);
|
dw, height, &sse, &sum);
|
||||||
total_sse += sse;
|
total_sse += sse;
|
||||||
}
|
}
|
||||||
if (dh > 0) {
|
if (dh > 0) {
|
||||||
highbd_variance(&a[(height - dh) * a_stride], a_stride,
|
highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
|
||||||
&b[(height - dh) * b_stride], b_stride,
|
&b[(height - dh) * b_stride], b_stride,
|
||||||
width - dw, dh, &sse, &sum);
|
width - dw, dh, &sse, &sum);
|
||||||
total_sse += sse;
|
total_sse += sse;
|
||||||
}
|
}
|
||||||
for (y = 0; y < height / 16; ++y) {
|
for (y = 0; y < height / 16; ++y) {
|
||||||
const uint8_t *pa = a;
|
const uint8_t *pa = a;
|
||||||
const uint8_t *pb = b;
|
const uint8_t *pb = b;
|
||||||
for (x = 0; x < width / 16; ++x) {
|
for (x = 0; x < width / 16; ++x) {
|
||||||
vp9_highbd_mse16x16(pa, a_stride, pb, b_stride, &sse);
|
vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
|
||||||
total_sse += sse;
|
total_sse += sse;
|
||||||
pa += 16;
|
pa += 16;
|
||||||
pb += 16;
|
pb += 16;
|
||||||
|
@ -12,6 +12,7 @@
|
|||||||
#include <math.h>
|
#include <math.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
#include "./vpx_scale_rtcd.h"
|
#include "./vpx_scale_rtcd.h"
|
||||||
|
|
||||||
#include "vpx_mem/vpx_mem.h"
|
#include "vpx_mem/vpx_mem.h"
|
||||||
@ -267,13 +268,13 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
|
|||||||
static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
|
static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
|
||||||
switch (bsize) {
|
switch (bsize) {
|
||||||
case BLOCK_8X8:
|
case BLOCK_8X8:
|
||||||
return vp9_mse8x8;
|
return vpx_mse8x8;
|
||||||
case BLOCK_16X8:
|
case BLOCK_16X8:
|
||||||
return vp9_mse16x8;
|
return vpx_mse16x8;
|
||||||
case BLOCK_8X16:
|
case BLOCK_8X16:
|
||||||
return vp9_mse8x16;
|
return vpx_mse8x16;
|
||||||
default:
|
default:
|
||||||
return vp9_mse16x16;
|
return vpx_mse16x16;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -293,37 +294,37 @@ static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
|
|||||||
default:
|
default:
|
||||||
switch (bsize) {
|
switch (bsize) {
|
||||||
case BLOCK_8X8:
|
case BLOCK_8X8:
|
||||||
return vp9_highbd_mse8x8;
|
return vpx_highbd_8_mse8x8;
|
||||||
case BLOCK_16X8:
|
case BLOCK_16X8:
|
||||||
return vp9_highbd_mse16x8;
|
return vpx_highbd_8_mse16x8;
|
||||||
case BLOCK_8X16:
|
case BLOCK_8X16:
|
||||||
return vp9_highbd_mse8x16;
|
return vpx_highbd_8_mse8x16;
|
||||||
default:
|
default:
|
||||||
return vp9_highbd_mse16x16;
|
return vpx_highbd_8_mse16x16;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 10:
|
case 10:
|
||||||
switch (bsize) {
|
switch (bsize) {
|
||||||
case BLOCK_8X8:
|
case BLOCK_8X8:
|
||||||
return vp9_highbd_10_mse8x8;
|
return vpx_highbd_10_mse8x8;
|
||||||
case BLOCK_16X8:
|
case BLOCK_16X8:
|
||||||
return vp9_highbd_10_mse16x8;
|
return vpx_highbd_10_mse16x8;
|
||||||
case BLOCK_8X16:
|
case BLOCK_8X16:
|
||||||
return vp9_highbd_10_mse8x16;
|
return vpx_highbd_10_mse8x16;
|
||||||
default:
|
default:
|
||||||
return vp9_highbd_10_mse16x16;
|
return vpx_highbd_10_mse16x16;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 12:
|
case 12:
|
||||||
switch (bsize) {
|
switch (bsize) {
|
||||||
case BLOCK_8X8:
|
case BLOCK_8X8:
|
||||||
return vp9_highbd_12_mse8x8;
|
return vpx_highbd_12_mse8x8;
|
||||||
case BLOCK_16X8:
|
case BLOCK_16X8:
|
||||||
return vp9_highbd_12_mse16x8;
|
return vpx_highbd_12_mse16x8;
|
||||||
case BLOCK_8X16:
|
case BLOCK_8X16:
|
||||||
return vp9_highbd_12_mse8x16;
|
return vpx_highbd_12_mse8x16;
|
||||||
default:
|
default:
|
||||||
return vp9_highbd_12_mse16x16;
|
return vpx_highbd_12_mse16x16;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -634,7 +635,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
|
|||||||
xd->mi[0]->mbmi.tx_size = use_dc_pred ?
|
xd->mi[0]->mbmi.tx_size = use_dc_pred ?
|
||||||
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
|
(bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
|
||||||
vp9_encode_intra_block_plane(x, bsize, 0);
|
vp9_encode_intra_block_plane(x, bsize, 0);
|
||||||
this_error = vp9_get_mb_ss(x->plane[0].src_diff);
|
this_error = vpx_get_mb_ss(x->plane[0].src_diff);
|
||||||
#if CONFIG_VP9_HIGHBITDEPTH
|
#if CONFIG_VP9_HIGHBITDEPTH
|
||||||
if (cm->use_highbitdepth) {
|
if (cm->use_highbitdepth) {
|
||||||
switch (cm->bit_depth) {
|
switch (cm->bit_depth) {
|
||||||
|
@ -13,6 +13,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "./vpx_config.h"
|
#include "./vpx_config.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
|
||||||
#include "vpx_mem/vpx_mem.h"
|
#include "vpx_mem/vpx_mem.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
@ -303,13 +304,13 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
|
|||||||
if (second_pred != NULL) {
|
if (second_pred != NULL) {
|
||||||
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
|
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
|
||||||
DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
|
DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
|
||||||
vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
|
vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
|
||||||
y_stride);
|
y_stride);
|
||||||
besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
|
besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
|
||||||
sse1);
|
sse1);
|
||||||
} else {
|
} else {
|
||||||
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
|
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
|
||||||
vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
|
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
|
||||||
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
|
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -321,7 +322,7 @@ static INLINE unsigned int setup_center_error(const MACROBLOCKD *xd,
|
|||||||
(void) xd;
|
(void) xd;
|
||||||
if (second_pred != NULL) {
|
if (second_pred != NULL) {
|
||||||
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
|
DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
|
||||||
vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
|
vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
|
||||||
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
|
besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
|
||||||
} else {
|
} else {
|
||||||
besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
|
besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
|
||||||
|
@ -14,6 +14,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
#include "./vp9_rtcd.h"
|
#include "./vp9_rtcd.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
|
||||||
#include "vpx_mem/vpx_mem.h"
|
#include "vpx_mem/vpx_mem.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
@ -215,7 +216,7 @@ static void block_variance(const uint8_t *src, int src_stride,
|
|||||||
|
|
||||||
for (i = 0; i < h; i += block_size) {
|
for (i = 0; i < h; i += block_size) {
|
||||||
for (j = 0; j < w; j += block_size) {
|
for (j = 0; j < w; j += block_size) {
|
||||||
vp9_get8x8var(src + src_stride * i + j, src_stride,
|
vpx_get8x8var(src + src_stride * i + j, src_stride,
|
||||||
ref + ref_stride * i + j, ref_stride,
|
ref + ref_stride * i + j, ref_stride,
|
||||||
&sse8x8[k], &sum8x8[k]);
|
&sse8x8[k], &sum8x8[k]);
|
||||||
*sse += sse8x8[k];
|
*sse += sse8x8[k];
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include "./vp9_rtcd.h"
|
#include "./vp9_rtcd.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
#include "vpx/vpx_integer.h"
|
#include "vpx/vpx_integer.h"
|
||||||
@ -18,26 +19,6 @@
|
|||||||
|
|
||||||
#include "vp9/encoder/vp9_variance.h"
|
#include "vp9/encoder/vp9_variance.h"
|
||||||
|
|
||||||
void variance(const uint8_t *a, int a_stride,
|
|
||||||
const uint8_t *b, int b_stride,
|
|
||||||
int w, int h, unsigned int *sse, int *sum) {
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
*sum = 0;
|
|
||||||
*sse = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i++) {
|
|
||||||
for (j = 0; j < w; j++) {
|
|
||||||
const int diff = a[j] - b[j];
|
|
||||||
*sum += diff;
|
|
||||||
*sse += diff * diff;
|
|
||||||
}
|
|
||||||
|
|
||||||
a += a_stride;
|
|
||||||
b += b_stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
|
// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
|
||||||
// or vertical direction to produce the filtered output block. Used to implement
|
// or vertical direction to produce the filtered output block. Used to implement
|
||||||
// first-pass of 2-D separable filter.
|
// first-pass of 2-D separable filter.
|
||||||
@ -100,25 +81,6 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
|
|
||||||
unsigned int i, sum = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < 256; ++i) {
|
|
||||||
sum += src_ptr[i] * src_ptr[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define VAR(W, H) \
|
|
||||||
unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
|
|
||||||
const uint8_t *b, int b_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
|
||||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define SUBPIX_VAR(W, H) \
|
#define SUBPIX_VAR(W, H) \
|
||||||
unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
|
unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
|
||||||
const uint8_t *src, int src_stride, \
|
const uint8_t *src, int src_stride, \
|
||||||
@ -133,7 +95,7 @@ unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
|
|||||||
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
|
return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define SUBPIX_AVG_VAR(W, H) \
|
#define SUBPIX_AVG_VAR(W, H) \
|
||||||
@ -152,178 +114,51 @@ unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
|
|||||||
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
|
vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
|
||||||
\
|
\
|
||||||
return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
|
return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
|
|
||||||
const uint8_t *ref_ptr, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum) {
|
|
||||||
variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
|
|
||||||
}
|
|
||||||
|
|
||||||
void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
|
|
||||||
const uint8_t *ref_ptr, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum) {
|
|
||||||
variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
VAR(4, 4)
|
|
||||||
SUBPIX_VAR(4, 4)
|
SUBPIX_VAR(4, 4)
|
||||||
SUBPIX_AVG_VAR(4, 4)
|
SUBPIX_AVG_VAR(4, 4)
|
||||||
|
|
||||||
VAR(4, 8)
|
|
||||||
SUBPIX_VAR(4, 8)
|
SUBPIX_VAR(4, 8)
|
||||||
SUBPIX_AVG_VAR(4, 8)
|
SUBPIX_AVG_VAR(4, 8)
|
||||||
|
|
||||||
VAR(8, 4)
|
|
||||||
SUBPIX_VAR(8, 4)
|
SUBPIX_VAR(8, 4)
|
||||||
SUBPIX_AVG_VAR(8, 4)
|
SUBPIX_AVG_VAR(8, 4)
|
||||||
|
|
||||||
VAR(8, 8)
|
|
||||||
SUBPIX_VAR(8, 8)
|
SUBPIX_VAR(8, 8)
|
||||||
SUBPIX_AVG_VAR(8, 8)
|
SUBPIX_AVG_VAR(8, 8)
|
||||||
|
|
||||||
VAR(8, 16)
|
|
||||||
SUBPIX_VAR(8, 16)
|
SUBPIX_VAR(8, 16)
|
||||||
SUBPIX_AVG_VAR(8, 16)
|
SUBPIX_AVG_VAR(8, 16)
|
||||||
|
|
||||||
VAR(16, 8)
|
|
||||||
SUBPIX_VAR(16, 8)
|
SUBPIX_VAR(16, 8)
|
||||||
SUBPIX_AVG_VAR(16, 8)
|
SUBPIX_AVG_VAR(16, 8)
|
||||||
|
|
||||||
VAR(16, 16)
|
|
||||||
SUBPIX_VAR(16, 16)
|
SUBPIX_VAR(16, 16)
|
||||||
SUBPIX_AVG_VAR(16, 16)
|
SUBPIX_AVG_VAR(16, 16)
|
||||||
|
|
||||||
VAR(16, 32)
|
|
||||||
SUBPIX_VAR(16, 32)
|
SUBPIX_VAR(16, 32)
|
||||||
SUBPIX_AVG_VAR(16, 32)
|
SUBPIX_AVG_VAR(16, 32)
|
||||||
|
|
||||||
VAR(32, 16)
|
|
||||||
SUBPIX_VAR(32, 16)
|
SUBPIX_VAR(32, 16)
|
||||||
SUBPIX_AVG_VAR(32, 16)
|
SUBPIX_AVG_VAR(32, 16)
|
||||||
|
|
||||||
VAR(32, 32)
|
|
||||||
SUBPIX_VAR(32, 32)
|
SUBPIX_VAR(32, 32)
|
||||||
SUBPIX_AVG_VAR(32, 32)
|
SUBPIX_AVG_VAR(32, 32)
|
||||||
|
|
||||||
VAR(32, 64)
|
|
||||||
SUBPIX_VAR(32, 64)
|
SUBPIX_VAR(32, 64)
|
||||||
SUBPIX_AVG_VAR(32, 64)
|
SUBPIX_AVG_VAR(32, 64)
|
||||||
|
|
||||||
VAR(64, 32)
|
|
||||||
SUBPIX_VAR(64, 32)
|
SUBPIX_VAR(64, 32)
|
||||||
SUBPIX_AVG_VAR(64, 32)
|
SUBPIX_AVG_VAR(64, 32)
|
||||||
|
|
||||||
VAR(64, 64)
|
|
||||||
SUBPIX_VAR(64, 64)
|
SUBPIX_VAR(64, 64)
|
||||||
SUBPIX_AVG_VAR(64, 64)
|
SUBPIX_AVG_VAR(64, 64)
|
||||||
|
|
||||||
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
|
|
||||||
int height, const uint8_t *ref, int ref_stride) {
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
for (i = 0; i < height; i++) {
|
|
||||||
for (j = 0; j < width; j++) {
|
|
||||||
const int tmp = pred[j] + ref[j];
|
|
||||||
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
|
|
||||||
}
|
|
||||||
comp_pred += width;
|
|
||||||
pred += width;
|
|
||||||
ref += ref_stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#if CONFIG_VP9_HIGHBITDEPTH
|
#if CONFIG_VP9_HIGHBITDEPTH
|
||||||
void highbd_variance64(const uint8_t *a8, int a_stride,
|
|
||||||
const uint8_t *b8, int b_stride,
|
|
||||||
int w, int h, uint64_t *sse,
|
|
||||||
uint64_t *sum) {
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
|
|
||||||
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
|
|
||||||
*sum = 0;
|
|
||||||
*sse = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i++) {
|
|
||||||
for (j = 0; j < w; j++) {
|
|
||||||
const int diff = a[j] - b[j];
|
|
||||||
*sum += diff;
|
|
||||||
*sse += diff * diff;
|
|
||||||
}
|
|
||||||
a += a_stride;
|
|
||||||
b += b_stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void highbd_variance(const uint8_t *a8, int a_stride,
|
|
||||||
const uint8_t *b8, int b_stride,
|
|
||||||
int w, int h, unsigned int *sse,
|
|
||||||
int *sum) {
|
|
||||||
uint64_t sse_long = 0;
|
|
||||||
uint64_t sum_long = 0;
|
|
||||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
|
||||||
*sse = (unsigned int)sse_long;
|
|
||||||
*sum = (int)sum_long;
|
|
||||||
}
|
|
||||||
|
|
||||||
void highbd_10_variance(const uint8_t *a8, int a_stride,
|
|
||||||
const uint8_t *b8, int b_stride,
|
|
||||||
int w, int h, unsigned int *sse,
|
|
||||||
int *sum) {
|
|
||||||
uint64_t sse_long = 0;
|
|
||||||
uint64_t sum_long = 0;
|
|
||||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
|
||||||
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
|
|
||||||
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
void highbd_12_variance(const uint8_t *a8, int a_stride,
|
|
||||||
const uint8_t *b8, int b_stride,
|
|
||||||
int w, int h, unsigned int *sse,
|
|
||||||
int *sum) {
|
|
||||||
uint64_t sse_long = 0;
|
|
||||||
uint64_t sum_long = 0;
|
|
||||||
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
|
||||||
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
|
|
||||||
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void highbd_var_filter_block2d_bil_first_pass(
|
static void highbd_var_filter_block2d_bil_first_pass(
|
||||||
const uint8_t *src_ptr8,
|
const uint8_t *src_ptr8,
|
||||||
uint16_t *output_ptr,
|
uint16_t *output_ptr,
|
||||||
@ -374,35 +209,6 @@ static void highbd_var_filter_block2d_bil_second_pass(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HIGHBD_VAR(W, H) \
|
|
||||||
unsigned int vp9_highbd_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
|
|
||||||
const uint8_t *b, int b_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
|
||||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
unsigned int vp9_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
|
|
||||||
int a_stride, \
|
|
||||||
const uint8_t *b, \
|
|
||||||
int b_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
|
||||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
unsigned int vp9_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
|
|
||||||
int a_stride, \
|
|
||||||
const uint8_t *b, \
|
|
||||||
int b_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
|
||||||
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define HIGHBD_SUBPIX_VAR(W, H) \
|
#define HIGHBD_SUBPIX_VAR(W, H) \
|
||||||
unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
|
unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
|
||||||
const uint8_t *src, int src_stride, \
|
const uint8_t *src, int src_stride, \
|
||||||
@ -417,7 +223,7 @@ unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
|
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
|
||||||
dst_stride, sse); \
|
dst_stride, sse); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
@ -434,7 +240,7 @@ unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
||||||
W, dst, dst_stride, sse); \
|
W, dst, dst_stride, sse); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
@ -451,7 +257,7 @@ unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
|
||||||
W, dst, dst_stride, sse); \
|
W, dst, dst_stride, sse); \
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -471,10 +277,10 @@ unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
|
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
|
||||||
dst_stride, sse); \
|
dst_stride, sse); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
@ -493,10 +299,10 @@ unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
||||||
W, dst, dst_stride, sse); \
|
W, dst, dst_stride, sse); \
|
||||||
} \
|
} \
|
||||||
\
|
\
|
||||||
@ -515,137 +321,49 @@ unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
|
|||||||
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
|
||||||
BILINEAR_FILTERS_2TAP(yoffset)); \
|
BILINEAR_FILTERS_2TAP(yoffset)); \
|
||||||
\
|
\
|
||||||
vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
|
||||||
CONVERT_TO_BYTEPTR(temp2), W); \
|
CONVERT_TO_BYTEPTR(temp2), W); \
|
||||||
\
|
\
|
||||||
return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
|
||||||
W, dst, dst_stride, sse); \
|
W, dst, dst_stride, sse); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define HIGHBD_GET_VAR(S) \
|
|
||||||
void vp9_highbd_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
|
||||||
const uint8_t *ref, int ref_stride, \
|
|
||||||
unsigned int *sse, int *sum) { \
|
|
||||||
highbd_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
void vp9_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
|
||||||
const uint8_t *ref, int ref_stride, \
|
|
||||||
unsigned int *sse, int *sum) { \
|
|
||||||
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
void vp9_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
|
||||||
const uint8_t *ref, int ref_stride, \
|
|
||||||
unsigned int *sse, int *sum) { \
|
|
||||||
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
|
||||||
}
|
|
||||||
|
|
||||||
#define HIGHBD_MSE(W, H) \
|
|
||||||
unsigned int vp9_highbd_mse##W##x##H##_c(const uint8_t *src, \
|
|
||||||
int src_stride, \
|
|
||||||
const uint8_t *ref, \
|
|
||||||
int ref_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
|
||||||
return *sse; \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
unsigned int vp9_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
|
|
||||||
int src_stride, \
|
|
||||||
const uint8_t *ref, \
|
|
||||||
int ref_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
|
||||||
return *sse; \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
unsigned int vp9_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
|
|
||||||
int src_stride, \
|
|
||||||
const uint8_t *ref, \
|
|
||||||
int ref_stride, \
|
|
||||||
unsigned int *sse) { \
|
|
||||||
int sum; \
|
|
||||||
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
|
||||||
return *sse; \
|
|
||||||
}
|
|
||||||
|
|
||||||
HIGHBD_GET_VAR(8)
|
|
||||||
HIGHBD_GET_VAR(16)
|
|
||||||
|
|
||||||
HIGHBD_MSE(16, 16)
|
|
||||||
HIGHBD_MSE(16, 8)
|
|
||||||
HIGHBD_MSE(8, 16)
|
|
||||||
HIGHBD_MSE(8, 8)
|
|
||||||
|
|
||||||
HIGHBD_VAR(4, 4)
|
|
||||||
HIGHBD_SUBPIX_VAR(4, 4)
|
HIGHBD_SUBPIX_VAR(4, 4)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(4, 4)
|
HIGHBD_SUBPIX_AVG_VAR(4, 4)
|
||||||
|
|
||||||
HIGHBD_VAR(4, 8)
|
|
||||||
HIGHBD_SUBPIX_VAR(4, 8)
|
HIGHBD_SUBPIX_VAR(4, 8)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(4, 8)
|
HIGHBD_SUBPIX_AVG_VAR(4, 8)
|
||||||
|
|
||||||
HIGHBD_VAR(8, 4)
|
|
||||||
HIGHBD_SUBPIX_VAR(8, 4)
|
HIGHBD_SUBPIX_VAR(8, 4)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(8, 4)
|
HIGHBD_SUBPIX_AVG_VAR(8, 4)
|
||||||
|
|
||||||
HIGHBD_VAR(8, 8)
|
|
||||||
HIGHBD_SUBPIX_VAR(8, 8)
|
HIGHBD_SUBPIX_VAR(8, 8)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(8, 8)
|
HIGHBD_SUBPIX_AVG_VAR(8, 8)
|
||||||
|
|
||||||
HIGHBD_VAR(8, 16)
|
|
||||||
HIGHBD_SUBPIX_VAR(8, 16)
|
HIGHBD_SUBPIX_VAR(8, 16)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(8, 16)
|
HIGHBD_SUBPIX_AVG_VAR(8, 16)
|
||||||
|
|
||||||
HIGHBD_VAR(16, 8)
|
|
||||||
HIGHBD_SUBPIX_VAR(16, 8)
|
HIGHBD_SUBPIX_VAR(16, 8)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(16, 8)
|
HIGHBD_SUBPIX_AVG_VAR(16, 8)
|
||||||
|
|
||||||
HIGHBD_VAR(16, 16)
|
|
||||||
HIGHBD_SUBPIX_VAR(16, 16)
|
HIGHBD_SUBPIX_VAR(16, 16)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(16, 16)
|
HIGHBD_SUBPIX_AVG_VAR(16, 16)
|
||||||
|
|
||||||
HIGHBD_VAR(16, 32)
|
|
||||||
HIGHBD_SUBPIX_VAR(16, 32)
|
HIGHBD_SUBPIX_VAR(16, 32)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(16, 32)
|
HIGHBD_SUBPIX_AVG_VAR(16, 32)
|
||||||
|
|
||||||
HIGHBD_VAR(32, 16)
|
|
||||||
HIGHBD_SUBPIX_VAR(32, 16)
|
HIGHBD_SUBPIX_VAR(32, 16)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(32, 16)
|
HIGHBD_SUBPIX_AVG_VAR(32, 16)
|
||||||
|
|
||||||
HIGHBD_VAR(32, 32)
|
|
||||||
HIGHBD_SUBPIX_VAR(32, 32)
|
HIGHBD_SUBPIX_VAR(32, 32)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(32, 32)
|
HIGHBD_SUBPIX_AVG_VAR(32, 32)
|
||||||
|
|
||||||
HIGHBD_VAR(32, 64)
|
|
||||||
HIGHBD_SUBPIX_VAR(32, 64)
|
HIGHBD_SUBPIX_VAR(32, 64)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(32, 64)
|
HIGHBD_SUBPIX_AVG_VAR(32, 64)
|
||||||
|
|
||||||
HIGHBD_VAR(64, 32)
|
|
||||||
HIGHBD_SUBPIX_VAR(64, 32)
|
HIGHBD_SUBPIX_VAR(64, 32)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(64, 32)
|
HIGHBD_SUBPIX_AVG_VAR(64, 32)
|
||||||
|
|
||||||
HIGHBD_VAR(64, 64)
|
|
||||||
HIGHBD_SUBPIX_VAR(64, 64)
|
HIGHBD_SUBPIX_VAR(64, 64)
|
||||||
HIGHBD_SUBPIX_AVG_VAR(64, 64)
|
HIGHBD_SUBPIX_AVG_VAR(64, 64)
|
||||||
|
|
||||||
void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
|
|
||||||
int width, int height, const uint8_t *ref8,
|
|
||||||
int ref_stride) {
|
|
||||||
int i, j;
|
|
||||||
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
for (i = 0; i < height; i++) {
|
|
||||||
for (j = 0; j < width; j++) {
|
|
||||||
const int tmp = pred[j] + ref[j];
|
|
||||||
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
|
|
||||||
}
|
|
||||||
comp_pred += width;
|
|
||||||
pred += width;
|
|
||||||
ref += ref_stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||||
|
@ -12,31 +12,64 @@
|
|||||||
#define VP9_ENCODER_VP9_VARIANCE_H_
|
#define VP9_ENCODER_VP9_VARIANCE_H_
|
||||||
|
|
||||||
#include "vpx/vpx_integer.h"
|
#include "vpx/vpx_integer.h"
|
||||||
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
void variance(const uint8_t *a, int a_stride,
|
// TODO(johannkoenig): All functions which depend on
|
||||||
const uint8_t *b, int b_stride,
|
// [highbd_][8|10|12_]variance should be refactored or moved to vpx_dsp.
|
||||||
int w, int h,
|
static void variance(const uint8_t *a, int a_stride,
|
||||||
unsigned int *sse, int *sum);
|
const uint8_t *b, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
*sum = 0;
|
||||||
|
*sse = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < h; i++) {
|
||||||
|
for (j = 0; j < w; j++) {
|
||||||
|
const int diff = a[j] - b[j];
|
||||||
|
*sum += diff;
|
||||||
|
*sse += diff * diff;
|
||||||
|
}
|
||||||
|
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#if CONFIG_VP9_HIGHBITDEPTH
|
#if CONFIG_VP9_HIGHBITDEPTH
|
||||||
void highbd_variance(const uint8_t *a8, int a_stride,
|
static void highbd_variance64(const uint8_t *a8, int a_stride,
|
||||||
const uint8_t *b8, int b_stride,
|
const uint8_t *b8, int b_stride,
|
||||||
int w, int h,
|
int w, int h, uint64_t *sse, uint64_t *sum) {
|
||||||
unsigned int *sse, int *sum);
|
int i, j;
|
||||||
|
|
||||||
void highbd_10_variance(const uint8_t *a8, int a_stride,
|
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
|
||||||
const uint8_t *b8, int b_stride,
|
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
|
||||||
int w, int h,
|
*sum = 0;
|
||||||
unsigned int *sse, int *sum);
|
*sse = 0;
|
||||||
|
|
||||||
void highbd_12_variance(const uint8_t *a8, int a_stride,
|
for (i = 0; i < h; i++) {
|
||||||
const uint8_t *b8, int b_stride,
|
for (j = 0; j < w; j++) {
|
||||||
int w, int h,
|
const int diff = a[j] - b[j];
|
||||||
unsigned int *sse, int *sum);
|
*sum += diff;
|
||||||
|
*sse += diff * diff;
|
||||||
|
}
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
static void highbd_8_variance(const uint8_t *a8, int a_stride,
|
||||||
|
const uint8_t *b8, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
uint64_t sse_long = 0;
|
||||||
|
uint64_t sum_long = 0;
|
||||||
|
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||||
|
*sse = (unsigned int)sse_long;
|
||||||
|
*sum = (int)sum_long;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
|
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
|
||||||
@ -95,15 +128,6 @@ typedef struct vp9_variance_vtable {
|
|||||||
vp9_sad_multi_d_fn_t sdx4df;
|
vp9_sad_multi_d_fn_t sdx4df;
|
||||||
} vp9_variance_fn_ptr_t;
|
} vp9_variance_fn_ptr_t;
|
||||||
|
|
||||||
void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
|
|
||||||
int height, const uint8_t *ref, int ref_stride);
|
|
||||||
|
|
||||||
#if CONFIG_VP9_HIGHBITDEPTH
|
|
||||||
void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred,
|
|
||||||
int width, int height,
|
|
||||||
const uint8_t *ref, int ref_stride);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
#endif
|
#endif
|
||||||
|
@ -13,237 +13,6 @@
|
|||||||
#include "vp9/encoder/vp9_variance.h"
|
#include "vp9/encoder/vp9_variance.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
uint32_t *sse, int *sum);
|
|
||||||
|
|
||||||
uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
uint32_t *sse, int *sum);
|
|
||||||
|
|
||||||
uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
uint32_t *sse, int *sum);
|
|
||||||
|
|
||||||
static void highbd_variance_sse2(const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
int w, int h, uint32_t *sse, int *sum,
|
|
||||||
high_variance_fn_t var_fn, int block_size) {
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
*sse = 0;
|
|
||||||
*sum = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i += block_size) {
|
|
||||||
for (j = 0; j < w; j += block_size) {
|
|
||||||
unsigned int sse0;
|
|
||||||
int sum0;
|
|
||||||
var_fn(src + src_stride * i + j, src_stride,
|
|
||||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
|
||||||
*sse += sse0;
|
|
||||||
*sum += sum0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
int w, int h, uint32_t *sse, int *sum,
|
|
||||||
high_variance_fn_t var_fn, int block_size) {
|
|
||||||
int i, j;
|
|
||||||
uint64_t sse_long = 0;
|
|
||||||
int64_t sum_long = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i += block_size) {
|
|
||||||
for (j = 0; j < w; j += block_size) {
|
|
||||||
unsigned int sse0;
|
|
||||||
int sum0;
|
|
||||||
var_fn(src + src_stride * i + j, src_stride,
|
|
||||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
|
||||||
sse_long += sse0;
|
|
||||||
sum_long += sum0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*sum = ROUND_POWER_OF_TWO(sum_long, 2);
|
|
||||||
*sse = ROUND_POWER_OF_TWO(sse_long, 4);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
|
|
||||||
const uint16_t *ref, int ref_stride,
|
|
||||||
int w, int h, uint32_t *sse, int *sum,
|
|
||||||
high_variance_fn_t var_fn, int block_size) {
|
|
||||||
int i, j;
|
|
||||||
uint64_t sse_long = 0;
|
|
||||||
int64_t sum_long = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i += block_size) {
|
|
||||||
for (j = 0; j < w; j += block_size) {
|
|
||||||
unsigned int sse0;
|
|
||||||
int sum0;
|
|
||||||
var_fn(src + src_stride * i + j, src_stride,
|
|
||||||
ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
|
|
||||||
sse_long += sse0;
|
|
||||||
sum_long += sum0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*sum = ROUND_POWER_OF_TWO(sum_long, 4);
|
|
||||||
*sse = ROUND_POWER_OF_TWO(sse_long, 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#define HIGH_GET_VAR(S) \
|
|
||||||
void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, \
|
|
||||||
uint32_t *sse, int *sum) { \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
|
|
||||||
sse, sum); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, \
|
|
||||||
uint32_t *sse, int *sum) { \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
|
|
||||||
sse, sum); \
|
|
||||||
*sum = ROUND_POWER_OF_TWO(*sum, 2); \
|
|
||||||
*sse = ROUND_POWER_OF_TWO(*sse, 4); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, \
|
|
||||||
uint32_t *sse, int *sum) { \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
|
|
||||||
sse, sum); \
|
|
||||||
*sum = ROUND_POWER_OF_TWO(*sum, 4); \
|
|
||||||
*sse = ROUND_POWER_OF_TWO(*sse, 8); \
|
|
||||||
}
|
|
||||||
|
|
||||||
HIGH_GET_VAR(16);
|
|
||||||
HIGH_GET_VAR(8);
|
|
||||||
|
|
||||||
#undef HIGH_GET_VAR
|
|
||||||
|
|
||||||
#define VAR_FN(w, h, block_size, shift) \
|
|
||||||
uint32_t vp9_highbd_variance##w##x##h##_sse2( \
|
|
||||||
const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
|
|
||||||
int sum; \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
|
|
||||||
vp9_highbd_calc##block_size##x##block_size##var_sse2, \
|
|
||||||
block_size); \
|
|
||||||
return *sse - (((int64_t)sum * sum) >> shift); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
|
|
||||||
const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
|
|
||||||
int sum; \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
highbd_10_variance_sse2( \
|
|
||||||
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
|
|
||||||
vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
|
|
||||||
return *sse - (((int64_t)sum * sum) >> shift); \
|
|
||||||
} \
|
|
||||||
\
|
|
||||||
uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
|
|
||||||
const uint8_t *src8, int src_stride, \
|
|
||||||
const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
|
|
||||||
int sum; \
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
|
|
||||||
highbd_12_variance_sse2( \
|
|
||||||
src, src_stride, ref, ref_stride, w, h, sse, &sum, \
|
|
||||||
vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
|
|
||||||
return *sse - (((int64_t)sum * sum) >> shift); \
|
|
||||||
}
|
|
||||||
|
|
||||||
VAR_FN(64, 64, 16, 12);
|
|
||||||
VAR_FN(64, 32, 16, 11);
|
|
||||||
VAR_FN(32, 64, 16, 11);
|
|
||||||
VAR_FN(32, 32, 16, 10);
|
|
||||||
VAR_FN(32, 16, 16, 9);
|
|
||||||
VAR_FN(16, 32, 16, 9);
|
|
||||||
VAR_FN(16, 16, 16, 8);
|
|
||||||
VAR_FN(16, 8, 8, 7);
|
|
||||||
VAR_FN(8, 16, 8, 7);
|
|
||||||
VAR_FN(8, 8, 8, 6);
|
|
||||||
|
|
||||||
#undef VAR_FN
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
|
|
||||||
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
|
|
||||||
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
|
|
||||||
sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
|
|
||||||
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
|
|
||||||
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
|
|
||||||
const uint8_t *ref8, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
uint16_t *src = CONVERT_TO_SHORTPTR(src8);
|
|
||||||
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
|
||||||
highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
|
|
||||||
sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
#define DECL(w, opt) \
|
#define DECL(w, opt) \
|
||||||
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
|
int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
|
||||||
ptrdiff_t src_stride, \
|
ptrdiff_t src_stride, \
|
||||||
|
@ -13,18 +13,6 @@
|
|||||||
#include "vp9/encoder/vp9_variance.h"
|
#include "vp9/encoder/vp9_variance.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum);
|
|
||||||
|
|
||||||
void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum);
|
|
||||||
|
|
||||||
void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum);
|
|
||||||
|
|
||||||
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
|
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
|
||||||
int x_offset, int y_offset,
|
int x_offset, int y_offset,
|
||||||
const uint8_t *dst, int dst_stride,
|
const uint8_t *dst, int dst_stride,
|
||||||
@ -42,81 +30,6 @@ unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
|
|||||||
int height,
|
int height,
|
||||||
unsigned int *sseptr);
|
unsigned int *sseptr);
|
||||||
|
|
||||||
static void variance_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
int w, int h, unsigned int *sse, int *sum,
|
|
||||||
get_var_avx2 var_fn, int block_size) {
|
|
||||||
int i, j;
|
|
||||||
|
|
||||||
*sse = 0;
|
|
||||||
*sum = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < h; i += 16) {
|
|
||||||
for (j = 0; j < w; j += block_size) {
|
|
||||||
unsigned int sse0;
|
|
||||||
int sum0;
|
|
||||||
var_fn(&src[src_stride * i + j], src_stride,
|
|
||||||
&ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
|
|
||||||
*sse += sse0;
|
|
||||||
*sum += sum0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
|
|
||||||
sse, &sum, vp9_get16x16var_avx2, 16);
|
|
||||||
return *sse - (((unsigned int)sum * sum) >> 8);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
|
|
||||||
return *sse;
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
|
|
||||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
|
||||||
return *sse - (((int64_t)sum * sum) >> 9);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
|
|
||||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
|
||||||
return *sse - (((int64_t)sum * sum) >> 10);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
|
|
||||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
|
||||||
return *sse - (((int64_t)sum * sum) >> 12);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse) {
|
|
||||||
int sum;
|
|
||||||
variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
|
|
||||||
sse, &sum, vp9_get32x32var_avx2, 32);
|
|
||||||
return *sse - (((int64_t)sum * sum) >> 11);
|
|
||||||
}
|
|
||||||
|
|
||||||
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
|
unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
|
||||||
int src_stride,
|
int src_stride,
|
||||||
int x_offset,
|
int x_offset,
|
||||||
|
@ -16,299 +16,6 @@
|
|||||||
#include "vp9/encoder/vp9_variance.h"
|
#include "vp9/encoder/vp9_variance.h"
|
||||||
#include "vpx_ports/mem.h"
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
typedef void (*variance_fn_t)(const unsigned char *src, int src_stride,
|
|
||||||
const unsigned char *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum);
|
|
||||||
|
|
||||||
unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
|
|
||||||
__m128i vsum = _mm_setzero_si128();
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < 32; ++i) {
|
|
||||||
const __m128i v = _mm_loadu_si128((const __m128i *)src);
|
|
||||||
vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
|
|
||||||
src += 8;
|
|
||||||
}
|
|
||||||
|
|
||||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
|
|
||||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
|
|
||||||
return _mm_cvtsi128_si32(vsum);
|
|
||||||
}
|
|
||||||
|
|
||||||
#define READ64(p, stride, i) \
|
|
||||||
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
|
|
||||||
_mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
|
|
||||||
|
|
||||||
static void get4x4var_sse2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum) {
|
|
||||||
const __m128i zero = _mm_setzero_si128();
|
|
||||||
const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
|
|
||||||
const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
|
|
||||||
const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
|
|
||||||
const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
|
|
||||||
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
|
|
||||||
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
|
|
||||||
|
|
||||||
// sum
|
|
||||||
__m128i vsum = _mm_add_epi16(diff0, diff1);
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
|
|
||||||
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
|
|
||||||
|
|
||||||
// sse
|
|
||||||
vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
|
|
||||||
_mm_madd_epi16(diff1, diff1));
|
|
||||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
|
|
||||||
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
|
|
||||||
*sse = _mm_cvtsi128_si32(vsum);
|
|
||||||
}
|
|
||||||
|
|
||||||
void vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
|
|
||||||
const uint8_t *ref, int ref_stride,
|
|
||||||
unsigned int *sse, int *sum) {
|
|
||||||
const __m128i zero = _mm_setzero_si128();
|
|
||||||
__m128i vsum = _mm_setzero_si128();
|
|
||||||
__m128i vsse = _mm_setzero_si128();
|
|
||||||
int i;
|
|
||||||
|
|
||||||
for (i = 0; i < 8; i += 2) {
|
|
||||||
const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
|
|
||||||
(const __m128i *)(src + i * src_stride)), zero);
|
|
||||||
const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
|
|
||||||
(const __m128i *)(ref + i * ref_stride)), zero);
|
|
||||||
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
|
|
||||||
|
|
||||||
const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
|
|
||||||
(const __m128i *)(src + (i + 1) * src_stride)), zero);
|
|
||||||
const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
|
|
||||||
(const __m128i *)(ref + (i + 1) * ref_stride)), zero);
|
|
||||||
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
|
|
||||||
|
|
||||||
vsum = _mm_add_epi16(vsum, diff0);
|
|
||||||
vsum = _mm_add_epi16(vsum, diff1);
|
|
||||||
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
|
|
||||||
vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
|
|
||||||
}
|
|
||||||
|
|
||||||
// sum
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
|
|
||||||
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
|
|
||||||
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
|
|
||||||
|
|
||||||
// sse
|
|
||||||
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
|
|
||||||
vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
|
|
||||||
*sse = _mm_cvtsi128_si32(vsse);
|
|
||||||
}

void vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
         (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}
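
/* For 16x16 a fully packed reduction would overflow: 256 differences of up
 * to 255 give a worst-case sum of 65280, which does not fit in int16_t.
 * After the two shifts above each 16-bit lane holds at most 128 differences
 * (<= 32640), so the last step extracts two lanes and adds them in 32-bit
 * C arithmetic instead of doing one more packed 16-bit add.
 */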

static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          variance_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}
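
/* variance_sse2() simply tiles a w x h block with block_size x block_size
 * kernel calls and accumulates their partial results; e.g. a 32x32 block is
 * four 16x16 calls, as in the wrappers below. A hedged usage sketch:
 *
 *   unsigned int sse;
 *   int sum;
 *   variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
 *                 &sse, &sum, vp9_get16x16var_sse2, 16);
 */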

unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 4);
}

unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 6);
}

unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                sse, &sum, vp9_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 8);
}
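
/* All of these wrappers compute variance = sse - sum^2 / (w * h); the shift
 * count is log2(w * h), e.g. >> 8 for 16x16. Up to 16x16 the square fits in
 * 32 bits (|sum| <= 65280, so sum^2 < 2^32), but for 32x32 and larger |sum|
 * can reach 32 * 32 * 255 = 261120 and its square no longer fits, hence the
 * (int64_t) cast in the wrappers below.
 */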

unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 10);
}

unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
                sse, &sum, vp9_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt) \
int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
@ -102,13 +102,11 @@ VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h

VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
endif
363
vpx_dsp/arm/variance_media.asm
Normal file
@ -0,0 +1,363 @@
;
|
||||||
|
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||||
|
;
|
||||||
|
; Use of this source code is governed by a BSD-style license
|
||||||
|
; that can be found in the LICENSE file in the root of the source
|
||||||
|
; tree. An additional intellectual property rights grant can be found
|
||||||
|
; in the file PATENTS. All contributing project authors may
|
||||||
|
; be found in the AUTHORS file in the root of the source tree.
|
||||||
|
;
|
||||||
|
|
||||||
|
|
||||||
|
EXPORT |vpx_variance16x16_media|
|
||||||
|
EXPORT |vpx_variance8x8_media|
|
||||||
|
EXPORT |vpx_mse16x16_media|
|
||||||
|
|
||||||
|
ARM
|
||||||
|
REQUIRE8
|
||||||
|
PRESERVE8
|
||||||
|
|
||||||
|
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||||
|
|
||||||
|
; r0 unsigned char *src_ptr
|
||||||
|
; r1 int source_stride
|
||||||
|
; r2 unsigned char *ref_ptr
|
||||||
|
; r3 int recon_stride
|
||||||
|
; stack unsigned int *sse
|
||||||
|
|vpx_variance16x16_media| PROC
|
||||||
|
|
||||||
|
stmfd sp!, {r4-r12, lr}
|
||||||
|
|
||||||
|
pld [r0, r1, lsl #0]
|
||||||
|
pld [r2, r3, lsl #0]
|
||||||
|
|
||||||
|
mov r8, #0 ; initialize sum = 0
|
||||||
|
mov r11, #0 ; initialize sse = 0
|
||||||
|
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||||
|
|
||||||
|
loop16x16
|
||||||
|
; 1st 4 pixels
|
||||||
|
ldr r4, [r0, #0] ; load 4 src pixels
|
||||||
|
ldr r5, [r2, #0] ; load 4 ref pixels
|
||||||
|
|
||||||
|
mov lr, #0 ; constant zero
|
||||||
|
|
||||||
|
usub8 r6, r4, r5 ; calculate difference
|
||||||
|
pld [r0, r1, lsl #1]
|
||||||
|
sel r7, r6, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||||
|
pld [r2, r3, lsl #1]
|
||||||
|
sel r6, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||||
|
orr r6, r6, r7 ; differences of all 4 pixels
|
||||||
|
; calculate total sum
|
||||||
|
adds r8, r8, r4 ; add positive differences to sum
|
||||||
|
subs r8, r8, r5 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 2nd 4 pixels
|
||||||
|
ldr r4, [r0, #4] ; load 4 src pixels
|
||||||
|
ldr r5, [r2, #4] ; load 4 ref pixels
|
||||||
|
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r6, r4, r5 ; calculate difference
|
||||||
|
sel r7, r6, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||||
|
sel r6, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||||
|
orr r6, r6, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
; calculate total sum
|
||||||
|
add r8, r8, r4 ; add positive differences to sum
|
||||||
|
sub r8, r8, r5 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 3rd 4 pixels
|
||||||
|
ldr r4, [r0, #8] ; load 4 src pixels
|
||||||
|
ldr r5, [r2, #8] ; load 4 ref pixels
|
||||||
|
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r6, r4, r5 ; calculate difference
|
||||||
|
sel r7, r6, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||||
|
sel r6, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||||
|
orr r6, r6, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
; calculate total sum
|
||||||
|
add r8, r8, r4 ; add positive differences to sum
|
||||||
|
sub r8, r8, r5 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 4th 4 pixels
|
||||||
|
ldr r4, [r0, #12] ; load 4 src pixels
|
||||||
|
ldr r5, [r2, #12] ; load 4 ref pixels
|
||||||
|
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r6, r4, r5 ; calculate difference
|
||||||
|
add r0, r0, r1 ; set src_ptr to next row
|
||||||
|
sel r7, r6, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||||
|
add r2, r2, r3 ; set dst_ptr to next row
|
||||||
|
sel r6, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||||
|
orr r6, r6, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
; calculate total sum
|
||||||
|
add r8, r8, r4 ; add positive differences to sum
|
||||||
|
sub r8, r8, r5 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||||
|
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
|
||||||
|
subs r12, r12, #1
|
||||||
|
|
||||||
|
bne loop16x16
|
||||||
|
|
||||||
|
; return stuff
|
||||||
|
ldr r6, [sp, #40] ; get address of sse
|
||||||
|
mul r0, r8, r8 ; sum * sum
|
||||||
|
str r11, [r6] ; store sse
|
||||||
|
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
|
||||||
|
|
||||||
|
ldmfd sp!, {r4-r12, pc}
|
||||||
|
|
||||||
|
ENDP
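
; A note on the media (ARMv6 SIMD) idiom used above: usub8 subtracts the four
; bytes in parallel and sets the APSR GE flags per byte, and sel then keeps
; only the bytes whose subtraction did not borrow, so one usub8/sel pair
; isolates the positive differences and the reversed pair isolates the
; negative ones. usad8 against zero turns each selected register into a sum
; of byte values, and uxtb16 + smlad squares and accumulates two 16-bit
; differences per instruction for the sse term.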
|
||||||
|
|
||||||
|
END
|
||||||
|
|
||||||
|
|
||||||
|
; r0 unsigned char *src_ptr
|
||||||
|
; r1 int source_stride
|
||||||
|
; r2 unsigned char *ref_ptr
|
||||||
|
; r3 int recon_stride
|
||||||
|
; stack unsigned int *sse
|
||||||
|
|vpx_variance8x8_media| PROC
|
||||||
|
|
||||||
|
push {r4-r10, lr}
|
||||||
|
|
||||||
|
pld [r0, r1, lsl #0]
|
||||||
|
pld [r2, r3, lsl #0]
|
||||||
|
|
||||||
|
mov r12, #8 ; set loop counter to 8 (=block height)
|
||||||
|
mov r4, #0 ; initialize sum = 0
|
||||||
|
mov r5, #0 ; initialize sse = 0
|
||||||
|
|
||||||
|
loop8x8
|
||||||
|
; 1st 4 pixels
|
||||||
|
ldr r6, [r0, #0x0] ; load 4 src pixels
|
||||||
|
ldr r7, [r2, #0x0] ; load 4 ref pixels
|
||||||
|
|
||||||
|
mov lr, #0 ; constant zero
|
||||||
|
|
||||||
|
usub8 r8, r6, r7 ; calculate difference
|
||||||
|
pld [r0, r1, lsl #1]
|
||||||
|
sel r10, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r7, r6 ; calculate difference with reversed operands
|
||||||
|
pld [r2, r3, lsl #1]
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r6, r10, lr ; calculate sum of positive differences
|
||||||
|
usad8 r7, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r10 ; differences of all 4 pixels
|
||||||
|
; calculate total sum
|
||||||
|
add r4, r4, r6 ; add positive differences to sum
|
||||||
|
sub r4, r4, r7 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r7, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 2nd 4 pixels
|
||||||
|
ldr r6, [r0, #0x4] ; load 4 src pixels
|
||||||
|
ldr r7, [r2, #0x4] ; load 4 ref pixels
|
||||||
|
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r8, r6, r7 ; calculate difference
|
||||||
|
add r0, r0, r1 ; set src_ptr to next row
|
||||||
|
sel r10, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r7, r6 ; calculate difference with reversed operands
|
||||||
|
add r2, r2, r3 ; set dst_ptr to next row
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r6, r10, lr ; calculate sum of positive differences
|
||||||
|
usad8 r7, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r10 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
; calculate total sum
|
||||||
|
add r4, r4, r6 ; add positive differences to sum
|
||||||
|
sub r4, r4, r7 ; subtract negative differences from sum
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r7, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
|
||||||
|
subs r12, r12, #1 ; next row
|
||||||
|
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
bne loop8x8
|
||||||
|
|
||||||
|
; return stuff
|
||||||
|
ldr r8, [sp, #32] ; get address of sse
|
||||||
|
mul r1, r4, r4 ; sum * sum
|
||||||
|
str r5, [r8] ; store sse
|
||||||
|
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
|
||||||
|
|
||||||
|
pop {r4-r10, pc}
|
||||||
|
|
||||||
|
ENDP
|
||||||
|
|
||||||
|
END
|
||||||
|
|
||||||
|
; r0 unsigned char *src_ptr
|
||||||
|
; r1 int source_stride
|
||||||
|
; r2 unsigned char *ref_ptr
|
||||||
|
; r3 int recon_stride
|
||||||
|
; stack unsigned int *sse
|
||||||
|
;
|
||||||
|
;note: Based on vpx_variance16x16_media. In this function, sum is never used.
|
||||||
|
; So, we can remove this part of the calculation.
|
||||||
|
|
||||||
|
|vpx_mse16x16_media| PROC
|
||||||
|
|
||||||
|
push {r4-r9, lr}
|
||||||
|
|
||||||
|
pld [r0, r1, lsl #0]
|
||||||
|
pld [r2, r3, lsl #0]
|
||||||
|
|
||||||
|
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||||
|
mov r4, #0 ; initialize sse = 0
|
||||||
|
|
||||||
|
loopmse
|
||||||
|
; 1st 4 pixels
|
||||||
|
ldr r5, [r0, #0x0] ; load 4 src pixels
|
||||||
|
ldr r6, [r2, #0x0] ; load 4 ref pixels
|
||||||
|
|
||||||
|
mov lr, #0 ; constant zero
|
||||||
|
|
||||||
|
usub8 r8, r5, r6 ; calculate difference
|
||||||
|
pld [r0, r1, lsl #1]
|
||||||
|
sel r7, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||||
|
pld [r2, r3, lsl #1]
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
ldr r5, [r0, #0x4] ; load 4 src pixels
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 2nd 4 pixels
|
||||||
|
ldr r6, [r2, #0x4] ; load 4 ref pixels
|
||||||
|
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r8, r5, r6 ; calculate difference
|
||||||
|
sel r7, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r7 ; differences of all 4 pixels
|
||||||
|
ldr r5, [r0, #0x8] ; load 4 src pixels
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 3rd 4 pixels
|
||||||
|
ldr r6, [r2, #0x8] ; load 4 ref pixels
|
||||||
|
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r8, r5, r6 ; calculate difference
|
||||||
|
sel r7, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
ldr r5, [r0, #0xc] ; load 4 src pixels
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||||
|
|
||||||
|
; 4th 4 pixels
|
||||||
|
ldr r6, [r2, #0xc] ; load 4 ref pixels
|
||||||
|
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
usub8 r8, r5, r6 ; calculate difference
|
||||||
|
add r0, r0, r1 ; set src_ptr to next row
|
||||||
|
sel r7, r8, lr ; select bytes with positive difference
|
||||||
|
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||||
|
add r2, r2, r3 ; set dst_ptr to next row
|
||||||
|
sel r8, r9, lr ; select bytes with negative difference
|
||||||
|
|
||||||
|
; calculate partial sums
|
||||||
|
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||||
|
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||||
|
orr r8, r8, r7 ; differences of all 4 pixels
|
||||||
|
|
||||||
|
subs r12, r12, #1 ; next row
|
||||||
|
|
||||||
|
; calculate sse
|
||||||
|
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||||
|
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||||
|
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||||
|
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||||
|
|
||||||
|
bne loopmse
|
||||||
|
|
||||||
|
; return stuff
|
||||||
|
ldr r1, [sp, #28] ; get address of sse
|
||||||
|
mov r0, r4 ; return sse
|
||||||
|
str r4, [r1] ; store sse
|
||||||
|
|
||||||
|
pop {r4-r9, pc}
|
||||||
|
|
||||||
|
ENDP
|
||||||
|
|
||||||
|
END
|
417
vpx_dsp/arm/variance_neon.c
Normal file
@ -0,0 +1,417 @@
/*
|
||||||
|
* Copyright (c) 2014 The WebM project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
#include "./vpx_config.h"
|
||||||
|
|
||||||
|
#include "vpx/vpx_integer.h"
|
||||||
|
|
||||||
|
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
|
||||||
|
const int32x4_t a = vpaddlq_s16(v_16x8);
|
||||||
|
const int64x2_t b = vpaddlq_s32(a);
|
||||||
|
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
|
||||||
|
vreinterpret_s32_s64(vget_high_s64(b)));
|
||||||
|
return vget_lane_s32(c, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
|
||||||
|
const int64x2_t b = vpaddlq_s32(v_32x4);
|
||||||
|
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
|
||||||
|
vreinterpret_s32_s64(vget_high_s64(b)));
|
||||||
|
return vget_lane_s32(c, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// w * h must be less than 2048 or local variable v_sum may overflow.
|
||||||
|
static void variance_neon_w8(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
int w, int h, uint32_t *sse, int *sum) {
|
||||||
|
int i, j;
|
||||||
|
int16x8_t v_sum = vdupq_n_s16(0);
|
||||||
|
int32x4_t v_sse_lo = vdupq_n_s32(0);
|
||||||
|
int32x4_t v_sse_hi = vdupq_n_s32(0);
|
||||||
|
|
||||||
|
for (i = 0; i < h; ++i) {
|
||||||
|
for (j = 0; j < w; j += 8) {
|
||||||
|
const uint8x8_t v_a = vld1_u8(&a[j]);
|
||||||
|
const uint8x8_t v_b = vld1_u8(&b[j]);
|
||||||
|
const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
|
||||||
|
const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
|
||||||
|
v_sum = vaddq_s16(v_sum, sv_diff);
|
||||||
|
v_sse_lo = vmlal_s16(v_sse_lo,
|
||||||
|
vget_low_s16(sv_diff),
|
||||||
|
vget_low_s16(sv_diff));
|
||||||
|
v_sse_hi = vmlal_s16(v_sse_hi,
|
||||||
|
vget_high_s16(sv_diff),
|
||||||
|
vget_high_s16(sv_diff));
|
||||||
|
}
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
|
||||||
|
*sum = horizontal_add_s16x8(v_sum);
|
||||||
|
*sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
|
||||||
|
}
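
/* Where the "w * h must be less than 2048" bound comes from: v_sum is eight
 * int16_t lanes and each lane accumulates (w * h) / 8 signed differences of
 * magnitude at most 255. For w * h = 1024 that is 128 * 255 = 32640, still
 * inside int16_t; doubling it could wrap. This is why the 32x64, 64x32 and
 * 64x64 wrappers below split the block into passes of at most 1024 pixels.
 */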
|
||||||
|
|
||||||
|
void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse, int *sum) {
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
|
||||||
|
}
|
||||||
|
|
||||||
|
void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse, int *sum) {
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum;
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
|
||||||
|
return *sse - (((int64_t)sum * sum) >> 6); // >> 6 = / 8 * 8
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum;
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
|
||||||
|
return *sse - (((int64_t)sum * sum) >> 8); // >> 8 = / 16 * 16
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum;
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
|
||||||
|
return *sse - (((int64_t)sum * sum) >> 10); // >> 10 = / 32 * 32
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum1, sum2;
|
||||||
|
uint32_t sse1, sse2;
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
|
||||||
|
variance_neon_w8(a + (32 * a_stride), a_stride,
|
||||||
|
b + (32 * b_stride), b_stride, 32, 32,
|
||||||
|
&sse2, &sum2);
|
||||||
|
*sse = sse1 + sse2;
|
||||||
|
sum1 += sum2;
|
||||||
|
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum1, sum2;
|
||||||
|
uint32_t sse1, sse2;
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
|
||||||
|
variance_neon_w8(a + (16 * a_stride), a_stride,
|
||||||
|
b + (16 * b_stride), b_stride, 64, 16,
|
||||||
|
&sse2, &sum2);
|
||||||
|
*sse = sse1 + sse2;
|
||||||
|
sum1 += sum2;
|
||||||
|
return *sse - (((int64_t)sum1 * sum1) >> 11); // >> 11 = / 32 * 64
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int sum1, sum2;
|
||||||
|
uint32_t sse1, sse2;
|
||||||
|
|
||||||
|
variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
|
||||||
|
variance_neon_w8(a + (16 * a_stride), a_stride,
|
||||||
|
b + (16 * b_stride), b_stride, 64, 16,
|
||||||
|
&sse2, &sum2);
|
||||||
|
sse1 += sse2;
|
||||||
|
sum1 += sum2;
|
||||||
|
|
||||||
|
variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
|
||||||
|
b + (16 * 2 * b_stride), b_stride,
|
||||||
|
64, 16, &sse2, &sum2);
|
||||||
|
sse1 += sse2;
|
||||||
|
sum1 += sum2;
|
||||||
|
|
||||||
|
variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
|
||||||
|
b + (16 * 3 * b_stride), b_stride,
|
||||||
|
64, 16, &sse2, &sum2);
|
||||||
|
*sse = sse1 + sse2;
|
||||||
|
sum1 += sum2;
|
||||||
|
return *sse - (((int64_t)sum1 * sum1) >> 12); // >> 12 = / 64 * 64
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance16x8_neon(
|
||||||
|
const unsigned char *src_ptr,
|
||||||
|
int source_stride,
|
||||||
|
const unsigned char *ref_ptr,
|
||||||
|
int recon_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int i;
|
||||||
|
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
||||||
|
uint32x2_t d0u32, d10u32;
|
||||||
|
int64x1_t d0s64, d1s64;
|
||||||
|
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
||||||
|
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
||||||
|
int32x4_t q8s32, q9s32, q10s32;
|
||||||
|
int64x2_t q0s64, q1s64, q5s64;
|
||||||
|
|
||||||
|
q8s32 = vdupq_n_s32(0);
|
||||||
|
q9s32 = vdupq_n_s32(0);
|
||||||
|
q10s32 = vdupq_n_s32(0);
|
||||||
|
|
||||||
|
for (i = 0; i < 4; i++) {
|
||||||
|
q0u8 = vld1q_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
q1u8 = vld1q_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
__builtin_prefetch(src_ptr);
|
||||||
|
|
||||||
|
q2u8 = vld1q_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
q3u8 = vld1q_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
__builtin_prefetch(ref_ptr);
|
||||||
|
|
||||||
|
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
||||||
|
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
||||||
|
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
||||||
|
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
||||||
|
|
||||||
|
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
||||||
|
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
||||||
|
|
||||||
|
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
||||||
|
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
||||||
|
|
||||||
|
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
||||||
|
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
|
||||||
|
|
||||||
|
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
||||||
|
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
||||||
|
}
|
||||||
|
|
||||||
|
q10s32 = vaddq_s32(q10s32, q9s32);
|
||||||
|
q0s64 = vpaddlq_s32(q8s32);
|
||||||
|
q1s64 = vpaddlq_s32(q10s32);
|
||||||
|
|
||||||
|
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
||||||
|
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||||
|
|
||||||
|
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
||||||
|
vreinterpret_s32_s64(d0s64));
|
||||||
|
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
||||||
|
|
||||||
|
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
|
||||||
|
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
||||||
|
|
||||||
|
return vget_lane_u32(d0u32, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_variance8x16_neon(
|
||||||
|
const unsigned char *src_ptr,
|
||||||
|
int source_stride,
|
||||||
|
const unsigned char *ref_ptr,
|
||||||
|
int recon_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int i;
|
||||||
|
uint8x8_t d0u8, d2u8, d4u8, d6u8;
|
||||||
|
int16x4_t d22s16, d23s16, d24s16, d25s16;
|
||||||
|
uint32x2_t d0u32, d10u32;
|
||||||
|
int64x1_t d0s64, d1s64;
|
||||||
|
uint16x8_t q11u16, q12u16;
|
||||||
|
int32x4_t q8s32, q9s32, q10s32;
|
||||||
|
int64x2_t q0s64, q1s64, q5s64;
|
||||||
|
|
||||||
|
q8s32 = vdupq_n_s32(0);
|
||||||
|
q9s32 = vdupq_n_s32(0);
|
||||||
|
q10s32 = vdupq_n_s32(0);
|
||||||
|
|
||||||
|
for (i = 0; i < 8; i++) {
|
||||||
|
d0u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
d2u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
__builtin_prefetch(src_ptr);
|
||||||
|
|
||||||
|
d4u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
d6u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
__builtin_prefetch(ref_ptr);
|
||||||
|
|
||||||
|
q11u16 = vsubl_u8(d0u8, d4u8);
|
||||||
|
q12u16 = vsubl_u8(d2u8, d6u8);
|
||||||
|
|
||||||
|
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
||||||
|
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
|
||||||
|
|
||||||
|
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
||||||
|
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
||||||
|
q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
||||||
|
}
|
||||||
|
|
||||||
|
q10s32 = vaddq_s32(q10s32, q9s32);
|
||||||
|
q0s64 = vpaddlq_s32(q8s32);
|
||||||
|
q1s64 = vpaddlq_s32(q10s32);
|
||||||
|
|
||||||
|
d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
|
||||||
|
d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||||
|
|
||||||
|
q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
|
||||||
|
vreinterpret_s32_s64(d0s64));
|
||||||
|
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
|
||||||
|
|
||||||
|
d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
|
||||||
|
d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
|
||||||
|
|
||||||
|
return vget_lane_u32(d0u32, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_mse16x16_neon(
|
||||||
|
const unsigned char *src_ptr,
|
||||||
|
int source_stride,
|
||||||
|
const unsigned char *ref_ptr,
|
||||||
|
int recon_stride,
|
||||||
|
unsigned int *sse) {
|
||||||
|
int i;
|
||||||
|
int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
|
||||||
|
int64x1_t d0s64;
|
||||||
|
uint8x16_t q0u8, q1u8, q2u8, q3u8;
|
||||||
|
int32x4_t q7s32, q8s32, q9s32, q10s32;
|
||||||
|
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
||||||
|
int64x2_t q1s64;
|
||||||
|
|
||||||
|
q7s32 = vdupq_n_s32(0);
|
||||||
|
q8s32 = vdupq_n_s32(0);
|
||||||
|
q9s32 = vdupq_n_s32(0);
|
||||||
|
q10s32 = vdupq_n_s32(0);
|
||||||
|
|
||||||
|
for (i = 0; i < 8; i++) { // mse16x16_neon_loop
|
||||||
|
q0u8 = vld1q_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
q1u8 = vld1q_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
q2u8 = vld1q_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
q3u8 = vld1q_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
|
||||||
|
q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
|
||||||
|
q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
|
||||||
|
q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
|
||||||
|
q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
|
||||||
|
|
||||||
|
d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
|
||||||
|
d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
|
||||||
|
q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
|
||||||
|
q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
|
||||||
|
|
||||||
|
d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
|
||||||
|
d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
|
||||||
|
|
||||||
|
d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
|
||||||
|
d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
|
||||||
|
q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
|
||||||
|
q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
|
||||||
|
|
||||||
|
d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
|
||||||
|
d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
|
||||||
|
q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
|
||||||
|
q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
|
||||||
|
}
|
||||||
|
|
||||||
|
q7s32 = vaddq_s32(q7s32, q8s32);
|
||||||
|
q9s32 = vaddq_s32(q9s32, q10s32);
|
||||||
|
q10s32 = vaddq_s32(q7s32, q9s32);
|
||||||
|
|
||||||
|
q1s64 = vpaddlq_s32(q10s32);
|
||||||
|
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||||
|
|
||||||
|
vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
|
||||||
|
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_get4x4sse_cs_neon(
|
||||||
|
const unsigned char *src_ptr,
|
||||||
|
int source_stride,
|
||||||
|
const unsigned char *ref_ptr,
|
||||||
|
int recon_stride) {
|
||||||
|
int16x4_t d22s16, d24s16, d26s16, d28s16;
|
||||||
|
int64x1_t d0s64;
|
||||||
|
uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
|
||||||
|
int32x4_t q7s32, q8s32, q9s32, q10s32;
|
||||||
|
uint16x8_t q11u16, q12u16, q13u16, q14u16;
|
||||||
|
int64x2_t q1s64;
|
||||||
|
|
||||||
|
d0u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
d4u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
d1u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
d5u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
d2u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
d6u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
d3u8 = vld1_u8(src_ptr);
|
||||||
|
src_ptr += source_stride;
|
||||||
|
d7u8 = vld1_u8(ref_ptr);
|
||||||
|
ref_ptr += recon_stride;
|
||||||
|
|
||||||
|
q11u16 = vsubl_u8(d0u8, d4u8);
|
||||||
|
q12u16 = vsubl_u8(d1u8, d5u8);
|
||||||
|
q13u16 = vsubl_u8(d2u8, d6u8);
|
||||||
|
q14u16 = vsubl_u8(d3u8, d7u8);
|
||||||
|
|
||||||
|
d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
|
||||||
|
d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
|
||||||
|
d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
|
||||||
|
d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
|
||||||
|
|
||||||
|
q7s32 = vmull_s16(d22s16, d22s16);
|
||||||
|
q8s32 = vmull_s16(d24s16, d24s16);
|
||||||
|
q9s32 = vmull_s16(d26s16, d26s16);
|
||||||
|
q10s32 = vmull_s16(d28s16, d28s16);
|
||||||
|
|
||||||
|
q7s32 = vaddq_s32(q7s32, q8s32);
|
||||||
|
q9s32 = vaddq_s32(q9s32, q10s32);
|
||||||
|
q9s32 = vaddq_s32(q7s32, q9s32);
|
||||||
|
|
||||||
|
q1s64 = vpaddlq_s32(q9s32);
|
||||||
|
d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
|
||||||
|
|
||||||
|
return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
|
||||||
|
}
|
@ -33,6 +33,7 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride,
  return sad;
}

// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up.
/* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
 * The function averages every corresponding element of the buffers and stores
 * the value in a third buffer, comp_pred.
306
vpx_dsp/variance.c
Normal file
@ -0,0 +1,306 @@
/*
|
||||||
|
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "./vpx_config.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
|
||||||
|
#include "vpx_ports/mem.h"
|
||||||
|
#include "vpx/vpx_integer.h"
|
||||||
|
|
||||||
|
unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int a_stride,
|
||||||
|
const unsigned char *b, int b_stride) {
|
||||||
|
int distortion = 0;
|
||||||
|
int r, c;
|
||||||
|
|
||||||
|
for (r = 0; r < 4; r++) {
|
||||||
|
for (c = 0; c < 4; c++) {
|
||||||
|
int diff = a[c] - b[c];
|
||||||
|
distortion += diff * diff;
|
||||||
|
}
|
||||||
|
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
|
||||||
|
return distortion;
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned int vpx_get_mb_ss_c(const int16_t *a) {
|
||||||
|
unsigned int i, sum = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < 256; ++i) {
|
||||||
|
sum += a[i] * a[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void variance(const uint8_t *a, int a_stride,
|
||||||
|
const uint8_t *b, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
*sum = 0;
|
||||||
|
*sse = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < h; i++) {
|
||||||
|
for (j = 0; j < w; j++) {
|
||||||
|
const int diff = a[j] - b[j];
|
||||||
|
*sum += diff;
|
||||||
|
*sse += diff * diff;
|
||||||
|
}
|
||||||
|
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define VAR(W, H) \
|
||||||
|
unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
|
||||||
|
const uint8_t *b, int b_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||||
|
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||||
|
}
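
/* As an illustration, VAR(4, 4) expands to (modulo line breaks):
 *
 *   unsigned int vpx_variance4x4_c(const uint8_t *a, int a_stride,
 *                                  const uint8_t *b, int b_stride,
 *                                  unsigned int *sse) {
 *     int sum;
 *     variance(a, a_stride, b, b_stride, 4, 4, sse, &sum);
 *     return *sse - (((int64_t)sum * sum) / (4 * 4));
 *   }
 */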
|
||||||
|
|
||||||
|
/* Identical to the variance call except it takes an additional parameter, sum,
|
||||||
|
* and returns that value using pass-by-reference instead of returning
|
||||||
|
* sse - sum^2 / w*h
|
||||||
|
*/
|
||||||
|
#define GET_VAR(W, H) \
|
||||||
|
void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
|
||||||
|
const uint8_t *b, int b_stride, \
|
||||||
|
unsigned int *sse, int *sum) { \
|
||||||
|
variance(a, a_stride, b, b_stride, W, H, sse, sum); \
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Identical to the variance call except it does not calculate the
|
||||||
|
* sse - sum^2 / w*h and returns sse in addition to modifying the passed in
|
||||||
|
* variable.
|
||||||
|
*/
|
||||||
|
#define MSE(W, H) \
|
||||||
|
unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
|
||||||
|
const uint8_t *b, int b_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||||
|
return *sse; \
|
||||||
|
}
|
||||||
|
|
||||||
|
VAR(64, 64)
|
||||||
|
VAR(64, 32)
|
||||||
|
VAR(32, 64)
|
||||||
|
VAR(32, 32)
|
||||||
|
VAR(32, 16)
|
||||||
|
VAR(16, 32)
|
||||||
|
VAR(16, 16)
|
||||||
|
VAR(16, 8)
|
||||||
|
VAR(8, 16)
|
||||||
|
VAR(8, 8)
|
||||||
|
VAR(8, 4)
|
||||||
|
VAR(4, 8)
|
||||||
|
VAR(4, 4)
|
||||||
|
|
||||||
|
GET_VAR(16, 16)
|
||||||
|
GET_VAR(8, 8)
|
||||||
|
|
||||||
|
MSE(16, 16)
|
||||||
|
MSE(16, 8)
|
||||||
|
MSE(8, 16)
|
||||||
|
MSE(8, 8)
|
||||||
|
|
||||||
|
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
|
||||||
|
int height, const uint8_t *ref, int ref_stride) {
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
for (i = 0; i < height; i++) {
|
||||||
|
for (j = 0; j < width; j++) {
|
||||||
|
const int tmp = pred[j] + ref[j];
|
||||||
|
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
|
||||||
|
}
|
||||||
|
comp_pred += width;
|
||||||
|
pred += width;
|
||||||
|
ref += ref_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#if CONFIG_VP9_HIGHBITDEPTH
|
||||||
|
static void highbd_variance64(const uint8_t *a8, int a_stride,
|
||||||
|
const uint8_t *b8, int b_stride,
|
||||||
|
int w, int h, uint64_t *sse, uint64_t *sum) {
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
|
||||||
|
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
|
||||||
|
*sum = 0;
|
||||||
|
*sse = 0;
|
||||||
|
|
||||||
|
for (i = 0; i < h; i++) {
|
||||||
|
for (j = 0; j < w; j++) {
|
||||||
|
const int diff = a[j] - b[j];
|
||||||
|
*sum += diff;
|
||||||
|
*sse += diff * diff;
|
||||||
|
}
|
||||||
|
a += a_stride;
|
||||||
|
b += b_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void highbd_8_variance(const uint8_t *a8, int a_stride,
|
||||||
|
const uint8_t *b8, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
uint64_t sse_long = 0;
|
||||||
|
uint64_t sum_long = 0;
|
||||||
|
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||||
|
*sse = (unsigned int)sse_long;
|
||||||
|
*sum = (int)sum_long;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void highbd_10_variance(const uint8_t *a8, int a_stride,
|
||||||
|
const uint8_t *b8, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
uint64_t sse_long = 0;
|
||||||
|
uint64_t sum_long = 0;
|
||||||
|
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||||
|
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
|
||||||
|
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void highbd_12_variance(const uint8_t *a8, int a_stride,
|
||||||
|
const uint8_t *b8, int b_stride,
|
||||||
|
int w, int h, unsigned int *sse, int *sum) {
|
||||||
|
uint64_t sse_long = 0;
|
||||||
|
uint64_t sum_long = 0;
|
||||||
|
highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
|
||||||
|
*sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
|
||||||
|
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
|
||||||
|
}
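
/* The rounding shifts normalize the high-bit-depth results back to the 8-bit
 * scale: 10-bit samples make each difference up to 4x larger, so sum is
 * divided by 2^2 and sse (a squared quantity) by 2^4; for 12-bit input the
 * factors become 2^4 and 2^8 respectively.
 */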
|
||||||
|
|
||||||
|
#define HIGHBD_VAR(W, H) \
|
||||||
|
unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
|
||||||
|
int a_stride, \
|
||||||
|
const uint8_t *b, \
|
||||||
|
int b_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||||
|
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
|
||||||
|
int a_stride, \
|
||||||
|
const uint8_t *b, \
|
||||||
|
int b_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||||
|
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
|
||||||
|
int a_stride, \
|
||||||
|
const uint8_t *b, \
|
||||||
|
int b_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
|
||||||
|
return *sse - (((int64_t)sum * sum) / (W * H)); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HIGHBD_GET_VAR(S) \
|
||||||
|
void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
||||||
|
const uint8_t *ref, int ref_stride, \
|
||||||
|
unsigned int *sse, int *sum) { \
|
||||||
|
highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
||||||
|
const uint8_t *ref, int ref_stride, \
|
||||||
|
unsigned int *sse, int *sum) { \
|
||||||
|
highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
|
||||||
|
const uint8_t *ref, int ref_stride, \
|
||||||
|
unsigned int *sse, int *sum) { \
|
||||||
|
highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define HIGHBD_MSE(W, H) \
|
||||||
|
unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
|
||||||
|
int src_stride, \
|
||||||
|
const uint8_t *ref, \
|
||||||
|
int ref_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
||||||
|
return *sse; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
|
||||||
|
int src_stride, \
|
||||||
|
const uint8_t *ref, \
|
||||||
|
int ref_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
||||||
|
return *sse; \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
|
||||||
|
int src_stride, \
|
||||||
|
const uint8_t *ref, \
|
||||||
|
int ref_stride, \
|
||||||
|
unsigned int *sse) { \
|
||||||
|
int sum; \
|
||||||
|
highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
|
||||||
|
return *sse; \
|
||||||
|
}
|
||||||
|
|
||||||
|
HIGHBD_GET_VAR(8)
|
||||||
|
HIGHBD_GET_VAR(16)
|
||||||
|
|
||||||
|
HIGHBD_MSE(16, 16)
|
||||||
|
HIGHBD_MSE(16, 8)
|
||||||
|
HIGHBD_MSE(8, 16)
|
||||||
|
HIGHBD_MSE(8, 8)
|
||||||
|
|
||||||
|
HIGHBD_VAR(64, 64)
|
||||||
|
HIGHBD_VAR(64, 32)
|
||||||
|
HIGHBD_VAR(32, 64)
|
||||||
|
HIGHBD_VAR(32, 32)
|
||||||
|
HIGHBD_VAR(32, 16)
|
||||||
|
HIGHBD_VAR(16, 32)
|
||||||
|
HIGHBD_VAR(16, 16)
|
||||||
|
HIGHBD_VAR(16, 8)
|
||||||
|
HIGHBD_VAR(8, 16)
|
||||||
|
HIGHBD_VAR(8, 8)
|
||||||
|
HIGHBD_VAR(8, 4)
|
||||||
|
HIGHBD_VAR(4, 8)
|
||||||
|
HIGHBD_VAR(4, 4)
|
||||||
|
|
||||||
|
void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
|
||||||
|
int width, int height, const uint8_t *ref8,
|
||||||
|
int ref_stride) {
|
||||||
|
int i, j;
|
||||||
|
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
|
||||||
|
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
|
||||||
|
for (i = 0; i < height; i++) {
|
||||||
|
for (j = 0; j < width; j++) {
|
||||||
|
const int tmp = pred[j] + ref[j];
|
||||||
|
comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
|
||||||
|
}
|
||||||
|
comp_pred += width;
|
||||||
|
pred += width;
|
||||||
|
ref += ref_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif // CONFIG_VP9_HIGHBITDEPTH
|
@ -17,6 +17,7 @@ DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c


DSP_SRCS-$(HAVE_MMX) += x86/sad_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm
@ -29,9 +30,28 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm

endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS

ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
DSP_SRCS-yes += variance.c

DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c

DSP_SRCS-$(HAVE_MMX) += x86/variance_mmx.c
DSP_SRCS-$(HAVE_MMX) += x86/variance_impl_mmx.asm
DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c

ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC

DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)

DSP_SRCS-yes += vpx_dsp_rtcd.c
@ -392,4 +392,212 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_ENCODERS

if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {

add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x64 sse2 avx2 neon/;

add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x32 sse2 avx2 neon/;

add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x64 sse2 neon/;

add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x32 sse2 avx2 neon/;

add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x16 sse2 avx2/;

add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x32 sse2/;

add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance16x8 mmx sse2 neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance8x16 mmx sse2 neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance8x8 mmx sse2 media neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance8x4 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance4x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_variance4x4 mmx sse2/;
|
||||||
|
|
||||||
|
|
||||||
|
add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
specialize qw/vpx_get16x16var sse2 avx2 neon/;
|
||||||
|
|
||||||
|
add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
specialize qw/vpx_get8x8var mmx sse2 neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_mse16x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_mse8x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_mse8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
|
||||||
|
specialize qw/vpx_get_mb_ss mmx sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
|
||||||
|
specialize qw/vpx_get4x4sse_cs neon/;
|
||||||
|
|
||||||
|
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
|
||||||
|
|
||||||
|
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance64x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance64x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance32x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance32x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance32x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance16x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance16x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance8x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_variance8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance64x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance64x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance32x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance32x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance32x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance16x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance16x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance8x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_variance8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance64x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance64x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance32x64 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance32x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance32x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance16x32 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance16x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance8x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_variance8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||||
|
|
||||||
|
add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
|
||||||
|
add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
|
||||||
|
add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_mse16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_8_mse8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_mse16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_10_mse8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_mse16x16 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||||
|
specialize qw/vpx_highbd_12_mse8x8 sse2/;
|
||||||
|
|
||||||
|
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
|
||||||
|
} # CONFIG_VP9_HIGHBITDEPTH
|
||||||
|
} # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
|
||||||
|
|
||||||
1;
|
1;
|
||||||
|
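Each add_proto line above declares one DSP function signature for the runtime CPU detection (RTCD) system, and each specialize line lists the optimized versions that the build may select for that symbol. As a hedged sketch of what the generated vpx_dsp_rtcd.h roughly contains for one of these prototypes (the real generated header varies per target and configuration, so the names below are assumptions beyond the _c/_sse2 suffix convention):

/* Illustrative only: per-implementation declarations plus, on x86,
 * a function pointer that setup code fills in from CPU feature flags. */
unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride,
                                 const uint8_t *ref_ptr, int ref_stride,
                                 unsigned int *sse);
unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse);
extern unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr,
                                         int source_stride,
                                         const uint8_t *ref_ptr,
                                         int ref_stride, unsigned int *sse);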
@@ -11,7 +11,7 @@

%include "vpx_ports/x86_abi_support.asm"

-;unsigned int vp9_highbd_calc16x16var_sse2
+;unsigned int vpx_highbd_calc16x16var_sse2
;(
;    unsigned char * src_ptr,
;    int source_stride,
@@ -20,8 +20,8 @@
;    unsigned int * SSE,
;    int * Sum
;)
-global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
+global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
-sym(vp9_highbd_calc16x16var_sse2):
+sym(vpx_highbd_calc16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
@@ -164,7 +164,7 @@ sym(vp9_highbd_calc16x16var_sse2):
    ret


-;unsigned int vp9_highbd_calc8x8var_sse2
+;unsigned int vpx_highbd_calc8x8var_sse2
;(
;    unsigned char * src_ptr,
;    int source_stride,
@@ -173,8 +173,8 @@ sym(vp9_highbd_calc16x16var_sse2):
;    unsigned int * SSE,
;    int * Sum
;)
-global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
+global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
-sym(vp9_highbd_calc8x8var_sse2):
+sym(vpx_highbd_calc8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
245  vpx_dsp/x86/highbd_variance_sse2.c  Normal file
@@ -0,0 +1,245 @@
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"

#include "vp9/encoder/vp9_variance.h"
#include "vpx_ports/mem.h"

typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
                                        const uint16_t *ref, int ref_stride,
                                        uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);

static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   int w, int h, uint32_t *sse, int *sum,
                                   high_variance_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int64_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
}

static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int64_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
}


#define HIGH_GET_VAR(S) \
void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                       const uint8_t *ref8, int ref_stride, \
                                       uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
} \
\
void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
} \
\
void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                          const uint8_t *ref8, int ref_stride, \
                                          uint32_t *sse, int *sum) { \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
                                     sse, sum); \
  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
}

HIGH_GET_VAR(16);
HIGH_GET_VAR(8);

#undef HIGH_GET_VAR

#define VAR_FN(w, h, block_size, shift) \
uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
                         vpx_highbd_calc##block_size##x##block_size##var_sse2, \
                         block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_10_variance_sse2( \
      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
} \
\
uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
    const uint8_t *src8, int src_stride, \
    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
  int sum; \
  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
  highbd_12_variance_sse2( \
      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
  return *sse - (((int64_t)sum * sum) >> shift); \
}

VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);

#undef VAR_FN

unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                        const uint8_t *ref8, int ref_stride,
                                        unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                         sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}

unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                      const uint8_t *ref8, int ref_stride,
                                      unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                         sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}

unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}
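Note how the 10- and 12-bit helpers above rescale the accumulated sum and SSE back to an 8-bit-equivalent range (sum by 2 or 4 bits, SSE by 4 or 8 bits) before the usual variance formula is applied, which keeps the 32-bit *sse result in range. A small self-contained check of that scaling, using the same assumed ROUND_POWER_OF_TWO definition as in the earlier sketch:

#include <stdint.h>
#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

int main(void) {
  /* A single 12-bit difference of 4000 contributes 4000 to sum and
   * 16,000,000 to SSE.  Scaling back to an 8-bit range uses sum >> 4 and
   * SSE >> 8, matching highbd_12_variance_sse2() above. */
  const int64_t sum = 4000;
  const uint64_t sse = 4000ULL * 4000ULL;
  printf("sum8 = %lld, sse8 = %llu\n",
         (long long)ROUND_POWER_OF_TWO(sum, 4),
         (unsigned long long)ROUND_POWER_OF_TWO(sse, 8));
  return 0;
}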
93  vpx_dsp/x86/variance_avx2.c  Normal file
@@ -0,0 +1,93 @@
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_dsp_rtcd.h"

typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);

void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum);

static void variance_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(&src[src_stride * i + j], src_stride,
             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}


unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
                sse, &sum, vpx_get16x16var_avx2, 16);
  return *sse - (((unsigned int)sum * sum) >> 8);
}

unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int sum;
  vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse;
}

unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 10);
}

unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vpx_get32x32var_avx2, 32);
  return *sse - (((int64_t)sum * sum) >> 11);
}
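All of these wrappers use the same identity: for an N-pixel block, variance = SSE - sum^2 / N, and because N = w * h is a power of two the division becomes a right shift by log2(w * h) (8 for 16x16, 10 for 32x32, 12 for 64x64). A plain-C reference sketch of that calculation follows; the function name is illustrative and not part of the library.

#include <stdint.h>

/* Reference (non-SIMD) variance: accumulate sum and SSE over the block,
 * then apply var = SSE - sum^2 / (w * h).  The ">> shift" in the SIMD
 * wrappers above is the same division, with w * h a power of two. */
static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];
      sum += diff;
      sse64 += (int64_t)diff * diff;
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sse64;
  return (unsigned int)(sse64 - ((sum * sum) / (w * h)));
}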
@@ -10,9 +10,9 @@

#include <immintrin.h>  // AVX2

-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"

-void vp9_get16x16var_avx2(const unsigned char *src_ptr,
+void vpx_get16x16var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
@@ -123,7 +123,7 @@ void vp9_get16x16var_avx2(const unsigned char *src_ptr,
  }
}

-void vp9_get32x32var_avx2(const unsigned char *src_ptr,
+void vpx_get32x32var_avx2(const unsigned char *src_ptr,
                          int source_stride,
                          const unsigned char *ref_ptr,
                          int recon_stride,
424  vpx_dsp/x86/variance_impl_mmx.asm  Normal file
@@ -0,0 +1,424 @@
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8
    ; end prolog

    mov         rax, arg(0)            ; src_ptr
    mov         rcx, 16
    pxor        mm4, mm4

.NEXTROW:
    movq        mm0, [rax]
    movq        mm1, [rax+8]
    movq        mm2, [rax+16]
    movq        mm3, [rax+24]
    pmaddwd     mm0, mm0
    pmaddwd     mm1, mm1
    pmaddwd     mm2, mm2
    pmaddwd     mm3, mm3

    paddd       mm4, mm0
    paddd       mm4, mm1
    paddd       mm4, mm2
    paddd       mm4, mm3

    add         rax, 32
    dec         rcx
    ja          .NEXTROW
    movq        QWORD PTR [rsp], mm4

    ;return sum[0]+sum[1];
    movsxd      rax, dword ptr [rsp]
    movsxd      rcx, dword ptr [rsp+4]
    add         rax, rcx

    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vpx_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
global sym(vpx_get8x8var_mmx) PRIVATE
sym(vpx_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16
    ; end prolog

    pxor        mm5, mm5               ; Blank mm5 (sum of differences)
    pxor        mm6, mm6               ; Blank mm6 (constant zero)
    pxor        mm7, mm7               ; Blank mm7 (sum of squares)

    mov         rax, arg(0)            ; [src_ptr]  Load base addresses
    mov         rbx, arg(2)            ; [ref_ptr]
    movsxd      rcx, dword ptr arg(1)  ; [source_stride]
    movsxd      rdx, dword ptr arg(3)  ; [recon_stride]

    ; Row 1
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7


    ; Row 2
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 3
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 4
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 5
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    ; movq      mm4, [rbx + rdx]
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 6
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 7
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movq        mm1, [rbx]             ; Copy eight bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Row 8
    movq        mm0, [rax]             ; Copy eight bytes to mm0
    movq        mm2, mm0               ; Take copies
    movq        mm3, mm1               ; Take copies

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    punpckhbw   mm2, mm6               ; unpack to higher precision
    punpckhbw   mm3, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    psubsw      mm2, mm3               ; A-B (high order) to MM2

    paddw       mm5, mm0               ; accumulate differences in mm5
    paddw       mm5, mm2               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    pmaddwd     mm2, mm2               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    paddd       mm7, mm0               ; accumulate in mm7
    paddd       mm7, mm2               ; accumulate in mm7

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
    movq        QWORD PTR [rsp], mm7   ; copy back accumulated results into normal memory
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx               ; XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx               ; XXSum
    mov         rsi, arg(4)            ; SSE
    mov         rdi, arg(5)            ; Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax               ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void
;vpx_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
global sym(vpx_get4x4var_mmx) PRIVATE
sym(vpx_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16
    ; end prolog

    pxor        mm5, mm5               ; Blank mm5 (sum of differences)
    pxor        mm6, mm6               ; Blank mm6 (constant zero)
    pxor        mm7, mm7               ; Blank mm7 (sum of squares)

    mov         rax, arg(0)            ; [src_ptr]  Load base addresses
    mov         rbx, arg(2)            ; [ref_ptr]
    movsxd      rcx, dword ptr arg(1)  ; [source_stride]
    movsxd      rdx, dword ptr arg(3)  ; [recon_stride]

    ; Row 1
    movd        mm0, [rax]             ; Copy four bytes to mm0
    movd        mm1, [rbx]             ; Copy four bytes to mm1
    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    paddw       mm5, mm0               ; accumulate differences in mm5
    pmaddwd     mm0, mm0               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movd        mm1, [rbx]             ; Copy four bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7

    ; Row 2
    movd        mm0, [rax]             ; Copy four bytes to mm0
    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    paddw       mm5, mm0               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movd        mm1, [rbx]             ; Copy four bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7

    ; Row 3
    movd        mm0, [rax]             ; Copy four bytes to mm0
    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0
    paddw       mm5, mm0               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    add         rbx, rdx               ; Inc pointer into ref data
    add         rax, rcx               ; Inc pointer into the new data
    movd        mm1, [rbx]             ; Copy four bytes to mm1
    paddd       mm7, mm0               ; accumulate in mm7

    ; Row 4
    movd        mm0, [rax]             ; Copy four bytes to mm0

    punpcklbw   mm0, mm6               ; unpack to higher precision
    punpcklbw   mm1, mm6
    psubsw      mm0, mm1               ; A-B (low order) to MM0

    paddw       mm5, mm0               ; accumulate differences in mm5

    pmaddwd     mm0, mm0               ; square and accumulate
    paddd       mm7, mm0               ; accumulate in mm7

    ; Now accumulate the final results.
    movq        QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory
    movq        QWORD PTR [rsp], mm7   ; copy back accumulated results into normal memory
    movsx       rdx, WORD PTR [rsp+8]
    movsx       rcx, WORD PTR [rsp+10]
    movsx       rbx, WORD PTR [rsp+12]
    movsx       rax, WORD PTR [rsp+14]
    add         rdx, rcx
    add         rbx, rax
    add         rdx, rbx               ; XSum
    movsxd      rax, DWORD PTR [rsp]
    movsxd      rcx, DWORD PTR [rsp+4]
    add         rax, rcx               ; XXSum
    mov         rsi, arg(4)            ; SSE
    mov         rdi, arg(5)            ; Sum
    mov         dword ptr [rsi], eax
    mov         dword ptr [rdi], edx
    xor         rax, rax               ; return 0

    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
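The MMX routine above keeps a running 16-bit sum of pixel differences in mm5 and a running 32-bit sum of squared differences in mm7, one unrolled row at a time, then folds both registers down into *Sum and *SSE at the end. A hedged C equivalent of what each unrolled "Row N" block contributes (purely illustrative, not library code):

#include <stdint.h>

/* One row of vpx_get8x8var_mmx in scalar form: eight byte differences are
 * widened to 16 bits and added to the running sum, and their squares
 * (the pmaddwd pairing) are added to the running sum of squares. */
static void accumulate_row(const uint8_t *src, const uint8_t *ref,
                           int32_t *sum, uint32_t *sse) {
  int k;
  for (k = 0; k < 8; ++k) {
    const int diff = src[k] - ref[k];
    *sum += diff;
    *sse += (uint32_t)(diff * diff);
  }
}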
107  vpx_dsp/x86/variance_mmx.c  Normal file
@@ -0,0 +1,107 @@
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"

extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
                              const uint8_t *b, int b_stride,
                              unsigned int *sse, int *sum);

unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
                                 const unsigned char *b, int b_stride,
                                 unsigned int *sse) {
  unsigned int var;
  int avg;

  vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 4));
}

unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
                                 const unsigned char *b, int b_stride,
                                 unsigned int *sse) {
  unsigned int var;
  int avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
  *sse = var;

  return (var - (((unsigned int)avg * avg) >> 6));
}

unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
                              const unsigned char *b, int b_stride,
                              unsigned int *sse) {
  unsigned int sse0, sse1, sse2, sse3, var;
  int sum0, sum1, sum2, sum3;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
                    b + 8 * b_stride, b_stride, &sse2, &sum2);
  vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
                    b + 8 * b_stride + 8, b_stride, &sse3, &sum3);

  var = sse0 + sse1 + sse2 + sse3;
  *sse = var;
  return var;
}

unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
                                   const unsigned char *b, int b_stride,
                                   unsigned int *sse) {
  unsigned int sse0, sse1, sse2, sse3, var;
  int sum0, sum1, sum2, sum3, avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
                    b + 8 * b_stride, b_stride, &sse2, &sum2);
  vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
                    b + 8 * b_stride + 8, b_stride, &sse3, &sum3);

  var = sse0 + sse1 + sse2 + sse3;
  avg = sum0 + sum1 + sum2 + sum3;
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 8));
}

unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
                                  const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
  unsigned int sse0, sse1, var;
  int sum0, sum1, avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);

  var = sse0 + sse1;
  avg = sum0 + sum1;
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 7));
}

unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
                                  const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
  unsigned int sse0, sse1, var;
  int sum0, sum1, avg;

  vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
  vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
                    b + 8 * b_stride, b_stride, &sse1, &sum1);

  var = sse0 + sse1;
  avg = sum0 + sum1;
  *sse = var;

  return (var - (((unsigned int)avg * avg) >> 7));
}
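The larger MMX variances are composed from 8x8 sub-block calls because the kernel only handles 8-pixel rows; SSE and sum are simply additive across the quadrants, so only the final sum-squared correction depends on the full block size. A short sketch of that composition, mirroring vpx_variance16x16_mmx() above (the helper name is illustrative):

/* Combine four 8x8 sse/sum pairs into a 16x16 variance.  The quadrant
 * totals add; the normalization uses the whole block (256 pixels -> >> 8). */
static unsigned int combine_16x16(const unsigned int sse[4], const int sum[4],
                                  unsigned int *sse_out) {
  const unsigned int total_sse = sse[0] + sse[1] + sse[2] + sse[3];
  const int total_sum = sum[0] + sum[1] + sum[2] + sum[3];
  *sse_out = total_sse;
  return total_sse - (((unsigned int)total_sum * total_sum) >> 8);
}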
309
vpx_dsp/x86/variance_sse2.c
Normal file
309
vpx_dsp/x86/variance_sse2.c
Normal file
@ -0,0 +1,309 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <emmintrin.h> // SSE2
|
||||||
|
|
||||||
|
#include "./vpx_config.h"
|
||||||
|
#include "./vpx_dsp_rtcd.h"
|
||||||
|
|
||||||
|
#include "vpx_ports/mem.h"
|
||||||
|
|
||||||
|
typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
|
||||||
|
const unsigned char *ref, int ref_stride,
|
||||||
|
unsigned int *sse, int *sum);
|
||||||
|
|
||||||
|
unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
|
||||||
|
__m128i vsum = _mm_setzero_si128();
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < 32; ++i) {
|
||||||
|
const __m128i v = _mm_loadu_si128((const __m128i *)src);
|
||||||
|
vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
|
||||||
|
src += 8;
|
||||||
|
}
|
||||||
|
|
||||||
|
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
|
||||||
|
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
|
||||||
|
return _mm_cvtsi128_si32(vsum);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define READ64(p, stride, i) \
|
||||||
|
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
|
||||||
|
_mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
|
||||||
|
|
||||||
|
static void get4x4var_sse2(const uint8_t *src, int src_stride,
|
||||||
|
const uint8_t *ref, int ref_stride,
|
||||||
|
unsigned int *sse, int *sum) {
|
||||||
|
const __m128i zero = _mm_setzero_si128();
|
||||||
|
const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
|
||||||
|
const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
|
||||||
|
const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
|
||||||
|
const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
|
||||||
|
const __m128i diff0 = _mm_sub_epi16(src0, ref0);
|
||||||
|
const __m128i diff1 = _mm_sub_epi16(src1, ref1);
|
||||||
|
|
||||||
|
// sum
|
||||||
|
__m128i vsum = _mm_add_epi16(diff0, diff1);
|
||||||
|
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
|
||||||
|
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
|
||||||
|
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
|
||||||
|
*sum = (int16_t)_mm_extract_epi16(vsum, 0);
|
||||||
|
|
||||||
|
// sse
|
||||||
|
vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
|
||||||
|
_mm_madd_epi16(diff1, diff1));
|
||||||
|
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
|
||||||
|
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
|
||||||
|
*sse = _mm_cvtsi128_si32(vsum);
|
||||||
|
}

void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride,
                        unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
         (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}
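
/* Note on the final sum above: the row loop accumulates 32 differences per
 * 16-bit lane (two per row over 16 rows), so a lane holds at most
 * 32 * 255 = 8160.  After the two folding steps each remaining lane holds at
 * most 4 * 8160 = 32640, which still fits in int16, but folding once more
 * could reach 16 * 16 * 255 = 65280 and overflow; the last two lanes are
 * therefore extracted and added in 32-bit C arithmetic.  The 8x8 helper's
 * total (at most 8 * 8 * 255 = 16320) fits in int16, so it can fold all the
 * way down to one lane. */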

static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          getNxMvar_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride,
             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}
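
/* The wrapper above tiles a w x h block with block_size x block_size calls
 * and accumulates the per-tile sse/sum.  The callers below then apply the
 * usual identity variance = sse - sum^2 / N with N = w * h, implementing the
 * division as a right shift by log2(N); e.g. the 16x8 variant uses >> 7.
 * A sketch of that final step, with an illustrative name that is not part of
 * this commit: */
static unsigned int variance_final_sketch(unsigned int sse, int sum,
                                          int log2_n) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> log2_n);
}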

unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 4);
}

unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
                sse, &sum, get4x4var_sse2, 4);
  return *sse - (((unsigned int)sum * sum) >> 5);
}

unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 6);
}

unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
                sse, &sum, vpx_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
                sse, &sum, vpx_get8x8var_sse2, 8);
  return *sse - (((unsigned int)sum * sum) >> 7);
}

unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((unsigned int)sum * sum) >> 8);
}

unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 10);
}
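
/* Note the (int64_t) cast above: for a 32x32 block |sum| can reach
 * 32 * 32 * 255 = 261120, so sum * sum no longer fits in 32 bits and the
 * product must be formed in 64-bit arithmetic before the shift.  For 16x16
 * and smaller blocks (|sum| <= 65280) the unsigned 32-bit product still fits,
 * which is why those variants cast to unsigned int instead. */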

unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
                sse, &sum, vpx_get16x16var_sse2, 16);
  return *sse - (((int64_t)sum * sum) >> 11);
}

unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}
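
/* A usage sketch (the function name, buffers and strides here are
 * hypothetical, not part of this commit): computing the variance and SSE of
 * one 16x16 block. */
static unsigned int example_block_variance(const uint8_t *src, int src_stride,
                                           const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  // Returns sse - (sum of differences)^2 / 256; sse is also written out.
  return vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, &sse);
}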