Merge "mips msa vpx_dsp variance optimization"
commit c96bb8004e
@@ -2011,4 +2011,52 @@ INSTANTIATE_TEST_CASE_P(
    ::testing::Values(make_tuple(4, 4, variance16x16_media, 0),
                      make_tuple(3, 3, variance8x8_media, 0)));
#endif  // HAVE_MEDIA

#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest,
                        ::testing::Values(vpx_get_mb_ss_msa));

const Get4x4SseFunc get4x4sse_cs_msa = vpx_get4x4sse_cs_msa;
INSTANTIATE_TEST_CASE_P(MSA, VpxSseTest,
                        ::testing::Values(make_tuple(2, 2, get4x4sse_cs_msa)));

const VarianceMxNFunc mse16x16_msa = vpx_mse16x16_msa;
const VarianceMxNFunc mse16x8_msa = vpx_mse16x8_msa;
const VarianceMxNFunc mse8x16_msa = vpx_mse8x16_msa;
const VarianceMxNFunc mse8x8_msa = vpx_mse8x8_msa;
INSTANTIATE_TEST_CASE_P(MSA, VpxMseTest,
                        ::testing::Values(make_tuple(4, 4, mse16x16_msa),
                                          make_tuple(4, 3, mse16x8_msa),
                                          make_tuple(3, 4, mse8x16_msa),
                                          make_tuple(3, 3, mse8x8_msa)));

const VarianceMxNFunc variance64x64_msa = vpx_variance64x64_msa;
const VarianceMxNFunc variance64x32_msa = vpx_variance64x32_msa;
const VarianceMxNFunc variance32x64_msa = vpx_variance32x64_msa;
const VarianceMxNFunc variance32x32_msa = vpx_variance32x32_msa;
const VarianceMxNFunc variance32x16_msa = vpx_variance32x16_msa;
const VarianceMxNFunc variance16x32_msa = vpx_variance16x32_msa;
const VarianceMxNFunc variance16x16_msa = vpx_variance16x16_msa;
const VarianceMxNFunc variance16x8_msa = vpx_variance16x8_msa;
const VarianceMxNFunc variance8x16_msa = vpx_variance8x16_msa;
const VarianceMxNFunc variance8x8_msa = vpx_variance8x8_msa;
const VarianceMxNFunc variance8x4_msa = vpx_variance8x4_msa;
const VarianceMxNFunc variance4x8_msa = vpx_variance4x8_msa;
const VarianceMxNFunc variance4x4_msa = vpx_variance4x4_msa;
INSTANTIATE_TEST_CASE_P(
    MSA, VpxVarianceTest,
    ::testing::Values(make_tuple(6, 6, variance64x64_msa, 0),
                      make_tuple(6, 5, variance64x32_msa, 0),
                      make_tuple(5, 6, variance32x64_msa, 0),
                      make_tuple(5, 5, variance32x32_msa, 0),
                      make_tuple(5, 4, variance32x16_msa, 0),
                      make_tuple(4, 5, variance16x32_msa, 0),
                      make_tuple(4, 4, variance16x16_msa, 0),
                      make_tuple(4, 3, variance16x8_msa, 0),
                      make_tuple(3, 4, variance8x16_msa, 0),
                      make_tuple(3, 3, variance8x8_msa, 0),
                      make_tuple(3, 2, variance8x4_msa, 0),
                      make_tuple(2, 3, variance4x8_msa, 0),
                      make_tuple(2, 2, variance4x4_msa, 0)));
#endif  // HAVE_MSA
}  // namespace
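The first two values in each VpxVarianceTest tuple are log2(width) and log2(height) of the block, so make_tuple(6, 6, variance64x64_msa, 0) exercises the 64x64 kernel, and the harness checks each MSA function against the plain C reference. A minimal sketch of that reference computation, assuming 8-bit src/ref blocks (the names here are illustrative, not taken from the test):

#include <stdint.h>

/* Reference block variance: SSE minus the squared mean difference.
   This mirrors the quantity the MSA kernels are checked against. */
static uint32_t reference_variance(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int wd, int ht, uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse_acc = 0;
  for (int i = 0; i < ht; ++i) {
    for (int j = 0; j < wd; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += d;                       /* running sum of differences */
      sse_acc += (uint64_t)(d * d);   /* running sum of squared differences */
    }
  }
  *sse = (uint32_t)sse_acc;
  return (uint32_t)(sse_acc - (uint64_t)((sum * sum) / (wd * ht)));
}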
vpx_dsp/mips/macros_msa.h (new file, 244 lines)
@@ -0,0 +1,244 @@
/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_DSP_MIPS_MACROS_MSA_H_
#define VPX_DSP_MIPS_MACROS_MSA_H_

#include <msa.h>

#include "./vpx_config.h"
#include "vpx/vpx_integer.h"

#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#if (__mips_isa_rev >= 6)
#define LW(psrc) ({ \
  const uint8_t *psrc_m = (const uint8_t *)(psrc); \
  uint32_t val_m; \
  \
  __asm__ __volatile__ ( \
      "lw %[val_m], %[psrc_m] \n\t" \
      \
      : [val_m] "=r" (val_m) \
      : [psrc_m] "m" (*psrc_m) \
  ); \
  \
  val_m; \
})
#else  // !(__mips_isa_rev >= 6)
#define LW(psrc) ({ \
  const uint8_t *psrc_m = (const uint8_t *)(psrc); \
  uint32_t val_m; \
  \
  __asm__ __volatile__ ( \
      "ulw %[val_m], %[psrc_m] \n\t" \
      \
      : [val_m] "=r" (val_m) \
      : [psrc_m] "m" (*psrc_m) \
  ); \
  \
  val_m; \
})
#endif  // (__mips_isa_rev >= 6)

/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3) { \
  out0 = LW((psrc)); \
  out1 = LW((psrc) + stride); \
  out2 = LW((psrc) + 2 * stride); \
  out3 = LW((psrc) + 3 * stride); \
}

/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) { \
  out0 = LD_B(RTYPE, (psrc)); \
  out1 = LD_B(RTYPE, (psrc) + stride); \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
  LD_B2(RTYPE, (psrc), stride, out0, out1); \
  LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)

/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
   Details     : Load 8 halfword elements in 'out0' from (psrc)
                 Load 8 halfword elements in 'out1' from (psrc + stride)
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) { \
  out0 = LD_H(RTYPE, (psrc)); \
  out1 = LD_H(RTYPE, (psrc) + (stride)); \
}

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
  LD_H2(RTYPE, (psrc), stride, out0, out1); \
  LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) { \
  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

/* Description : Dot product & addition of double word vector elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with itself
                 producing an intermediate result twice the size of it
                 i.e. signed double word
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) { \
  out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
  out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
}
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)

/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in (signed word vector)
                 Output - sum_m (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned
*/
#define HADD_SW_S32(in) ({ \
  v2i64 res0_m, res1_m; \
  int32_t sum_m; \
  \
  res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
  res1_m = __msa_splati_d(res0_m, 1); \
  res0_m = res0_m + res1_m; \
  sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
  sum_m; \
})

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is subtracted from
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) { \
  out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
  out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
}
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

/* Description : Set element n input vector to GPR value
   Arguments   : Inputs - in0, in1, in2, in3
                 Output - out
                 Return Type - as per RTYPE
   Details     : Set element 0 in vector 'out' to value specified in 'in0'
*/
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \
  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
  out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
  out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) { \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
}
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of 'in0' are copied to the left half of
                 'out0' & even double elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
  out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
  out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) { \
  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)

/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in (halfword vector)
                 Outputs - out0, out1 (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in0' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in0' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1) { \
  v8i16 tmp_m; \
  \
  tmp_m = __msa_clti_s_h((v8i16)in, 0); \
  ILVRL_H2_SW(tmp_m, in, out0, out1); \
}
#endif  /* VPX_DSP_MIPS_MACROS_MSA_H_ */
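The 4-pixel-wide paths in variance_msa.c below combine LW4 and INSERT_W4_UB to build one 16-byte vector out of four unaligned 32-bit row loads before the SIMD difference/accumulate step. A scalar sketch of that packing, for readers without the MSA intrinsics reference at hand (plain C, not part of the commit):

#include <stdint.h>
#include <string.h>

/* Scalar equivalent of LW4 + INSERT_W4_UB: fetch a 32-bit word from each of
   four rows and pack them into one 16-byte buffer (one SIMD register). */
static void pack_4x4_rows(const uint8_t *p, int stride, uint8_t out[16]) {
  for (int row = 0; row < 4; ++row) {
    uint32_t word;
    memcpy(&word, p + row * stride, sizeof(word));  /* like LW(): alignment-safe load */
    memcpy(out + 4 * row, &word, sizeof(word));     /* like __msa_insert_w() into slot 'row' */
  }
}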
vpx_dsp/mips/variance_msa.c (new file, 633 lines)
@@ -0,0 +1,633 @@
/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

#define CALC_MSE_B(src, ref, var) { \
  v16u8 src_l0_m, src_l1_m; \
  v8i16 res_l0_m, res_l1_m; \
  \
  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
}

#define CALC_MSE_AVG_B(src, ref, var, sub) { \
  v16u8 src_l0_m, src_l1_m; \
  v8i16 res_l0_m, res_l1_m; \
  \
  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
  \
  sub += res_l0_m + res_l1_m; \
}

#define VARIANCE_WxH(sse, diff, shift) \
  sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)

static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  int32_t ht_cnt;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;

    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t get_mb_ss_msa(const int16_t *src) {
  uint32_t sum, cnt;
  v8i16 src0, src1, src2, src3;
  v4i32 src0_l, src1_l, src2_l, src3_l;
  v4i32 src0_r, src1_r, src2_r, src3_r;
  v2i64 sq_src_l = { 0 };
  v2i64 sq_src_r = { 0 };

  for (cnt = 8; cnt--;) {
    LD_SH4(src, 8, src0, src1, src2, src3);
    src += 4 * 8;

    UNPCK_SH_SW(src0, src0_l, src0_r);
    UNPCK_SH_SW(src1, src1_l, src1_r);
    UNPCK_SH_SW(src2, src2_l, src2_r);
    UNPCK_SH_SW(src3, src3_l, src3_r);

    DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
  }

  sq_src_l += __msa_splati_d(sq_src_l, 1);
  sq_src_r += __msa_splati_d(sq_src_r, 1);

  sum = __msa_copy_s_d(sq_src_l, 0);
  sum += __msa_copy_s_d(sq_src_r, 0);

  return sum;
}

static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                src0, src1, ref0, ref1);
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);
  }

  return HADD_SW_S32(var);
}

uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride) {
  uint32_t err = 0;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16i8 src = { 0 };
  v16i8 ref = { 0 };
  v16u8 src_vec0, src_vec1;
  v8i16 diff0, diff1;
  v4i32 err0 = { 0 };
  v4i32 err1 = { 0 };

  LW4(src_ptr, src_stride, src0, src1, src2, src3);
  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
  INSERT_W4_SB(src0, src1, src2, src3, src);
  INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
  ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
  HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
  DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
  err = HADD_SW_S32(err0);
  err += HADD_SW_S32(err1);

  return err;
}

#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

#define VPX_VARIANCE_WDXHT_MSA(wd, ht) \
uint32_t vpx_variance##wd##x##ht##_msa(const uint8_t *src, \
                                       int32_t src_stride, \
                                       const uint8_t *ref, \
                                       int32_t ref_stride, \
                                       uint32_t *sse) { \
  int32_t diff; \
  \
  *sse = sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, \
                                  ht, &diff); \
  \
  return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
}

VPX_VARIANCE_WDXHT_MSA(4, 4);
VPX_VARIANCE_WDXHT_MSA(4, 8);

VPX_VARIANCE_WDXHT_MSA(8, 4)
VPX_VARIANCE_WDXHT_MSA(8, 8)
VPX_VARIANCE_WDXHT_MSA(8, 16)

VPX_VARIANCE_WDXHT_MSA(16, 8)
VPX_VARIANCE_WDXHT_MSA(16, 16)
VPX_VARIANCE_WDXHT_MSA(16, 32)

VPX_VARIANCE_WDXHT_MSA(32, 16)
VPX_VARIANCE_WDXHT_MSA(32, 32)

uint32_t vpx_variance32x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_32Wx64H(*sse, diff);
}

uint32_t vpx_variance64x32_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx32H(*sse, diff);
}

uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx64H(*sse, diff);
}

uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride,
                        const uint8_t *ref, int32_t ref_stride,
                        uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t vpx_mse8x16_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

uint32_t vpx_mse16x8_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride,
                          const uint8_t *ref, int32_t ref_stride,
                          uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride,
                       const uint8_t *ref, int32_t ref_stride,
                       uint32_t *sse, int32_t *sum) {
  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
}

void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse, int32_t *sum) {
  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
}

uint32_t vpx_get_mb_ss_msa(const int16_t *src) {
  return get_mb_ss_msa(src);
}
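VARIANCE_WxH and VARIANCE_LARGE_WxH above implement variance = sse - sum^2 / (width * height); since width * height is always a power of two, the division becomes a right shift, and the per-size wrappers pass log2(width * height) as the shift (8 for 16x16, 12 for 64x64, and so on). A small worked example of the same arithmetic (illustrative only, not code from the commit):

#include <stdint.h>

/* Variance of a 16x16 block from its SSE and sum of differences:
   16 * 16 = 256 = 1 << 8, so dividing sum*sum by 256 is a shift by 8.
   e.g. sse = 1000, sum = 160: 160 * 160 = 25600, 25600 >> 8 = 100,
   so the returned variance is 900. */
static uint32_t variance_16x16_from_sums(uint32_t sse, int32_t sum) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}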
@@ -27,6 +27,8 @@ DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c

DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h

ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
@@ -46,6 +48,8 @@ DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c

DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c

ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
@@ -395,68 +395,68 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {

add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x64 sse2 avx2 neon/;
specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;

add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance64x32 sse2 avx2 neon/;
specialize qw/vpx_variance64x32 sse2 avx2 neon msa/;

add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x64 sse2 neon/;
specialize qw/vpx_variance32x64 sse2 neon msa/;

add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x32 sse2 avx2 neon/;
specialize qw/vpx_variance32x32 sse2 avx2 neon msa/;

add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance32x16 sse2 avx2/;
specialize qw/vpx_variance32x16 sse2 avx2 msa/;

add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x32 sse2/;
specialize qw/vpx_variance16x32 sse2 msa/;

add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon/;
specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon msa/;

add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance16x8 mmx sse2 neon/;
specialize qw/vpx_variance16x8 mmx sse2 neon msa/;

add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x16 mmx sse2 neon/;
specialize qw/vpx_variance8x16 mmx sse2 neon msa/;

add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x8 mmx sse2 media neon/;
specialize qw/vpx_variance8x8 mmx sse2 media neon msa/;

add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance8x4 sse2/;
specialize qw/vpx_variance8x4 sse2 msa/;

add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x8 sse2/;
specialize qw/vpx_variance4x8 sse2 msa/;

add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vpx_variance4x4 mmx sse2/;
specialize qw/vpx_variance4x4 mmx sse2 msa/;


add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get16x16var sse2 avx2 neon/;
specialize qw/vpx_get16x16var sse2 avx2 neon msa/;

add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
specialize qw/vpx_get8x8var mmx sse2 neon/;
specialize qw/vpx_get8x8var mmx sse2 neon msa/;

add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon/;
specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon msa/;

add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse16x8 sse2/;
specialize qw/vpx_mse16x8 sse2 msa/;

add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse8x16 sse2/;
specialize qw/vpx_mse8x16 sse2 msa/;

add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
specialize qw/vpx_mse8x8 sse2/;
specialize qw/vpx_mse8x8 sse2 msa/;

add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
specialize qw/vpx_get_mb_ss mmx sse2/;
specialize qw/vpx_get_mb_ss mmx sse2 msa/;

add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
specialize qw/vpx_get4x4sse_cs neon/;
specialize qw/vpx_get4x4sse_cs neon msa/;

add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
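In this prototypes file, each pair of specialize lines above shows the entry before and after the change: adding "msa" is what routes the generic vpx_* symbol to the new *_msa kernel when the build has MSA enabled. A simplified illustration of the mapping the RTCD generator emits for one prototype (not the actual generated header, which also covers the other optimisation levels):

/* vpx_dsp_rtcd.h, conceptually, for a MIPS build */
#if HAVE_MSA
#define vpx_variance16x16 vpx_variance16x16_msa
#else
#define vpx_variance16x16 vpx_variance16x16_c
#endif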