Merge "mips msa vp8 filter by weight optimization"
This commit is contained in:
146
vp8/common/mips/msa/mfqe_msa.c
Normal file
146
vp8/common/mips/msa/mfqe_msa.c
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "./vp8_rtcd.h"
|
||||||
|
#include "vp8/common/postproc.h"
|
||||||
|
#include "vp8/common/mips/msa/vp8_macros_msa.h"
|
||||||
|
|
||||||
|
static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
|
||||||
|
uint8_t *dst_ptr, int32_t dst_stride,
|
||||||
|
int32_t src_weight)
|
||||||
|
{
|
||||||
|
int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
|
||||||
|
int32_t row;
|
||||||
|
uint64_t src0_d, src1_d, dst0_d, dst1_d;
|
||||||
|
v16i8 src0 = { 0 };
|
||||||
|
v16i8 src1 = { 0 };
|
||||||
|
v16i8 dst0 = { 0 };
|
||||||
|
v16i8 dst1 = { 0 };
|
||||||
|
v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;
|
||||||
|
|
||||||
|
src_wt = __msa_fill_h(src_weight);
|
||||||
|
dst_wt = __msa_fill_h(dst_weight);
|
||||||
|
|
||||||
|
for (row = 2; row--;)
|
||||||
|
{
|
||||||
|
LD2(src_ptr, src_stride, src0_d, src1_d);
|
||||||
|
src_ptr += (2 * src_stride);
|
||||||
|
LD2(dst_ptr, dst_stride, dst0_d, dst1_d);
|
||||||
|
INSERT_D2_SB(src0_d, src1_d, src0);
|
||||||
|
INSERT_D2_SB(dst0_d, dst1_d, dst0);
|
||||||
|
|
||||||
|
LD2(src_ptr, src_stride, src0_d, src1_d);
|
||||||
|
src_ptr += (2 * src_stride);
|
||||||
|
LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);
|
||||||
|
INSERT_D2_SB(src0_d, src1_d, src1);
|
||||||
|
INSERT_D2_SB(dst0_d, dst1_d, dst1);
|
||||||
|
|
||||||
|
UNPCK_UB_SH(src0, src_r, src_l);
|
||||||
|
UNPCK_UB_SH(dst0, dst_r, dst_l);
|
||||||
|
res_h_r = (src_r * src_wt);
|
||||||
|
res_h_r += (dst_r * dst_wt);
|
||||||
|
res_h_l = (src_l * src_wt);
|
||||||
|
res_h_l += (dst_l * dst_wt);
|
||||||
|
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
|
||||||
|
dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
|
||||||
|
ST8x2_UB(dst0, dst_ptr, dst_stride);
|
||||||
|
dst_ptr += (2 * dst_stride);
|
||||||
|
|
||||||
|
UNPCK_UB_SH(src1, src_r, src_l);
|
||||||
|
UNPCK_UB_SH(dst1, dst_r, dst_l);
|
||||||
|
res_h_r = (src_r * src_wt);
|
||||||
|
res_h_r += (dst_r * dst_wt);
|
||||||
|
res_h_l = (src_l * src_wt);
|
||||||
|
res_h_l += (dst_l * dst_wt);
|
||||||
|
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
|
||||||
|
dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);
|
||||||
|
ST8x2_UB(dst1, dst_ptr, dst_stride);
|
||||||
|
dst_ptr += (2 * dst_stride);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
|
||||||
|
uint8_t *dst_ptr, int32_t dst_stride,
|
||||||
|
int32_t src_weight)
|
||||||
|
{
|
||||||
|
int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;
|
||||||
|
int32_t row;
|
||||||
|
v16i8 src0, src1, src2, src3;
|
||||||
|
v16i8 dst0, dst1, dst2, dst3;
|
||||||
|
v8i16 src_wt, dst_wt;
|
||||||
|
v8i16 res_h_r, res_h_l;
|
||||||
|
v8i16 src_r, src_l, dst_r, dst_l;
|
||||||
|
|
||||||
|
src_wt = __msa_fill_h(src_weight);
|
||||||
|
dst_wt = __msa_fill_h(dst_weight);
|
||||||
|
|
||||||
|
for (row = 4; row--;)
|
||||||
|
{
|
||||||
|
LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);
|
||||||
|
src_ptr += (4 * src_stride);
|
||||||
|
LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);
|
||||||
|
|
||||||
|
UNPCK_UB_SH(src0, src_r, src_l);
|
||||||
|
UNPCK_UB_SH(dst0, dst_r, dst_l);
|
||||||
|
res_h_r = (src_r * src_wt);
|
||||||
|
res_h_r += (dst_r * dst_wt);
|
||||||
|
res_h_l = (src_l * src_wt);
|
||||||
|
res_h_l += (dst_l * dst_wt);
|
||||||
|
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
|
||||||
|
PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
|
||||||
|
dst_ptr += dst_stride;
|
||||||
|
|
||||||
|
UNPCK_UB_SH(src1, src_r, src_l);
|
||||||
|
UNPCK_UB_SH(dst1, dst_r, dst_l);
|
||||||
|
res_h_r = (src_r * src_wt);
|
||||||
|
res_h_r += (dst_r * dst_wt);
|
||||||
|
res_h_l = (src_l * src_wt);
|
||||||
|
res_h_l += (dst_l * dst_wt);
|
||||||
|
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
|
||||||
|
PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
|
||||||
|
dst_ptr += dst_stride;
|
||||||
|
|
||||||
|
UNPCK_UB_SH(src2, src_r, src_l);
|
||||||
|
UNPCK_UB_SH(dst2, dst_r, dst_l);
|
||||||
|
res_h_r = (src_r * src_wt);
|
||||||
|
res_h_r += (dst_r * dst_wt);
|
||||||
|
res_h_l = (src_l * src_wt);
|
||||||
|
res_h_l += (dst_l * dst_wt);
|
||||||
|
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
|
||||||
|
PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
|
||||||
|
dst_ptr += dst_stride;
|
||||||
|
|
||||||
|
UNPCK_UB_SH(src3, src_r, src_l);
|
||||||
|
UNPCK_UB_SH(dst3, dst_r, dst_l);
|
||||||
|
res_h_r = (src_r * src_wt);
|
||||||
|
res_h_r += (dst_r * dst_wt);
|
||||||
|
res_h_l = (src_l * src_wt);
|
||||||
|
res_h_l += (dst_l * dst_wt);
|
||||||
|
SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);
|
||||||
|
PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);
|
||||||
|
dst_ptr += dst_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Externally visible 16x16 entry point: passes all arguments through,
 * unchanged, to the file-local filter_by_weight16x16_msa() kernel. */
void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,
                                   uint8_t *dst_ptr, int32_t dst_stride,
                                   int32_t src_weight)
{
    filter_by_weight16x16_msa(src_ptr, src_stride,
                              dst_ptr, dst_stride,
                              src_weight);
}
|
||||||
|
|
||||||
|
/* Externally visible 8x8 entry point: passes all arguments through,
 * unchanged, to the file-local filter_by_weight8x8_msa() kernel. */
void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,
                                 uint8_t *dst_ptr, int32_t dst_stride,
                                 int32_t src_weight)
{
    filter_by_weight8x8_msa(src_ptr, src_stride,
                            dst_ptr, dst_stride,
                            src_weight);
}
|
@@ -435,6 +435,25 @@
|
|||||||
ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
|
ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'in' is expanded twice, so it must be free of side effects
*/
#define ST8x2_UB(in, pdst, stride)              \
{                                               \
    uint64_t out0_m, out1_m;                    \
    uint8_t *pblk_8x2_m = (uint8_t *)(pdst);    \
                                                \
    /* arguments are parenthesised so that */   \
    /* compound expressions expand safely  */   \
    out0_m = __msa_copy_u_d((v2i64)(in), 0);    \
    out1_m = __msa_copy_u_d((v2i64)(in), 1);    \
                                                \
    SD(out0_m, pblk_8x2_m);                     \
    SD(out1_m, pblk_8x2_m + (stride));          \
}
|
||||||
|
|
||||||
/* Description : Store 8x4 byte block to destination memory from input
|
/* Description : Store 8x4 byte block to destination memory from input
|
||||||
vectors
|
vectors
|
||||||
Arguments : Inputs - in0, in1, pdst, stride
|
Arguments : Inputs - in0, in1, pdst, stride
|
||||||
@@ -623,6 +642,19 @@
|
|||||||
out_m; \
|
out_m; \
|
||||||
})
|
})
|
||||||
|
|
||||||
|
/* Description : Set both double word elements of a vector to GPR values
   Arguments   : Inputs - in0, in1
                 Output - out
                 Return Type - as per RTYPE
   Details     : Double word element 0 in vector 'out' is set to the value
                 specified in 'in0'
                 Double word element 1 in vector 'out' is set to the value
                 specified in 'in1'
                 'out' is read as well as written, so it must be an
                 initialized lvalue
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);    \
    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);    \
}
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
|
||||||
|
|
||||||
/* Description : Interleave even byte elements from vectors
|
/* Description : Interleave even byte elements from vectors
|
||||||
Arguments : Inputs - in0, in1, in2, in3
|
Arguments : Inputs - in0, in1, in2, in3
|
||||||
Outputs - out0, out1
|
Outputs - out0, out1
|
||||||
@@ -1116,6 +1148,20 @@
|
|||||||
ADD2(in4, in5, in6, in7, out2, out3); \
|
ADD2(in4, in5, in6, in7, out2, out3); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (halfword vectors)
                 Return Type - signed halfword
   Details     : 'in' is interleaved with an all-zero byte vector
                 Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)         \
{                                           \
    v16i8 zero_m = { 0 };                   \
                                            \
    ILVRL_B2_SH(zero_m, in, out0, out1);    \
}
|
||||||
|
|
||||||
/* Description : Sign extend halfword elements from input vector and return
|
/* Description : Sign extend halfword elements from input vector and return
|
||||||
the result in pair of vectors
|
the result in pair of vectors
|
||||||
Arguments : Input - in (halfword vector)
|
Arguments : Input - in (halfword vector)
|
||||||
|
@@ -191,10 +191,10 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") {
|
|||||||
# no asm yet
|
# no asm yet
|
||||||
|
|
||||||
add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
|
add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
|
||||||
specialize qw/vp8_filter_by_weight16x16 sse2/;
|
specialize qw/vp8_filter_by_weight16x16 sse2 msa/;
|
||||||
|
|
||||||
add_proto qw/void vp8_filter_by_weight8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
|
add_proto qw/void vp8_filter_by_weight8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
|
||||||
specialize qw/vp8_filter_by_weight8x8 sse2/;
|
specialize qw/vp8_filter_by_weight8x8 sse2 msa/;
|
||||||
|
|
||||||
add_proto qw/void vp8_filter_by_weight4x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
|
add_proto qw/void vp8_filter_by_weight4x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";
|
||||||
# no asm yet
|
# no asm yet
|
||||||
|
@@ -122,6 +122,10 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/reconintra_msa.c
|
|||||||
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c
|
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c
|
||||||
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
|
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
|
||||||
|
|
||||||
|
ifeq ($(CONFIG_POSTPROC),yes)
|
||||||
|
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
|
||||||
|
endif
|
||||||
|
|
||||||
# common (c)
|
# common (c)
|
||||||
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
|
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
|
||||||
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
|
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
|
||||||
|
Reference in New Issue
Block a user