MIPS MSA: VP8 copy mem optimization
Average improvement ~2x-4x. Change-Id: I3af3ecced96c5b8e0cb811256e5089e28fe013a2
This commit is contained in:
parent
33a9d53c10
commit
509fb0bc9d
70
vp8/common/mips/msa/copymem_msa.c
Normal file
70
vp8/common/mips/msa/copymem_msa.c
Normal file
@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Copyright (c) 2015 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vp8_rtcd.h"
|
||||
#include "vp8/common/mips/msa/vp8_macros_msa.h"
|
||||
|
||||
/* Copy a block that is 8 bytes wide and 4 rows tall.
 *
 * src / dst         - first byte of the source / destination block
 * src_stride /
 * dst_stride        - byte distance between consecutive rows
 *
 * Each row is carried in one 64-bit value.  LD4 reads the four source rows
 * as double words; SD4 (defined in vp8_macros_msa.h) then writes them out.
 * All four rows are read before the first one is written.
 */
static void copy_8x4_msa(uint8_t *src, int32_t src_stride,
                         uint8_t *dst, int32_t dst_stride) {
  uint64_t row0, row1, row2, row3;

  LD4(src, src_stride, row0, row1, row2, row3);
  SD4(row0, row1, row2, row3, dst, dst_stride);
}
|
||||
|
||||
/* Copy a block that is 8 bytes wide and 8 rows tall.
 *
 * src / dst         - first byte of the source / destination block
 * src_stride /
 * dst_stride        - byte distance between consecutive rows
 *
 * The block is handled as two 4-row halves.  For each half the four rows
 * are loaded as 64-bit values and stored right away, after which both
 * pointers move down by four rows.
 */
static void copy_8x8_msa(uint8_t *src, int32_t src_stride,
                         uint8_t *dst, int32_t dst_stride) {
  uint64_t row0, row1, row2, row3;
  int half;

  for (half = 0; half < 2; ++half) {
    LD4(src, src_stride, row0, row1, row2, row3);
    SD4(row0, row1, row2, row3, dst, dst_stride);

    src += (4 * src_stride);
    dst += (4 * dst_stride);
  }
}
|
||||
|
||||
static void copy_16x16_msa(uint8_t *src, int32_t src_stride,
|
||||
uint8_t *dst, int32_t dst_stride)
|
||||
{
|
||||
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
|
||||
|
||||
LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
src += (8 * src_stride);
|
||||
LD_UB8(src, src_stride, src8, src9, src10, src11, src12, src13, src14,
|
||||
src15);
|
||||
|
||||
ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
|
||||
dst += (8 * dst_stride);
|
||||
ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst,
|
||||
dst_stride);
|
||||
}
|
||||
|
||||
/* MSA entry point for vp8_copy_mem16x16.
 *
 * Forwards all four arguments unchanged to the file-local helper
 * copy_16x16_msa(), which performs the actual copy.
 */
void vp8_copy_mem16x16_msa(uint8_t *src,
                           int32_t src_stride,
                           uint8_t *dst,
                           int32_t dst_stride) {
  copy_16x16_msa(src, src_stride, dst, dst_stride);
}
|
||||
|
||||
/* MSA entry point for vp8_copy_mem8x8.
 *
 * Forwards all four arguments unchanged to the file-local helper
 * copy_8x8_msa(), which performs the actual copy.
 */
void vp8_copy_mem8x8_msa(uint8_t *src,
                         int32_t src_stride,
                         uint8_t *dst,
                         int32_t dst_stride) {
  copy_8x8_msa(src, src_stride, dst, dst_stride);
}
|
||||
|
||||
/* MSA entry point for vp8_copy_mem8x4.
 *
 * Forwards all four arguments unchanged to the file-local helper
 * copy_8x4_msa(), which performs the actual copy.
 */
void vp8_copy_mem8x4_msa(uint8_t *src,
                         int32_t src_stride,
                         uint8_t *dst,
                         int32_t dst_stride) {
  copy_8x4_msa(src, src_stride, dst, dst_stride);
}
|
@ -222,6 +222,23 @@
|
||||
out3 = LW((psrc) + 3 * stride); \
|
||||
}
|
||||
|
||||
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1             (LD2)
                           out0, out1, out2, out3 (LD4)
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 LD4 additionally loads 'out2' from (psrc + 2 * stride)
                 and 'out3' from (psrc + 3 * stride)
   Note        : 'stride' is parenthesized in every expansion so that an
                 expression argument (e.g. 'pitch + 1') is scaled as a
                 whole instead of being split by operator precedence.
                 'psrc' and 'stride' are expanded more than once, so they
                 must not have side effects.
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + (stride));      \
}
#define LD4(psrc, stride, out0, out1, out2, out3)            \
{                                                            \
    LD2((psrc), (stride), out0, out1);                       \
    LD2((psrc) + 2 * (stride), (stride), out2, out3);        \
}
|
||||
|
||||
/* Description : Store 4 words with stride
|
||||
Arguments : Inputs - in0, in1, in2, in3, pdst, stride
|
||||
Details : Store word from 'in0' to (pdst)
|
||||
@ -298,6 +315,7 @@
|
||||
LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
|
||||
}
|
||||
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
|
||||
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
|
||||
|
||||
/* Description : Load vectors with 8 halfword elements with stride
|
||||
Arguments : Inputs - psrc, stride
|
||||
@ -339,6 +357,14 @@
|
||||
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
|
||||
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
|
||||
|
||||
/* Description : Store 8 vectors of type RTYPE with stride
   Arguments   : Inputs - in0 ... in7, pdst, stride
   Details     : 'in0' .. 'in3' are stored through ST_B4 starting at (pdst),
                 'in4' .. 'in7' through ST_B4 starting at
                 (pdst + 4 * stride)
   Note        : 'pdst' and 'stride' are parenthesized in every expansion
                 so that expression arguments keep their meaning; both are
                 expanded more than once and must not have side effects.
*/
#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,               \
              pdst, stride)                                                \
{                                                                          \
    ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                    \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));     \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
|
||||
|
||||
/* Description : Store vectors of 8 halfword elements with stride
|
||||
Arguments : Inputs - in0, in1, pdst, stride
|
||||
Details : Store 8 halfword elements from 'in0' to (pdst)
|
||||
|
@ -138,17 +138,17 @@ $vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2;
|
||||
# RECON
|
||||
#
|
||||
add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2/;
|
||||
specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2 msa/;
|
||||
$vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6;
|
||||
$vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2;
|
||||
|
||||
add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_copy_mem8x8 mmx media neon dspr2/;
|
||||
specialize qw/vp8_copy_mem8x8 mmx media neon dspr2 msa/;
|
||||
$vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6;
|
||||
$vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2;
|
||||
|
||||
add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_copy_mem8x4 mmx media neon dspr2/;
|
||||
specialize qw/vp8_copy_mem8x4 mmx media neon dspr2 msa/;
|
||||
$vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6;
|
||||
$vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
|
||||
|
||||
|
@ -114,6 +114,7 @@ VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/idct_blk_dspr2.c
|
||||
VP8_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/dequantize_dspr2.c
|
||||
|
||||
# common (c)
|
||||
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/copymem_msa.c
|
||||
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct_msa.c
|
||||
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c
|
||||
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c
|
||||
|
Loading…
Reference in New Issue
Block a user