vp8: [loongson] optimize sixtap predict with mmi
1. vp8_sixtap_predict16x16_mmi 2. vp8_sixtap_predict8x8_mmi 3. vp8_sixtap_predict8x4_mmi 4. vp8_sixtap_predict4x4_mmi Change-Id: I186669d1a1d998a0f3ba3a548e25eee8b52c251b
This commit is contained in:
		| @@ -324,6 +324,15 @@ INSTANTIATE_TEST_CASE_P( | |||||||
|                       make_tuple(4, 4, &vp8_sixtap_predict4x4_msa))); |                       make_tuple(4, 4, &vp8_sixtap_predict4x4_msa))); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | #if HAVE_MMI | ||||||
|  | INSTANTIATE_TEST_CASE_P( | ||||||
|  |     MMI, SixtapPredictTest, | ||||||
|  |     ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi), | ||||||
|  |                       make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi), | ||||||
|  |                       make_tuple(8, 4, &vp8_sixtap_predict8x4_mmi), | ||||||
|  |                       make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); | ||||||
|  | #endif | ||||||
|  |  | ||||||
| class BilinearPredictTest : public PredictTestBase {}; | class BilinearPredictTest : public PredictTestBase {}; | ||||||
|  |  | ||||||
| TEST_P(BilinearPredictTest, TestWithRandomData) { | TEST_P(BilinearPredictTest, TestWithRandomData) { | ||||||
|   | |||||||
							
								
								
									
										398
									
								
								vp8/common/mips/mmi/sixtap_filter_mmi.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										398
									
								
								vp8/common/mips/mmi/sixtap_filter_mmi.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,398 @@ | |||||||
|  | /* | ||||||
|  |  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved. | ||||||
|  |  * | ||||||
|  |  *  Use of this source code is governed by a BSD-style license | ||||||
|  |  *  that can be found in the LICENSE file in the root of the source | ||||||
|  |  *  tree. An additional intellectual property rights grant can be found | ||||||
|  |  *  in the file PATENTS.  All contributing project authors may | ||||||
|  |  *  be found in the AUTHORS file in the root of the source tree. | ||||||
|  |  */ | ||||||
|  |  | ||||||
|  | #include "vp8/common/filter.h" | ||||||
|  | #include "vpx_ports/asmdefs_mmi.h" | ||||||
|  | /* Six-tap filter coefficients for the MMI kernels below. The first index is | ||||||
|  |    the sub-pixel offset (xoffset / yoffset in sixtapNxM). Each entry stores | ||||||
|  |    the six taps in order, every tap repeated eight times as int16_t, so tap k | ||||||
|  |    starts at byte offset k * 0x10, which is where the kernels load it with | ||||||
|  |    ldc1. Negative taps are written as 16-bit two's-complement hex (0xfffa | ||||||
|  |    is -6). The taps of every entry sum to 0x80, matching the +0x40 rounding | ||||||
|  |    and the arithmetic shift by 7 applied in the kernels. */ | ||||||
|  | DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = { | ||||||
|  |   { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||||||
|  |     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||||||
|  |     0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, | ||||||
|  |     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||||||
|  |     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||||||
|  |     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, | ||||||
|  |   { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||||||
|  |     0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, | ||||||
|  |     0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, | ||||||
|  |     0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, | ||||||
|  |     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, | ||||||
|  |     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, | ||||||
|  |   { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, | ||||||
|  |     0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, | ||||||
|  |     0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, | ||||||
|  |     0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, | ||||||
|  |     0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, | ||||||
|  |     0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 }, | ||||||
|  |   { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||||||
|  |     0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, | ||||||
|  |     0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, | ||||||
|  |     0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, | ||||||
|  |     0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, | ||||||
|  |     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, | ||||||
|  |   { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, | ||||||
|  |     0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, | ||||||
|  |     0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, | ||||||
|  |     0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, | ||||||
|  |     0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, | ||||||
|  |     0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 }, | ||||||
|  |   { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||||||
|  |     0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, | ||||||
|  |     0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, | ||||||
|  |     0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, | ||||||
|  |     0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, | ||||||
|  |     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, | ||||||
|  |   { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, | ||||||
|  |     0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, | ||||||
|  |     0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, | ||||||
|  |     0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, | ||||||
|  |     0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, | ||||||
|  |     0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 }, | ||||||
|  |   { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, | ||||||
|  |     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, | ||||||
|  |     0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, | ||||||
|  |     0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, | ||||||
|  |     0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, | ||||||
|  |     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | /* Horizontal pass of the six-tap filter:  pixel_step is 1. Every call | ||||||
|  |    produces four output columns (one 8-byte store of four 16-bit values) for | ||||||
|  |    output_height rows. In sixtapNxM output_height is always H + 5, because | ||||||
|  |    the vertical pass needs five extra rows. output_width is added to | ||||||
|  |    output_ptr as a raw address increment, so it is a stride in bytes (the | ||||||
|  |    caller passes n * 2), not a count of uint16_t elements. | ||||||
|  |  | ||||||
|  |    vp8_filter points at one entry of vp8_six_tap_mmi; the six taps are loaded | ||||||
|  |    once from byte offsets 0x00, 0x10, ... 0x50. | ||||||
|  |  | ||||||
|  |    Per row (byte positions assume little-endian order): | ||||||
|  |      - the first unaligned 8-byte load covers src_ptr[-2..5]; its low four | ||||||
|  |        bytes are multiplied by tap 0 and its high four bytes by tap 4; | ||||||
|  |      - the second load covers src_ptr[-1..6]; low bytes use tap 1, high | ||||||
|  |        bytes use tap 5; | ||||||
|  |      - the second load shifted right by 8 and then 16 bits supplies | ||||||
|  |        src_ptr[0..3] for tap 2 and src_ptr[1..4] for tap 3; | ||||||
|  |      - the products are accumulated with saturating adds (paddsh), 0x40 is | ||||||
|  |        added for rounding, the sum is shifted right arithmetically by 7, | ||||||
|  |        clamped to a byte with packushb and widened back to 16 bits. | ||||||
|  |  | ||||||
|  |    The loop is bottom-tested, so output_height must be non-zero. The FP | ||||||
|  |    temporaries are pinned to fixed registers; the O32 variant uses only | ||||||
|  |    even-numbered ones. */ | ||||||
|  | static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, | ||||||
|  |                                              uint16_t *output_ptr, | ||||||
|  |                                              unsigned int src_pixels_per_line, | ||||||
|  |                                              unsigned int output_height, | ||||||
|  |                                              unsigned int output_width, | ||||||
|  |                                              const int16_t *vp8_filter) { | ||||||
|  |   uint32_t tmp[1]; | ||||||
|  |   DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; | ||||||
|  |  | ||||||
|  | #if _MIPS_SIM == _ABIO32 | ||||||
|  |   register double fzero asm("$f0"); | ||||||
|  |   register double ftmp0 asm("$f2"); | ||||||
|  |   register double ftmp1 asm("$f4"); | ||||||
|  |   register double ftmp2 asm("$f6"); | ||||||
|  |   register double ftmp3 asm("$f8"); | ||||||
|  |   register double ftmp4 asm("$f10"); | ||||||
|  |   register double ftmp5 asm("$f12"); | ||||||
|  |   register double ftmp6 asm("$f14"); | ||||||
|  |   register double ftmp7 asm("$f16"); | ||||||
|  |   register double ftmp8 asm("$f18"); | ||||||
|  |   register double ftmp9 asm("$f20"); | ||||||
|  |   register double ftmp10 asm("$f22"); | ||||||
|  | #else | ||||||
|  |   register double fzero asm("$f0"); | ||||||
|  |   register double ftmp0 asm("$f1"); | ||||||
|  |   register double ftmp1 asm("$f2"); | ||||||
|  |   register double ftmp2 asm("$f3"); | ||||||
|  |   register double ftmp3 asm("$f4"); | ||||||
|  |   register double ftmp4 asm("$f5"); | ||||||
|  |   register double ftmp5 asm("$f6"); | ||||||
|  |   register double ftmp6 asm("$f7"); | ||||||
|  |   register double ftmp7 asm("$f8"); | ||||||
|  |   register double ftmp8 asm("$f9"); | ||||||
|  |   register double ftmp9 asm("$f10"); | ||||||
|  |   register double ftmp10 asm("$f11"); | ||||||
|  | #endif  // _MIPS_SIM == _ABIO32 | ||||||
|  |  | ||||||
|  |   __asm__ volatile ( | ||||||
|  |     "ldc1       %[ftmp0],       0x00(%[vp8_filter])                   \n\t" | ||||||
|  |     "ldc1       %[ftmp1],       0x10(%[vp8_filter])                   \n\t" | ||||||
|  |     "ldc1       %[ftmp2],       0x20(%[vp8_filter])                   \n\t" | ||||||
|  |     "ldc1       %[ftmp3],       0x30(%[vp8_filter])                   \n\t" | ||||||
|  |     "ldc1       %[ftmp4],       0x40(%[vp8_filter])                   \n\t" | ||||||
|  |     "ldc1       %[ftmp5],       0x50(%[vp8_filter])                   \n\t" | ||||||
|  |     "xor        %[fzero],       %[fzero],           %[fzero]          \n\t" | ||||||
|  |     "li         %[tmp0],        0x07                                  \n\t" | ||||||
|  |     "mtc1       %[tmp0],        %[ftmp7]                              \n\t" | ||||||
|  |     "li         %[tmp0],        0x08                                  \n\t" | ||||||
|  |     "mtc1       %[tmp0],        %[ftmp10]                             \n\t" | ||||||
|  |  | ||||||
|  |     "1:                                                               \n\t" | ||||||
|  |     "gsldlc1    %[ftmp9],       0x05(%[src_ptr])                      \n\t" | ||||||
|  |     "gsldrc1    %[ftmp9],      -0x02(%[src_ptr])                      \n\t" | ||||||
|  |  | ||||||
|  |     "punpcklbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t" | ||||||
|  |     "pmullh     %[ftmp8],       %[ftmp6],          %[ftmp0]           \n\t" | ||||||
|  |  | ||||||
|  |     "punpckhbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t" | ||||||
|  |     "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp4]           \n\t" | ||||||
|  |     "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t" | ||||||
|  |  | ||||||
|  |     "gsldlc1    %[ftmp9],       0x06(%[src_ptr])                      \n\t" | ||||||
|  |     "gsldrc1    %[ftmp9],      -0x01(%[src_ptr])                      \n\t" | ||||||
|  |  | ||||||
|  |     "punpcklbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t" | ||||||
|  |     "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp1]           \n\t" | ||||||
|  |     "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t" | ||||||
|  |  | ||||||
|  |     "punpckhbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t" | ||||||
|  |     "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp5]           \n\t" | ||||||
|  |     "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t" | ||||||
|  |  | ||||||
|  |     "dsrl       %[ftmp9],       %[ftmp9],          %[ftmp10]          \n\t" | ||||||
|  |     "punpcklbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t" | ||||||
|  |     "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp2]           \n\t" | ||||||
|  |     "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t" | ||||||
|  |  | ||||||
|  |     "dsrl       %[ftmp9],       %[ftmp9],          %[ftmp10]          \n\t" | ||||||
|  |     "punpcklbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t" | ||||||
|  |     "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp3]           \n\t" | ||||||
|  |     "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t" | ||||||
|  |  | ||||||
|  |     "paddsh     %[ftmp8],       %[ftmp8],          %[ff_ph_40]        \n\t" | ||||||
|  |     "psrah      %[ftmp8],       %[ftmp8],          %[ftmp7]           \n\t" | ||||||
|  |     "packushb   %[ftmp8],       %[ftmp8],          %[fzero]           \n\t" | ||||||
|  |     "punpcklbh  %[ftmp8],       %[ftmp8],          %[fzero]           \n\t" | ||||||
|  |     "gssdlc1    %[ftmp8],       0x07(%[output_ptr])                   \n\t" | ||||||
|  |     "gssdrc1    %[ftmp8],       0x00(%[output_ptr])                   \n\t" | ||||||
|  |  | ||||||
|  |     "addiu      %[output_height], %[output_height], -0x01             \n\t" | ||||||
|  |     MMI_ADDU(%[output_ptr],  %[output_ptr],    %[output_width]) | ||||||
|  |     MMI_ADDU(%[src_ptr],  %[src_ptr], %[src_pixels_per_line]) | ||||||
|  |     "bnez       %[output_height],               1b                    \n\t" | ||||||
|  |     : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0), | ||||||
|  |       [ftmp1]"=&f"(ftmp1),              [ftmp2]"=&f"(ftmp2), | ||||||
|  |       [ftmp3]"=&f"(ftmp3),              [ftmp4]"=&f"(ftmp4), | ||||||
|  |       [ftmp5]"=&f"(ftmp5),              [ftmp6]"=&f"(ftmp6), | ||||||
|  |       [ftmp7]"=&f"(ftmp7),              [ftmp8]"=&f"(ftmp8), | ||||||
|  |       [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10), | ||||||
|  |       [tmp0]"=&r"(tmp[0]),              [src_ptr]"+&r"(src_ptr), | ||||||
|  |       [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height) | ||||||
|  |     : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), | ||||||
|  |       [vp8_filter]"r"(vp8_filter),      [output_width]"r"(output_width), | ||||||
|  |       [ff_ph_40]"f"(ff_ph_40) | ||||||
|  |     : "memory" | ||||||
|  |     ); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /* Vertical pass of the six-tap filter:  pixel_step is always W. Every call | ||||||
|  |    produces four output columns (one 4-byte store) for output_height rows. | ||||||
|  |    src_ptr points into the 16-bit intermediate buffer written by the | ||||||
|  |    horizontal pass. pixels_per_line and output_pitch are added to the | ||||||
|  |    pointers as raw address increments, so pixels_per_line is a stride in | ||||||
|  |    bytes (the caller passes n * 2). | ||||||
|  |  | ||||||
|  |    Before the loop src_ptr is moved back by pixels_per_line << 1, i.e. two | ||||||
|  |    rows, so the caller passes the row aligned with the first output row. | ||||||
|  |  | ||||||
|  |    Per output row, four 16-bit samples are loaded from each of six | ||||||
|  |    consecutive rows and multiplied by taps 0..5. Rows 0, 1, 2 and 4 are read | ||||||
|  |    relative to src_ptr; src_ptr is then advanced by one row and rows 3 and 5 | ||||||
|  |    are read at +2 and +4 rows from the new position. The products are | ||||||
|  |    accumulated with saturating adds (paddsh), 0x40 is added for rounding, the | ||||||
|  |    sum is shifted right arithmetically by 7 and packed to bytes with | ||||||
|  |    packushb. | ||||||
|  |  | ||||||
|  |    The loop is bottom-tested, so output_height must be non-zero. */ | ||||||
|  | static INLINE void vp8_filter_block1dc_v6_mmi( | ||||||
|  |     uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, | ||||||
|  |     int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { | ||||||
|  |   DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; | ||||||
|  |   uint32_t tmp[1]; | ||||||
|  |   mips_reg addr[1]; | ||||||
|  | #if _MIPS_SIM == _ABIO32 | ||||||
|  |   register double fzero asm("$f0"); | ||||||
|  |   register double ftmp0 asm("$f2"); | ||||||
|  |   register double ftmp1 asm("$f4"); | ||||||
|  |   register double ftmp2 asm("$f6"); | ||||||
|  |   register double ftmp3 asm("$f8"); | ||||||
|  |   register double ftmp4 asm("$f10"); | ||||||
|  |   register double ftmp5 asm("$f12"); | ||||||
|  |   register double ftmp6 asm("$f14"); | ||||||
|  |   register double ftmp7 asm("$f16"); | ||||||
|  |   register double ftmp8 asm("$f18"); | ||||||
|  | #else | ||||||
|  |   register double fzero asm("$f0"); | ||||||
|  |   register double ftmp0 asm("$f1"); | ||||||
|  |   register double ftmp1 asm("$f2"); | ||||||
|  |   register double ftmp2 asm("$f3"); | ||||||
|  |   register double ftmp3 asm("$f4"); | ||||||
|  |   register double ftmp4 asm("$f5"); | ||||||
|  |   register double ftmp5 asm("$f6"); | ||||||
|  |   register double ftmp6 asm("$f7"); | ||||||
|  |   register double ftmp7 asm("$f8"); | ||||||
|  |   register double ftmp8 asm("$f9"); | ||||||
|  | #endif  // _MIPS_SIM == _ABIO32 | ||||||
|  |  | ||||||
|  |   __asm__ volatile ( | ||||||
|  |     "ldc1       %[ftmp0],     0x00(%[vp8_filter])                     \n\t" | ||||||
|  |     "ldc1       %[ftmp1],     0x10(%[vp8_filter])                     \n\t" | ||||||
|  |     "ldc1       %[ftmp2],     0x20(%[vp8_filter])                     \n\t" | ||||||
|  |     "ldc1       %[ftmp3],     0x30(%[vp8_filter])                     \n\t" | ||||||
|  |     "ldc1       %[ftmp4],     0x40(%[vp8_filter])                     \n\t" | ||||||
|  |     "ldc1       %[ftmp5],     0x50(%[vp8_filter])                     \n\t" | ||||||
|  |     MMI_SUBU(%[src_ptr],   %[src_ptr],      %[pixels_per_line_x2]) | ||||||
|  |     "xor        %[fzero],     %[fzero],        %[fzero]               \n\t" | ||||||
|  |     "li         %[tmp0],      0x07                                    \n\t" | ||||||
|  |     "mtc1       %[tmp0],      %[ftmp7]                                \n\t" | ||||||
|  |  | ||||||
|  |     "1:                                                               \n\t" | ||||||
|  |     "gsldlc1    %[ftmp6],     0x07(%[src_ptr])                        \n\t" | ||||||
|  |     "gsldrc1    %[ftmp6],     0x00(%[src_ptr])                        \n\t" | ||||||
|  |     "pmullh     %[ftmp8],     %[ftmp6],        %[ftmp0]               \n\t" | ||||||
|  |  | ||||||
|  |     MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line]) | ||||||
|  |     "gsldlc1    %[ftmp6],     0x07(%[addr0])                          \n\t" | ||||||
|  |     "gsldrc1    %[ftmp6],     0x00(%[addr0])                          \n\t" | ||||||
|  |     "pmullh     %[ftmp6],     %[ftmp6],        %[ftmp1]               \n\t" | ||||||
|  |     "paddsh     %[ftmp8],     %[ftmp8],        %[ftmp6]               \n\t" | ||||||
|  |  | ||||||
|  |     MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x2]) | ||||||
|  |     "gsldlc1    %[ftmp6],     0x07(%[addr0])                          \n\t" | ||||||
|  |     "gsldrc1    %[ftmp6],     0x00(%[addr0])                          \n\t" | ||||||
|  |     "pmullh     %[ftmp6],     %[ftmp6],        %[ftmp2]               \n\t" | ||||||
|  |     "paddsh     %[ftmp8],     %[ftmp8],        %[ftmp6]               \n\t" | ||||||
|  |  | ||||||
|  |     MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x4]) | ||||||
|  |     "gsldlc1    %[ftmp6],     0x07(%[addr0])                          \n\t" | ||||||
|  |     "gsldrc1    %[ftmp6],     0x00(%[addr0])                          \n\t" | ||||||
|  |     "pmullh     %[ftmp6],     %[ftmp6],        %[ftmp4]               \n\t" | ||||||
|  |     "paddsh     %[ftmp8],     %[ftmp8],        %[ftmp6]               \n\t" | ||||||
|  |  | ||||||
|  |     MMI_ADDU(%[src_ptr],   %[src_ptr],      %[pixels_per_line]) | ||||||
|  |     MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x2]) | ||||||
|  |     "gsldlc1    %[ftmp6],     0x07(%[addr0])                          \n\t" | ||||||
|  |     "gsldrc1    %[ftmp6],     0x00(%[addr0])                          \n\t" | ||||||
|  |     "pmullh     %[ftmp6],     %[ftmp6],        %[ftmp3]               \n\t" | ||||||
|  |     "paddsh     %[ftmp8],     %[ftmp8],        %[ftmp6]               \n\t" | ||||||
|  |  | ||||||
|  |     MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x4]) | ||||||
|  |     "gsldlc1    %[ftmp6],     0x07(%[addr0])                          \n\t" | ||||||
|  |     "gsldrc1    %[ftmp6],     0x00(%[addr0])                          \n\t" | ||||||
|  |     "pmullh     %[ftmp6],     %[ftmp6],        %[ftmp5]               \n\t" | ||||||
|  |     "paddsh     %[ftmp8],     %[ftmp8],        %[ftmp6]               \n\t" | ||||||
|  |  | ||||||
|  |     "paddsh     %[ftmp8],     %[ftmp8],        %[ff_ph_40]            \n\t" | ||||||
|  |     "psrah      %[ftmp8],     %[ftmp8],        %[ftmp7]               \n\t" | ||||||
|  |     "packushb   %[ftmp8],     %[ftmp8],        %[fzero]               \n\t" | ||||||
|  |     "gsswlc1    %[ftmp8],     0x03(%[output_ptr])                     \n\t" | ||||||
|  |     "gsswrc1    %[ftmp8],     0x00(%[output_ptr])                     \n\t" | ||||||
|  |  | ||||||
|  |     MMI_ADDIU(%[output_height], %[output_height], -0x01) | ||||||
|  |     MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) | ||||||
|  |     "bnez       %[output_height], 1b                                  \n\t" | ||||||
|  |     : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0), | ||||||
|  |       [ftmp1]"=&f"(ftmp1),              [ftmp2]"=&f"(ftmp2), | ||||||
|  |       [ftmp3]"=&f"(ftmp3),              [ftmp4]"=&f"(ftmp4), | ||||||
|  |       [ftmp5]"=&f"(ftmp5),              [ftmp6]"=&f"(ftmp6), | ||||||
|  |       [ftmp7]"=&f"(ftmp7),              [ftmp8]"=&f"(ftmp8), | ||||||
|  |       [tmp0]"=&r"(tmp[0]),              [addr0]"=&r"(addr[0]), | ||||||
|  |       [src_ptr]"+&r"(src_ptr),          [output_ptr]"+&r"(output_ptr), | ||||||
|  |       [output_height]"+&r"(output_height) | ||||||
|  |     : [pixels_per_line]"r"((mips_reg)pixels_per_line), | ||||||
|  |       [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), | ||||||
|  |       [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), | ||||||
|  |       [vp8_filter]"r"(vp8_filter), | ||||||
|  |       [output_pitch]"r"((mips_reg)output_pitch), | ||||||
|  |       [ff_ph_40]"f"(ff_ph_40) | ||||||
|  |     : "memory" | ||||||
|  |     ); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /* When xoffset == 0 (or yoffset == 0), vp8_filter = {0,0,128,0,0,0}, so | ||||||
|  |    (128 * x + 64) >> 7 == x and the functions vp8_filter_block1d_h6_mmi and | ||||||
|  |    vp8_filter_block1dc_v6_mmi can be simplified to plain copies. | ||||||
|  |  | ||||||
|  |    This is the horizontal variant: for each of output_height rows it loads | ||||||
|  |    8 bytes starting at src_ptr, widens the low four bytes to 16 bits | ||||||
|  |    (punpcklbh with zero) and stores them as four 16-bit values at | ||||||
|  |    output_ptr. output_width is added to output_ptr as a raw address | ||||||
|  |    increment, so it is a stride in bytes. The loop is bottom-tested, so | ||||||
|  |    output_height must be non-zero. */ | ||||||
|  | static INLINE void vp8_filter_block1d_h6_filter0_mmi( | ||||||
|  |     unsigned char *src_ptr, uint16_t *output_ptr, | ||||||
|  |     unsigned int src_pixels_per_line, unsigned int output_height, | ||||||
|  |     unsigned int output_width) { | ||||||
|  | #if _MIPS_SIM == _ABIO32 | ||||||
|  |   register double fzero asm("$f0"); | ||||||
|  |   register double ftmp0 asm("$f2"); | ||||||
|  |   register double ftmp1 asm("$f4"); | ||||||
|  | #else | ||||||
|  |   register double fzero asm("$f0"); | ||||||
|  |   register double ftmp0 asm("$f1"); | ||||||
|  |   register double ftmp1 asm("$f2"); | ||||||
|  | #endif  // _MIPS_SIM == _ABIO32 | ||||||
|  |  | ||||||
|  |   __asm__ volatile ( | ||||||
|  |     "xor        %[fzero],       %[fzero],           %[fzero]          \n\t" | ||||||
|  |  | ||||||
|  |     "1:                                                               \n\t" | ||||||
|  |     "gsldlc1    %[ftmp0],       0x07(%[src_ptr])                      \n\t" | ||||||
|  |     "gsldrc1    %[ftmp0],       0x00(%[src_ptr])                      \n\t" | ||||||
|  |  | ||||||
|  |     "punpcklbh  %[ftmp1],       %[ftmp0],          %[fzero]           \n\t" | ||||||
|  |     "gssdlc1    %[ftmp1],       0x07(%[output_ptr])                   \n\t" | ||||||
|  |     "gssdrc1    %[ftmp1],       0x00(%[output_ptr])                   \n\t" | ||||||
|  |  | ||||||
|  |     "addiu      %[output_height], %[output_height], -0x01             \n\t" | ||||||
|  |     MMI_ADDU(%[output_ptr],  %[output_ptr],    %[output_width]) | ||||||
|  |     MMI_ADDU(%[src_ptr],  %[src_ptr], %[src_pixels_per_line]) | ||||||
|  |     "bnez       %[output_height],               1b                    \n\t" | ||||||
|  |     : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0), | ||||||
|  |       [ftmp1]"=&f"(ftmp1),              [src_ptr]"+&r"(src_ptr), | ||||||
|  |       [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height) | ||||||
|  |     : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), | ||||||
|  |       [output_width]"r"(output_width) | ||||||
|  |     : "memory" | ||||||
|  |     ); | ||||||
|  | } | ||||||
|  | /* Vertical pass for yoffset == 0, where the filter is {0,0,128,0,0,0} and | ||||||
|  |    the output equals the centre-row sample. For each of output_height rows | ||||||
|  |    it loads four 16-bit values from src_ptr, packs them to bytes with | ||||||
|  |    packushb and stores 4 bytes at output_ptr. Unlike | ||||||
|  |    vp8_filter_block1dc_v6_mmi it does not move src_ptr back by two rows, so | ||||||
|  |    src_ptr must already point at the row to copy. pixels_per_line and | ||||||
|  |    output_pitch are raw address increments (bytes). The loop is | ||||||
|  |    bottom-tested, so output_height must be non-zero. */ | ||||||
|  | static INLINE void vp8_filter_block1dc_v6_filter0_mmi( | ||||||
|  |     uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, | ||||||
|  |     int output_pitch, unsigned int pixels_per_line) { | ||||||
|  | #if _MIPS_SIM == _ABIO32 | ||||||
|  |   register double fzero asm("$f0"); | ||||||
|  |   register double ftmp0 asm("$f2"); | ||||||
|  |   register double ftmp1 asm("$f4"); | ||||||
|  | #else | ||||||
|  |   register double fzero asm("$f0"); | ||||||
|  |   register double ftmp0 asm("$f1"); | ||||||
|  |   register double ftmp1 asm("$f2"); | ||||||
|  | #endif  // _MIPS_SIM == _ABIO32 | ||||||
|  |  | ||||||
|  |   __asm__ volatile ( | ||||||
|  |     "xor        %[fzero],     %[fzero],        %[fzero]               \n\t" | ||||||
|  |  | ||||||
|  |     "1:                                                               \n\t" | ||||||
|  |     "gsldlc1    %[ftmp0],     0x07(%[src_ptr])                        \n\t" | ||||||
|  |     "gsldrc1    %[ftmp0],     0x00(%[src_ptr])                        \n\t" | ||||||
|  |     "packushb   %[ftmp1],     %[ftmp0],        %[fzero]               \n\t" | ||||||
|  |     "gsswlc1    %[ftmp1],     0x03(%[output_ptr])                     \n\t" | ||||||
|  |     "gsswrc1    %[ftmp1],     0x00(%[output_ptr])                     \n\t" | ||||||
|  |  | ||||||
|  |     MMI_ADDU(%[src_ptr],   %[src_ptr],      %[pixels_per_line]) | ||||||
|  |     MMI_ADDIU(%[output_height], %[output_height], -0x01) | ||||||
|  |     MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) | ||||||
|  |     "bnez       %[output_height], 1b                                  \n\t" | ||||||
|  |     : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0), | ||||||
|  |       [ftmp1]"=&f"(ftmp1),              [src_ptr]"+&r"(src_ptr), | ||||||
|  |       [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height) | ||||||
|  |     : [pixels_per_line]"r"((mips_reg)pixels_per_line), | ||||||
|  |       [output_pitch]"r"((mips_reg)output_pitch) | ||||||
|  |     : "memory" | ||||||
|  |     ); | ||||||
|  | } | ||||||
|  | /* Defines vp8_sixtap_predict<n>x<m>_mmi for a block n pixels wide and m | ||||||
|  |    pixels high. | ||||||
|  |    Pass 1 filters m + 5 rows horizontally, starting two rows above src_ptr, | ||||||
|  |    into FData2 as 16-bit values with a row stride of n elements (n * 2 | ||||||
|  |    bytes). | ||||||
|  |    Pass 2 filters FData2 vertically into dst_ptr; it is handed | ||||||
|  |    FData2 + n * 2, i.e. the third intermediate row, which lines up with the | ||||||
|  |    first output row. | ||||||
|  |    Each helper call handles four columns, so both passes run n / 4 times. | ||||||
|  |    xoffset and yoffset index vp8_six_tap_mmi directly with no range check | ||||||
|  |    (the table has 8 entries); an offset of 0 selects the copy-only helper | ||||||
|  |    for that pass. */ | ||||||
|  | #define sixtapNxM(n, m)                                                        \ | ||||||
|  |   void vp8_sixtap_predict##n##x##m##_mmi(                                      \ | ||||||
|  |       unsigned char *src_ptr, int src_pixels_per_line, int xoffset,            \ | ||||||
|  |       int yoffset, unsigned char *dst_ptr, int dst_pitch) {                    \ | ||||||
|  |     DECLARE_ALIGNED(16, uint16_t,                                              \ | ||||||
|  |                     FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 16 : n))]);     \ | ||||||
|  |     const int16_t *HFilter, *VFilter;                                          \ | ||||||
|  |     int i, loop = n / 4;                                                       \ | ||||||
|  |     HFilter = vp8_six_tap_mmi[xoffset];                                        \ | ||||||
|  |     VFilter = vp8_six_tap_mmi[yoffset];                                        \ | ||||||
|  |                                                                                \ | ||||||
|  |     if (xoffset == 0) {                                                        \ | ||||||
|  |       for (i = 0; i < loop; ++i) {                                             \ | ||||||
|  |         vp8_filter_block1d_h6_filter0_mmi(                                     \ | ||||||
|  |             src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4,       \ | ||||||
|  |             src_pixels_per_line, m + 5, n * 2);                                \ | ||||||
|  |       }                                                                        \ | ||||||
|  |     } else {                                                                   \ | ||||||
|  |       for (i = 0; i < loop; ++i) {                                             \ | ||||||
|  |         vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \ | ||||||
|  |                                   FData2 + i * 4, src_pixels_per_line, m + 5,  \ | ||||||
|  |                                   n * 2, HFilter);                             \ | ||||||
|  |       }                                                                        \ | ||||||
|  |     }                                                                          \ | ||||||
|  |     if (yoffset == 0) {                                                        \ | ||||||
|  |       for (i = 0; i < loop; ++i) {                                             \ | ||||||
|  |         vp8_filter_block1dc_v6_filter0_mmi(                                    \ | ||||||
|  |             FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2);     \ | ||||||
|  |       }                                                                        \ | ||||||
|  |     } else {                                                                   \ | ||||||
|  |       for (i = 0; i < loop; ++i) {                                             \ | ||||||
|  |         vp8_filter_block1dc_v6_mmi(FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, \ | ||||||
|  |                                    dst_pitch, n * 2, VFilter);                 \ | ||||||
|  |       }                                                                        \ | ||||||
|  |     }                                                                          \ | ||||||
|  |   } | ||||||
|  |  | ||||||
|  | sixtapNxM(4, 4); | ||||||
|  | sixtapNxM(8, 8); | ||||||
|  | sixtapNxM(8, 4); | ||||||
|  | sixtapNxM(16, 16); | ||||||
| @@ -132,16 +132,16 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") { | |||||||
| # Subpixel | # Subpixel | ||||||
| # | # | ||||||
| add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa/; | specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi/; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa/; | specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi/; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa/; | specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/; | specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi/; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/; | specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/; | ||||||
|   | |||||||
| @@ -116,6 +116,9 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c | |||||||
| VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c | VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c | ||||||
| VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h | VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h | ||||||
|  |  | ||||||
|  | # common (c) | ||||||
|  | VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/sixtap_filter_mmi.c | ||||||
|  |  | ||||||
| ifeq ($(CONFIG_POSTPROC),yes) | ifeq ($(CONFIG_POSTPROC),yes) | ||||||
| VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c | VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c | ||||||
| endif | endif | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Shiyou Yin
					Shiyou Yin