diff --git a/doc/mips.txt b/doc/mips.txt index 774a9c2689..6fa6fb4baf 100644 --- a/doc/mips.txt +++ b/doc/mips.txt @@ -59,6 +59,7 @@ Files that have MIPS copyright notice in them: dsputil_mips.c fft_mips.c fft_table.h + fft_init_table.c fmtconvert_mips.c mpegaudiodsp_mips_fixed.c mpegaudiodsp_mips_float.c diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 5e4df5dea0..8ac425475c 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -3173,6 +3173,7 @@ av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx) if (HAVE_MMI) ff_dsputil_init_mmi (c, avctx); if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx); if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx); + if (HAVE_MIPSFPU) ff_dsputil_init_mips (c, avctx); for (i = 0; i < 4; i++) { for (j = 0; j < 16; j++) { diff --git a/libavcodec/dsputil.h b/libavcodec/dsputil.h index 1d4650eab2..85ac20a6ff 100644 --- a/libavcodec/dsputil.h +++ b/libavcodec/dsputil.h @@ -622,6 +622,7 @@ void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx); void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx); void ff_dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx); void ff_dsputil_init_vis(DSPContext* c, AVCodecContext *avctx); +void ff_dsputil_init_mips(DSPContext* c, AVCodecContext *avctx); void ff_dsputil_init_dwt(DSPContext *c); void ff_mlp_init(DSPContext* c, AVCodecContext *avctx); diff --git a/libavcodec/fft.c b/libavcodec/fft.c index 6b93a5cdf3..e5bdcbd7ab 100644 --- a/libavcodec/fft.c +++ b/libavcodec/fft.c @@ -162,6 +162,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) if (HAVE_ALTIVEC) ff_fft_init_altivec(s); if (HAVE_MMX) ff_fft_init_mmx(s); if (CONFIG_MDCT) s->mdct_calcw = s->mdct_calc; + if (HAVE_MIPSFPU) ff_fft_init_mips(s); #else if (CONFIG_MDCT) s->mdct_calcw = ff_mdct_calcw_c; if (ARCH_ARM) ff_fft_fixed_init_arm(s); diff --git a/libavcodec/fft.h b/libavcodec/fft.h index 0e19e947b1..15e5a121d2 100644 --- a/libavcodec/fft.h +++ b/libavcodec/fft.h @@ -137,6 +137,7 @@ 
int ff_fft_init(FFTContext *s, int nbits, int inverse); void ff_fft_init_altivec(FFTContext *s); void ff_fft_init_mmx(FFTContext *s); void ff_fft_init_arm(FFTContext *s); +void ff_fft_init_mips(FFTContext *s); #else void ff_fft_fixed_init_arm(FFTContext *s); #endif diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c index 7f2dc7f038..372f2a36ce 100644 --- a/libavcodec/fmtconvert.c +++ b/libavcodec/fmtconvert.c @@ -86,6 +86,7 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx) if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx); if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx); if (HAVE_MMX) ff_fmt_convert_init_x86(c, avctx); + if (HAVE_MIPSFPU) ff_fmt_convert_init_mips(c); } /* ffdshow custom code */ diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h index a9fbb31dea..ab2caa2089 100644 --- a/libavcodec/fmtconvert.h +++ b/libavcodec/fmtconvert.h @@ -92,6 +92,7 @@ av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx); void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx); void ff_fmt_convert_init_altivec(FmtConvertContext *c, AVCodecContext *avctx); void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx); +void ff_fmt_convert_init_mips(FmtConvertContext *c); /* ffdshow custom code */ void float_interleave(float *dst, const float **src, long len, int channels); diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile index 67960b173d..ff46768429 100644 --- a/libavcodec/mips/Makefile +++ b/libavcodec/mips/Makefile @@ -13,3 +13,7 @@ MIPSFPU-OBJS-$(CONFIG_AMRWB_DECODER) += mips/acelp_filters_mips.o \ mips/acelp_vectors_mips.o MIPSFPU-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_float.o MIPSDSPR1-OBJS-$(CONFIG_MPEGAUDIODSP) += mips/mpegaudiodsp_mips_fixed.o +OBJS-$(CONFIG_FFT) += mips/fft_init_table.o +MIPSFPU-OBJS-$(CONFIG_FFT) += mips/fft_mips.o +MIPSFPU-OBJS-$(HAVE_INLINE_ASM) += mips/fmtconvert_mips.o +MIPSFPU-OBJS-$(HAVE_INLINE_ASM) += 
mips/dsputil_mips.o diff --git a/libavcodec/mips/dsputil_mips.c b/libavcodec/mips/dsputil_mips.c new file mode 100644 index 0000000000..e46a0a949f --- /dev/null +++ b/libavcodec/mips/dsputil_mips.c @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2012 + * MIPS Technologies, Inc., California. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of is + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Zoran Lukic (zoranl@mips.com) + * + * This file is part of FFmpeg. 
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include "libavcodec/dsputil.h"
+
+/**
+ * Windowed combination of two float vectors, MIPS FPU inline-asm version.
+ *
+ * After dst, win and src0 have been advanced by len, every index pair
+ * (i, j) = (-len + n, len - 1 - n), n = 0 .. len - 1, is computed as
+ *     dst[i] = src0[i] * win[j] - src1[j] * win[i];
+ *     dst[j] = src0[i] * win[i] + src1[j] * win[j];
+ *
+ * The assembler path consumes eight pairs per iteration. The len % 8 pairs
+ * that are left over are finished by a scalar loop, so a length that is not
+ * a multiple of 8 no longer reads or writes outside the buffers.
+ *
+ * @param dst  output, 2 * len floats
+ * @param src0 first input, len floats
+ * @param src1 second input, len floats
+ * @param win  window coefficients, 2 * len floats
+ * @param len  number of (i, j) pairs to produce
+ */
+static void vector_fmul_window_mips(float *dst, const float *src0,
+        const float *src1, const float *win, int len)
+{
+    int i, j;
+    /*
+     * variables used in inline assembler
+     */
+    float *dst_i, *dst_j, *dst_i2, *dst_j2;
+    float temp, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+
+    /* rebase so that i runs over [-len, -1] and j over [len - 1, 0] */
+    dst += len;
+    win += len;
+    src0 += len;
+
+    /* only enter the unrolled body while a full group of 8 pairs is left */
+    for (i = -len, j = len - 1; i <= -8; i += 8, j -= 8) {
+
+        dst_i = dst + i;
+        dst_j = dst + j;
+
+        dst_i2 = dst + i + 4;
+        dst_j2 = dst + j - 4;
+
+        /* pairs 0..3 of the group: (i, j) .. (i + 3, j - 3) */
+        __asm__ volatile (
+            "mul.s %[temp], %[s1], %[wi] \n\t"
+            "mul.s %[temp1], %[s1], %[wj] \n\t"
+            "mul.s %[temp2], %[s11], %[wi1] \n\t"
+            "mul.s %[temp3], %[s11], %[wj1] \n\t"
+
+            "msub.s %[temp], %[temp], %[s0], %[wj] \n\t"
+            "madd.s %[temp1], %[temp1], %[s0], %[wi] \n\t"
+            "msub.s %[temp2], %[temp2], %[s01], %[wj1] \n\t"
+            "madd.s %[temp3], %[temp3], %[s01], %[wi1] \n\t"
+
+            "swc1 %[temp], 0(%[dst_i]) \n\t" /* dst[i] = s0*wj - s1*wi; */
+            "swc1 %[temp1], 0(%[dst_j]) \n\t" /* dst[j] = s0*wi + s1*wj; */
+            "swc1 %[temp2], 4(%[dst_i]) \n\t" /* dst[i+1] = s01*wj1 - s11*wi1; */
+            "swc1 %[temp3], -4(%[dst_j]) \n\t" /* dst[j-1] = s01*wi1 + s11*wj1; */
+
+            "mul.s %[temp4], %[s12], %[wi2] \n\t"
+            "mul.s %[temp5], %[s12], %[wj2] \n\t"
+            "mul.s %[temp6], %[s13], %[wi3] \n\t"
+            "mul.s %[temp7], %[s13], %[wj3] \n\t"
+
+            "msub.s %[temp4], %[temp4], %[s02], %[wj2] \n\t"
+            "madd.s %[temp5], %[temp5], %[s02], %[wi2] \n\t"
+            "msub.s %[temp6], %[temp6], %[s03], %[wj3] \n\t"
+            "madd.s %[temp7], %[temp7], %[s03], %[wi3] \n\t"
+
+            "swc1 %[temp4], 8(%[dst_i]) \n\t" /* dst[i+2] = s02*wj2 - s12*wi2; */
+            "swc1 %[temp5], -8(%[dst_j]) \n\t" /* dst[j-2] = s02*wi2 + s12*wj2; */
+            "swc1 %[temp6], 12(%[dst_i]) \n\t" /* dst[i+3] = s03*wj3 - s13*wi3; */
+            "swc1 %[temp7], -12(%[dst_j]) \n\t" /* dst[j-3] = s03*wi3 + s13*wj3; */
+            : [temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4), [temp5]"=&f"(temp5),
+              [temp6]"=&f"(temp6), [temp7]"=&f"(temp7)
+            : [dst_j]"r"(dst_j), [dst_i]"r" (dst_i),
+              [s0] "f"(src0[i]), [wj] "f"(win[j]), [s1] "f"(src1[j]),
+              [wi] "f"(win[i]), [s01]"f"(src0[i + 1]),[wj1]"f"(win[j - 1]),
+              [s11]"f"(src1[j - 1]), [wi1]"f"(win[i + 1]), [s02]"f"(src0[i + 2]),
+              [wj2]"f"(win[j - 2]), [s12]"f"(src1[j - 2]),[wi2]"f"(win[i + 2]),
+              [s03]"f"(src0[i + 3]), [wj3]"f"(win[j - 3]), [s13]"f"(src1[j - 3]),
+              [wi3]"f"(win[i + 3])
+            : "memory"
+        );
+
+        /* pairs 4..7 of the group: same code, operands shifted by 4 */
+        __asm__ volatile (
+            "mul.s %[temp], %[s1], %[wi] \n\t"
+            "mul.s %[temp1], %[s1], %[wj] \n\t"
+            "mul.s %[temp2], %[s11], %[wi1] \n\t"
+            "mul.s %[temp3], %[s11], %[wj1] \n\t"
+
+            "msub.s %[temp], %[temp], %[s0], %[wj] \n\t"
+            "madd.s %[temp1], %[temp1], %[s0], %[wi] \n\t"
+            "msub.s %[temp2], %[temp2], %[s01], %[wj1] \n\t"
+            "madd.s %[temp3], %[temp3], %[s01], %[wi1] \n\t"
+
+            "swc1 %[temp], 0(%[dst_i2]) \n\t" /* dst[i+4] = s0*wj - s1*wi; */
+            "swc1 %[temp1], 0(%[dst_j2]) \n\t" /* dst[j-4] = s0*wi + s1*wj; */
+            "swc1 %[temp2], 4(%[dst_i2]) \n\t" /* dst[i+5] = s01*wj1 - s11*wi1; */
+            "swc1 %[temp3], -4(%[dst_j2]) \n\t" /* dst[j-5] = s01*wi1 + s11*wj1; */
+
+            "mul.s %[temp4], %[s12], %[wi2] \n\t"
+            "mul.s %[temp5], %[s12], %[wj2] \n\t"
+            "mul.s %[temp6], %[s13], %[wi3] \n\t"
+            "mul.s %[temp7], %[s13], %[wj3] \n\t"
+
+            "msub.s %[temp4], %[temp4], %[s02], %[wj2] \n\t"
+            "madd.s %[temp5], %[temp5], %[s02], %[wi2] \n\t"
+            "msub.s %[temp6], %[temp6], %[s03], %[wj3] \n\t"
+            "madd.s %[temp7], %[temp7], %[s03], %[wi3] \n\t"
+
+            "swc1 %[temp4], 8(%[dst_i2]) \n\t" /* dst[i+6] = s02*wj2 - s12*wi2; */
+            "swc1 %[temp5], -8(%[dst_j2]) \n\t" /* dst[j-6] = s02*wi2 + s12*wj2; */
+            "swc1 %[temp6], 12(%[dst_i2]) \n\t" /* dst[i+7] = s03*wj3 - s13*wi3; */
+            "swc1 %[temp7], -12(%[dst_j2]) \n\t" /* dst[j-7] = s03*wi3 + s13*wj3; */
+            : [temp]"=&f"(temp),
+              [temp1]"=&f"(temp1), [temp2]"=&f"(temp2), [temp3]"=&f"(temp3),
+              [temp4]"=&f"(temp4), [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+              [temp7] "=&f" (temp7)
+            : [dst_j2]"r"(dst_j2), [dst_i2]"r"(dst_i2),
+              [s0] "f"(src0[i + 4]), [wj] "f"(win[j - 4]), [s1] "f"(src1[j - 4]),
+              [wi] "f"(win[i + 4]), [s01]"f"(src0[i + 5]),[wj1]"f"(win[j - 5]),
+              [s11]"f"(src1[j - 5]), [wi1]"f"(win[i + 5]), [s02]"f"(src0[i + 6]),
+              [wj2]"f"(win[j - 6]), [s12]"f"(src1[j - 6]),[wi2]"f"(win[i + 6]),
+              [s03]"f"(src0[i + 7]), [wj3]"f"(win[j - 7]), [s13]"f"(src1[j - 7]),
+              [wi3]"f"(win[i + 7])
+            : "memory"
+        );
+    }
+
+    /* scalar tail: the len % 8 pairs the unrolled loop could not take */
+    for (; i < 0; i++, j--) {
+        float s0 = src0[i];
+        float s1 = src1[j];
+        float wi = win[i];
+        float wj = win[j];
+
+        dst[i] = s0 * wj - s1 * wi;
+        dst[j] = s0 * wi + s1 * wj;
+    }
+}
+
+/**
+ * Install the MIPS implementation of vector_fmul_window into the DSP context.
+ *
+ * @param c     context whose vector_fmul_window pointer is replaced
+ * @param avctx not read by this function
+ */
+av_cold void ff_dsputil_init_mips( DSPContext* c, AVCodecContext *avctx )
+{
+    c->vector_fmul_window = vector_fmul_window_mips;
+}
diff --git a/libavcodec/mips/fft_init_table.c b/libavcodec/mips/fft_init_table.c
new file mode 100644
index 0000000000..9c2e998e9c
--- /dev/null
+++ b/libavcodec/mips/fft_init_table.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2012
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Stanislav Ocovaj (socovaj@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * definitions and initialization of LUT table for MIPS FFT
+ */
+#include "fft_table.h"
+
+/* one entry per leaf produced by ff_fft_lut_init() for a size of 1 << 16 */
+uint16_t fft_offsets_lut[0x2aab];
+
+/**
+ * Recursively append offsets to a lookup table.
+ *
+ * A range of fewer than 16 elements is a leaf: it stores off >> 2 at
+ * table[*index] and advances *index by one. Any larger range is split into
+ * its first half followed by its third and fourth quarters, visited in that
+ * order.
+ *
+ * @param table destination table
+ * @param off   start offset of the current range
+ * @param size  length of the current range
+ * @param index in/out position of the next free table entry
+ */
+void ff_fft_lut_init(uint16_t *table, int off, int size, int *index)
+{
+    int half, quarter;
+
+    if (size < 16) {
+        int slot = *index;
+
+        table[slot] = off >> 2;
+        *index = slot + 1;
+        return;
+    }
+
+    half    = size >> 1;
+    quarter = size >> 2;
+
+    ff_fft_lut_init(table, off, half, index);
+    ff_fft_lut_init(table, off + half, quarter, index);
+    ff_fft_lut_init(table, off + 3 * quarter, quarter, index);
+}
diff --git a/libavcodec/mips/fft_mips.c b/libavcodec/mips/fft_mips.c
new file mode 100644
index 0000000000..2b1c50899c
--- /dev/null
+++ b/libavcodec/mips/fft_mips.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2012
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC.
BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Stanislav Ocovaj (socovaj@mips.com) + * Author: Zoran Lukic (zoranl@mips.com) + * + * Optimized MDCT/IMDCT and FFT transforms + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "config.h" +#include "libavcodec/fft.h" +#include "fft_table.h" + +/** + * FFT transform + */ + +#if HAVE_INLINE_ASM +static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z) +{ + int nbits, i, n, num_transforms, offset, step; + int n4, n2, n34; + FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + FFTComplex *tmpz; + float w_re, w_im; + float *w_re_ptr, *w_im_ptr; + const int fft_size = (1 << s->nbits); + int s_n = s->nbits; + int tem1, tem2; + float pom, pom1, pom2, pom3; + float temp, temp1, temp3, temp4; + FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4; + FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i; + + /** + *num_transforms = (0x2aab >> (16 - s->nbits)) | 1; + */ + __asm__ volatile ( + "li %[tem1], 16 \n\t" + "sub %[s_n], %[tem1], %[s_n] \n\t" + "li %[tem2], 10923 \n\t" + "srav %[tem2], %[tem2], %[s_n] \n\t" + "ori %[num_t],%[tem2], 1 \n\t" + : [num_t]"=r"(num_transforms), [s_n]"+r"(s_n), + [tem1]"=&r"(tem1), [tem2]"=&r"(tem2) + ); + + + for (n=0; n> 1) | 1; + + for (n=0; nnbits; nbits++) { + /* + * num_transforms = (num_transforms >> 1) | 1; + */ + __asm__ volatile ( + "sra %[num_t], %[num_t], 1 \n\t" + "ori %[num_t], %[num_t], 1 \n\t" + + : [num_t] "+r" (num_transforms) + ); + n2 = 2 * n4; + n34 = 3 * n4; + + for (n=0; n>= 1; + n4 <<= 1; + } +} + +/** + * MDCT/IMDCT transforms. 
+ */
+
+/**
+ * First stage of the inverse MDCT: pre-rotation, FFT, post-rotation.
+ *
+ * With n = 1 << s->mdct_bits, the routine reads n/2 samples from input,
+ * rotates them by the s->tcos / s->tsin tables into the complex buffer that
+ * overlays output (entries placed through s->revtab), runs s->fft_calc on
+ * that buffer and finally rotates and reorders the result in place.
+ * Both loops handle two complex values per iteration.
+ *
+ * @param s      FFT/MDCT context (revtab, tcos, tsin, mdct_bits, fft_calc)
+ * @param output n/2 floats, accessed as n/4 FFTComplex values
+ * @param input  n/2 floats
+ */
+static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    int k, n8, n4, n2, n, j;
+    const uint16_t *revtab = s->revtab;
+    const FFTSample *tcos = s->tcos;
+    const FFTSample *tsin = s->tsin;
+    const FFTSample *in1, *in2, *in3, *in4;
+    FFTComplex *z = (FFTComplex *)output;
+
+    int j1;
+    const float *tcos1, *tsin1, *tcos2, *tsin2;
+    float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+          temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+    FFTComplex *z1, *z2;
+
+    n = 1 << s->mdct_bits;
+    n2 = n >> 1;
+    n4 = n >> 2;
+    n8 = n >> 3;
+
+    /* pre rotation */
+    /* in1/in3 walk upwards from the start, in2/in4 downwards from the end */
+    in1 = input;
+    in2 = input + n2 - 1;
+    in3 = input + 2;
+    in4 = input + n2 - 3;
+
+    tcos1 = tcos;
+    tsin1 = tsin;
+
+    /* n4 = 64 or 128 */
+    for(k = 0; k < n4; k += 2) {
+        j = revtab[k ];
+        j1 = revtab[k + 1];
+
+        /*
+         * temp9  = in2[0]*tcos1[0] - in1[0]*tsin1[0]
+         * temp10 = in2[0]*tsin1[0] + in1[0]*tcos1[0]
+         * temp11 = in4[0]*tcos1[1] - in3[0]*tsin1[1]
+         * temp12 = in4[0]*tsin1[1] + in3[0]*tcos1[1]
+         * The pointers are advanced inside the asm for the next iteration.
+         */
+        __asm__ volatile (
+            "lwc1 %[temp1], 0(%[in2]) \t\n"
+            "lwc1 %[temp2], 0(%[tcos1]) \t\n"
+            "lwc1 %[temp3], 0(%[tsin1]) \t\n"
+            "lwc1 %[temp4], 0(%[in1]) \t\n"
+            "lwc1 %[temp5], 0(%[in4]) \t\n"
+            "mul.s %[temp9], %[temp1], %[temp2] \t\n"
+            "mul.s %[temp10], %[temp1], %[temp3] \t\n"
+            "lwc1 %[temp6], 4(%[tcos1]) \t\n"
+            "lwc1 %[temp7], 4(%[tsin1]) \t\n"
+            "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
+            "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
+            "mul.s %[temp11], %[temp5], %[temp6] \t\n"
+            "mul.s %[temp12], %[temp5], %[temp7] \t\n"
+            "lwc1 %[temp8], 0(%[in3]) \t\n"
+            "addiu %[tcos1], %[tcos1], 8 \t\n"
+            "addiu %[tsin1], %[tsin1], 8 \t\n"
+            "addiu %[in1], %[in1], 16 \t\n"
+            "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
+            "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
+            "addiu %[in2], %[in2], -16 \t\n"
+            "addiu %[in3], %[in3], 16 \t\n"
+            "addiu %[in4], %[in4], -16 \t\n"
+
+            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
+              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+              [tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1),
+              [in1]"+r"(in1), [in2]"+r"(in2),
+              [in3]"+r"(in3), [in4]"+r"(in4)
+        );
+
+        z[j ].re = temp9;
+        z[j ].im = temp10;
+        z[j1].re = temp11;
+        z[j1].im = temp12;
+    }
+
+    s->fft_calc(s, z);
+
+    /* post rotation + reordering */
+    /* n8 = 32 or 64 */
+    for(k = 0; k < n8; k += 2) {
+        /* z1 moves down from the middle of the buffer, z2 moves up */
+        tcos1 = &tcos[n8 - k - 2];
+        tsin1 = &tsin[n8 - k - 2];
+        tcos2 = &tcos[n8 + k];
+        tsin2 = &tsin[n8 + k];
+        z1 = &z[n8 - k - 2];
+        z2 = &z[n8 + k ];
+
+        /*
+         * Rotates z1[1], z1[0], z2[0] and z2[1] by their tsin/tcos entries;
+         * the eight results are swapped between z1 and z2 by the stores
+         * that follow the asm.
+         */
+        __asm__ volatile (
+            "lwc1 %[temp1], 12(%[z1]) \t\n"
+            "lwc1 %[temp2], 4(%[tsin1]) \t\n"
+            "lwc1 %[temp3], 4(%[tcos1]) \t\n"
+            "lwc1 %[temp4], 8(%[z1]) \t\n"
+            "lwc1 %[temp5], 4(%[z1]) \t\n"
+            "mul.s %[temp9], %[temp1], %[temp2] \t\n"
+            "mul.s %[temp10], %[temp1], %[temp3] \t\n"
+            "lwc1 %[temp6], 0(%[tsin1]) \t\n"
+            "lwc1 %[temp7], 0(%[tcos1]) \t\n"
+            "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
+            "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
+            "mul.s %[temp11], %[temp5], %[temp6] \t\n"
+            "mul.s %[temp12], %[temp5], %[temp7] \t\n"
+            "lwc1 %[temp8], 0(%[z1]) \t\n"
+            "lwc1 %[temp1], 4(%[z2]) \t\n"
+            "lwc1 %[temp2], 0(%[tsin2]) \t\n"
+            "lwc1 %[temp3], 0(%[tcos2]) \t\n"
+            "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
+            "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
+            "mul.s %[temp13], %[temp1], %[temp2] \t\n"
+            "mul.s %[temp14], %[temp1], %[temp3] \t\n"
+            "lwc1 %[temp4], 0(%[z2]) \t\n"
+            "lwc1 %[temp5], 12(%[z2]) \t\n"
+            "lwc1 %[temp6], 4(%[tsin2]) \t\n"
+            "lwc1 %[temp7], 4(%[tcos2]) \t\n"
+            "nmsub.s %[temp13], %[temp13], %[temp4], %[temp3] \t\n"
+            "madd.s %[temp14], %[temp14], %[temp4], %[temp2] \t\n"
+            "mul.s %[temp15], %[temp5], %[temp6] \t\n"
+            "mul.s %[temp16], %[temp5], %[temp7] \t\n"
+            "lwc1 %[temp8], 8(%[z2]) \t\n"
+            "nmsub.s %[temp15], %[temp15], %[temp8], %[temp7] \t\n"
+            "madd.s %[temp16], %[temp16], %[temp8], %[temp6] \t\n"
+            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
+              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
+              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
+              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
+              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
+              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
+              [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
+              [temp15]"=&f"(temp15), [temp16]"=&f"(temp16)
+            : [z1]"r"(z1), [z2]"r"(z2),
+              [tsin1]"r"(tsin1), [tcos1]"r"(tcos1),
+              [tsin2]"r"(tsin2), [tcos2]"r"(tcos2)
+        );
+
+        z1[1].re = temp9;
+        z1[1].im = temp14;
+        z2[0].re = temp13;
+        z2[0].im = temp10;
+
+        z1[0].re = temp11;
+        z1[0].im = temp16;
+        z2[1].re = temp15;
+        z2[1].im = temp12;
+    }
+}
+
+/**
+ * Compute inverse MDCT of size N = 2^nbits
+ * @param output N samples
+ * @param input N/2 samples
+ *
+ * The middle half of output is produced by ff_imdct_half_mips(); the first
+ * quarter is its negated mirror image and the last quarter its plain mirror
+ * image. The mirroring loop is unrolled four times.
+ *
+ * This function calls ff_imdct_half_mips() and therefore has to stay inside
+ * the same HAVE_INLINE_ASM guard.
+ */
+static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
+{
+    int k;
+    int n = 1 << s->mdct_bits;
+    int n2 = n >> 1;
+    int n4 = n >> 2;
+
+    ff_imdct_half_mips(s, output+n4, input);
+
+    for(k = 0; k < n4; k+=4) {
+        output[k] = -output[n2-k-1];
+        output[k+1] = -output[n2-k-2];
+        output[k+2] = -output[n2-k-3];
+        output[k+3] = -output[n2-k-4];
+
+        output[n-k-1] = output[n2+k];
+        output[n-k-2] = output[n2+k+1];
+        output[n-k-3] = output[n2+k+2];
+        output[n-k-4] = output[n2+k+3];
+    }
+}
+#endif /* HAVE_INLINE_ASM */
+
+/**
+ * Fill the FFT offset table and install the MIPS transforms into s.
+ *
+ * Every function pointer set here refers to code that only exists when
+ * HAVE_INLINE_ASM is set, so the MDCT assignments are nested inside that
+ * guard; without inline asm the context keeps the pointers it already had.
+ *
+ * @param s FFT context to update
+ */
+av_cold void ff_fft_init_mips(FFTContext *s)
+{
+    int n=0;
+
+    ff_fft_lut_init(fft_offsets_lut, 0, 1 << 16, &n);
+
+#if HAVE_INLINE_ASM
+    s->fft_calc = ff_fft_calc_mips;
+#if CONFIG_MDCT
+    s->imdct_calc = ff_imdct_calc_mips;
+    s->imdct_half = ff_imdct_half_mips;
+#endif
+#endif
+}
diff --git a/libavcodec/mips/fft_table.h b/libavcodec/mips/fft_table.h
new file mode 100644
index 0000000000..dd52eaf8c8
--- /dev/null
+++ b/libavcodec/mips/fft_table.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2012
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Stanislav Ocovaj (socovaj@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/**
+ * @file
+ * definitions and LUT table for MIPS FFT
+ */
+#ifndef AVCODEC_MIPS_FFT_TABLE_H
+#define AVCODEC_MIPS_FFT_TABLE_H
+
+#include "libavcodec/fft.h"
+
+#define MAX_LOG2_NFFT 16 //!< Specifies maximum allowed fft size
+#define MAX_FFT_SIZE (1 << MAX_LOG2_NFFT)
+
+/**
+ * Offset lookup table; defined in fft_init_table.c and filled through
+ * ff_fft_lut_init().
+ */
+extern uint16_t fft_offsets_lut[];
+
+/**
+ * Recursively append offsets to a lookup table.
+ *
+ * @param table destination table
+ * @param off   start offset of the range being processed
+ * @param size  length of the range being processed
+ * @param index in/out position of the next free entry in table
+ */
+void ff_fft_lut_init(uint16_t *table, int off, int size, int *index);
+
+#endif /* AVCODEC_MIPS_FFT_TABLE_H */
diff --git a/libavcodec/mips/fmtconvert_mips.c b/libavcodec/mips/fmtconvert_mips.c
new file mode 100644
index 0000000000..f89d3b6d64
--- /dev/null
+++ b/libavcodec/mips/fmtconvert_mips.c
@@ -0,0 +1,338 @@
+/*
+ * Format Conversion Utils for MIPS
+ *
+ * Copyright (c) 2012
+ * MIPS Technologies, Inc., California.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of is
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC.
BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Author: Zoran Lukic (zoranl@mips.com) + * Author: Nedeljko Babic (nbabic@mips.com) + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "config.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/fmtconvert.h"
+
+#if HAVE_MIPSDSPR1
+/**
+ * Convert floats to 16-bit integers, eight samples per loop iteration.
+ *
+ * Each sample is converted with cvt.w.s, then shifted left by 16 with the
+ * saturating shll_s.w and back right by 16 with srl before its low half is
+ * stored, which clips the value to the int16_t range.
+ * Nothing is done when len is 0; the loop ends when src reaches src + len,
+ * so any other len has to be a multiple of 8.
+ *
+ * @param dst destination, len int16_t values
+ * @param src source, len floats
+ * @param len number of samples
+ */
+static void float_to_int16_mips(int16_t *dst, const float *src, long len)
+{
+    const float *src_end = src + len;
+    int ret0, ret1, ret2, ret3, ret4, ret5, ret6, ret7;
+    float src0, src1, src2, src3, src4, src5, src6, src7;
+
+    /*
+     * loop is 8 times unrolled in assembler in order to achieve better performance
+     */
+    __asm__ volatile(
+        "beq %[len], $zero, fti16_end%= \n\t"
+        "fti16_lp%=: \n\t"
+        "lwc1 %[src0], 0(%[src]) \n\t"
+        "lwc1 %[src1], 4(%[src]) \n\t"
+        "lwc1 %[src2], 8(%[src]) \n\t"
+        "lwc1 %[src3], 12(%[src]) \n\t"
+        "cvt.w.s %[src0], %[src0] \n\t"
+        "cvt.w.s %[src1], %[src1] \n\t"
+        "cvt.w.s %[src2], %[src2] \n\t"
+        "cvt.w.s %[src3], %[src3] \n\t"
+        "mfc1 %[ret0], %[src0] \n\t"
+        "mfc1 %[ret1], %[src1] \n\t"
+        "mfc1 %[ret2], %[src2] \n\t"
+        "mfc1 %[ret3], %[src3] \n\t"
+        "lwc1 %[src4], 16(%[src]) \n\t"
+        "lwc1 %[src5], 20(%[src]) \n\t"
+        "lwc1 %[src6], 24(%[src]) \n\t"
+        "lwc1 %[src7], 28(%[src]) \n\t"
+        "cvt.w.s %[src4], %[src4] \n\t"
+        "cvt.w.s %[src5], %[src5] \n\t"
+        "cvt.w.s %[src6], %[src6] \n\t"
+        "cvt.w.s %[src7], %[src7] \n\t"
+        "addiu %[src], 32 \n\t"
+        "shll_s.w %[ret0], %[ret0], 16 \n\t"
+        "shll_s.w %[ret1], %[ret1], 16 \n\t"
+        "shll_s.w %[ret2], %[ret2], 16 \n\t"
+        "shll_s.w %[ret3], %[ret3], 16 \n\t"
+        "srl %[ret0], %[ret0], 16 \n\t"
+        "srl %[ret1], %[ret1], 16 \n\t"
+        "srl %[ret2], %[ret2], 16 \n\t"
+        "srl %[ret3], %[ret3], 16 \n\t"
+        "sh %[ret0], 0(%[dst]) \n\t"
+        "sh %[ret1], 2(%[dst]) \n\t"
+        "sh %[ret2], 4(%[dst]) \n\t"
+        "sh %[ret3], 6(%[dst]) \n\t"
+        "mfc1 %[ret4], %[src4] \n\t"
+        "mfc1 %[ret5], %[src5] \n\t"
+        "mfc1 %[ret6], %[src6] \n\t"
+        "mfc1 %[ret7], %[src7] \n\t"
+        "shll_s.w %[ret4], %[ret4], 16 \n\t"
+        "shll_s.w %[ret5], %[ret5], 16 \n\t"
+        "shll_s.w %[ret6], %[ret6], 16 \n\t"
+        "shll_s.w %[ret7], %[ret7], 16 \n\t"
+        "srl %[ret4], %[ret4], 16 \n\t"
+        "srl %[ret5], %[ret5], 16 \n\t"
+        "srl %[ret6], %[ret6], 16 \n\t"
+        "srl %[ret7], %[ret7], 16 \n\t"
+        "sh %[ret4], 8(%[dst]) \n\t"
+        "sh %[ret5], 10(%[dst]) \n\t"
+        "sh %[ret6], 12(%[dst]) \n\t"
+        "sh %[ret7], 14(%[dst]) \n\t"
+        "addiu %[dst], 16 \n\t"
+        "bne %[src], %[src_end], fti16_lp%= \n\t"
+        "fti16_end%=: \n\t"
+        : [ret0]"=&r"(ret0), [ret1]"=&r"(ret1), [ret2]"=&r"(ret2), [ret3]"=&r"(ret3),
+          [ret4]"=&r"(ret4), [ret5]"=&r"(ret5), [ret6]"=&r"(ret6), [ret7]"=&r"(ret7),
+          [src0]"=&f"(src0), [src1]"=&f"(src1), [src2]"=&f"(src2), [src3]"=&f"(src3),
+          [src4]"=&f"(src4), [src5]"=&f"(src5), [src6]"=&f"(src6), [src7]"=&f"(src7),
+          [src]"+r"(src), [dst]"+r"(dst)
+        : [src_end]"r"(src_end), [len]"r"(len)
+        : "memory"
+    );
+}
+
+/**
+ * Convert planar float channels to interleaved 16-bit integers.
+ *
+ * Samples are converted and clipped as in float_to_int16_mips().
+ * For two channels one sample of each channel is written per iteration.
+ * For any other channel count every channel is processed on its own, eight
+ * samples per iteration, with the stores spaced channels * 2 bytes apart;
+ * that loop ends when the source pointer reaches src[c] + len, so len has
+ * to be a multiple of 8 there.
+ *
+ * @param dst      destination, len * channels int16_t values
+ * @param src      array of channels pointers to len floats each
+ * @param len      samples per channel; nothing is written when it is <= 0
+ * @param channels number of channels
+ */
+static void float_to_int16_interleave_mips(int16_t *dst, const float **src, long len,
+        int channels)
+{
+    int c, ch2 = channels <<1;
+    int ret0, ret1, ret2, ret3, ret4, ret5, ret6, ret7;
+    float src0, src1, src2, src3, src4, src5, src6, src7;
+    int16_t *dst_ptr0, *dst_ptr1, *dst_ptr2, *dst_ptr3;
+    int16_t *dst_ptr4, *dst_ptr5, *dst_ptr6, *dst_ptr7;
+    const float *src_ptr, *src_ptr2, *src_end;
+
+    /*
+     * Both loops below test their end condition only after the first pass,
+     * so an empty input must be rejected up front.
+     */
+    if (len <= 0)
+        return;
+
+    if (channels == 2) {
+        src_ptr = &src[0][0];
+        src_ptr2 = &src[1][0];
+        src_end = src_ptr + len;
+
+        /*
+         * The conversion is done in place in the operand registers; no
+         * fixed FPU register is written behind the compiler's back.
+         */
+        __asm__ volatile (
+            "fti16i2_lp%=: \n\t"
+            "lwc1 %[src0], 0(%[src_ptr]) \n\t"
+            "lwc1 %[src1], 0(%[src_ptr2]) \n\t"
+            "addiu %[src_ptr], 4 \n\t"
+            "cvt.w.s %[src0], %[src0] \n\t"
+            "cvt.w.s %[src1], %[src1] \n\t"
+            "mfc1 %[ret0], %[src0] \n\t"
+            "mfc1 %[ret1], %[src1] \n\t"
+            "shll_s.w %[ret0], %[ret0], 16 \n\t"
+            "shll_s.w %[ret1], %[ret1], 16 \n\t"
+            "addiu %[src_ptr2], 4 \n\t"
+            "srl %[ret0], %[ret0], 16 \n\t"
+            "srl %[ret1], %[ret1], 16 \n\t"
+            "sh %[ret0], 0(%[dst]) \n\t"
+            "sh %[ret1], 2(%[dst]) \n\t"
+            "addiu %[dst], 4 \n\t"
+            "bne %[src_ptr], %[src_end], fti16i2_lp%= \n\t"
+            : [ret0]"=&r"(ret0), [ret1]"=&r"(ret1),
+              [src0]"=&f"(src0), [src1]"=&f"(src1),
+              [src_ptr]"+r"(src_ptr), [src_ptr2]"+r"(src_ptr2),
+              [dst]"+r"(dst)
+            : [src_end]"r"(src_end)
+            : "memory"
+        );
+    } else {
+        for (c = 0; c < channels; c++) {
+            src_ptr = &src[c][0];
+            dst_ptr0 = &dst[c];
+            src_end = src_ptr + len;
+            /*
+             * loop is 8 times unrolled in assembler in order to achieve better performance
+             */
+            __asm__ volatile(
+                "fti16i_lp%=: \n\t"
+                "lwc1 %[src0], 0(%[src_ptr]) \n\t"
+                "lwc1 %[src1], 4(%[src_ptr]) \n\t"
+                "lwc1 %[src2], 8(%[src_ptr]) \n\t"
+                "lwc1 %[src3], 12(%[src_ptr]) \n\t"
+                "cvt.w.s %[src0], %[src0] \n\t"
+                "cvt.w.s %[src1], %[src1] \n\t"
+                "cvt.w.s %[src2], %[src2] \n\t"
+                "cvt.w.s %[src3], %[src3] \n\t"
+                "mfc1 %[ret0], %[src0] \n\t"
+                "mfc1 %[ret1], %[src1] \n\t"
+                "mfc1 %[ret2], %[src2] \n\t"
+                "mfc1 %[ret3], %[src3] \n\t"
+                "lwc1 %[src4], 16(%[src_ptr]) \n\t"
+                "lwc1 %[src5], 20(%[src_ptr]) \n\t"
+                "lwc1 %[src6], 24(%[src_ptr]) \n\t"
+                "lwc1 %[src7], 28(%[src_ptr]) \n\t"
+                "addu %[dst_ptr1], %[dst_ptr0], %[ch2] \n\t"
+                "addu %[dst_ptr2], %[dst_ptr1], %[ch2] \n\t"
+                "addu %[dst_ptr3], %[dst_ptr2], %[ch2] \n\t"
+                "addu %[dst_ptr4], %[dst_ptr3], %[ch2] \n\t"
+                "addu %[dst_ptr5], %[dst_ptr4], %[ch2] \n\t"
+                "addu %[dst_ptr6], %[dst_ptr5], %[ch2] \n\t"
+                "addu %[dst_ptr7], %[dst_ptr6], %[ch2] \n\t"
+                "addiu %[src_ptr], 32 \n\t"
+                "cvt.w.s %[src4], %[src4] \n\t"
+                "cvt.w.s %[src5], %[src5] \n\t"
+                "cvt.w.s %[src6], %[src6] \n\t"
+                "cvt.w.s %[src7], %[src7] \n\t"
+                "shll_s.w %[ret0], %[ret0], 16 \n\t"
+                "shll_s.w %[ret1], %[ret1], 16 \n\t"
+                "shll_s.w %[ret2], %[ret2], 16 \n\t"
+                "shll_s.w %[ret3], %[ret3], 16 \n\t"
+                "srl %[ret0], %[ret0], 16 \n\t"
+                "srl %[ret1], %[ret1], 16 \n\t"
+                "srl %[ret2], %[ret2], 16 \n\t"
+                "srl %[ret3], %[ret3], 16 \n\t"
+                "sh %[ret0], 0(%[dst_ptr0]) \n\t"
+                "sh %[ret1], 0(%[dst_ptr1]) \n\t"
+                "sh %[ret2], 0(%[dst_ptr2]) \n\t"
+                "sh %[ret3], 0(%[dst_ptr3]) \n\t"
+                "mfc1 %[ret4], %[src4] \n\t"
+                "mfc1 %[ret5], %[src5] \n\t"
+                "mfc1 %[ret6], %[src6] \n\t"
+                "mfc1 %[ret7], %[src7] \n\t"
+                "shll_s.w %[ret4], %[ret4], 16 \n\t"
+                "shll_s.w %[ret5], %[ret5], 16 \n\t"
+                "shll_s.w %[ret6], %[ret6], 16 \n\t"
+                "shll_s.w %[ret7], %[ret7], 16 \n\t"
+                "srl %[ret4], %[ret4], 16 \n\t"
+                "srl %[ret5], %[ret5], 16 \n\t"
+                "srl %[ret6], %[ret6], 16 \n\t"
+                "srl %[ret7], %[ret7], 16 \n\t"
+                "sh %[ret4], 0(%[dst_ptr4]) \n\t"
+                "sh %[ret5], 0(%[dst_ptr5]) \n\t"
+                "sh %[ret6], 0(%[dst_ptr6]) \n\t"
+                "sh %[ret7], 0(%[dst_ptr7]) \n\t"
+                "addu %[dst_ptr0], %[dst_ptr7], %[ch2] \n\t"
+                "bne %[src_ptr], %[src_end], fti16i_lp%= \n\t"
+                : [ret0]"=&r"(ret0), [ret1]"=&r"(ret1), [ret2]"=&r"(ret2), [ret3]"=&r"(ret3),
+                  [ret4]"=&r"(ret4), [ret5]"=&r"(ret5), [ret6]"=&r"(ret6), [ret7]"=&r"(ret7),
+                  [src0]"=&f"(src0), [src1]"=&f"(src1), [src2]"=&f"(src2), [src3]"=&f"(src3),
+                  [src4]"=&f"(src4), [src5]"=&f"(src5), [src6]"=&f"(src6), [src7]"=&f"(src7),
+                  [dst_ptr1]"=&r"(dst_ptr1), [dst_ptr2]"=&r"(dst_ptr2), [dst_ptr3]"=&r"(dst_ptr3),
+                  [dst_ptr4]"=&r"(dst_ptr4), [dst_ptr5]"=&r"(dst_ptr5), [dst_ptr6]"=&r"(dst_ptr6),
+                  [dst_ptr7]"=&r"(dst_ptr7), [dst_ptr0]"+r"(dst_ptr0), [src_ptr]"+r"(src_ptr)
+                : [ch2]"r"(ch2), [src_end]"r"(src_end)
+                : "memory"
+            );
+        }
+    }
+}
+#endif /* HAVE_MIPSDSPR1 */
+
+static void int32_to_float_fmul_scalar_mips(float *dst, const int *src,
+        float mul, int len)
+{
+    /*
+     * variables used in inline assembler
+     */
+    float temp1, temp3, temp5, temp7, temp9, temp11, temp13, temp15;
+
+    int rpom1, rpom2, rpom11, rpom21, rpom12, rpom22, rpom13, rpom23;
+    const int *src_end = src + len;
+    /*
+     * loop is 8 times unrolled in assembler in order to achieve better performance
+     */
+    __asm__ volatile (
+        "i32tf_lp%=: \n\t"
+        "lw %[rpom11], 0(%[src]) \n\t"
+        "lw %[rpom21], 4(%[src]) \n\t"
+        "lw %[rpom1], 8(%[src]) \n\t"
+        "lw %[rpom2], 12(%[src]) \n\t"
+        "mtc1 %[rpom11], %[temp1] \n\t"
+        "mtc1 %[rpom21], %[temp3] \n\t"
+        "mtc1 %[rpom1], %[temp5] \n\t"
+        "mtc1 %[rpom2], %[temp7] \n\t"
+
+        "lw %[rpom13], 16(%[src]) \n\t"
+        "lw
%[rpom23], 20(%[src]) \n\t" + "lw %[rpom12], 24(%[src]) \n\t" + "lw %[rpom22], 28(%[src]) \n\t" + "mtc1 %[rpom13], %[temp9] \n\t" + "mtc1 %[rpom23], %[temp11] \n\t" + "mtc1 %[rpom12], %[temp13] \n\t" + "mtc1 %[rpom22], %[temp15] \n\t" + + "addiu %[src], 32 \n\t" + "cvt.s.w %[temp1], %[temp1] \n\t" + "cvt.s.w %[temp3], %[temp3] \n\t" + "cvt.s.w %[temp5], %[temp5] \n\t" + "cvt.s.w %[temp7], %[temp7] \n\t" + + "cvt.s.w %[temp9], %[temp9] \n\t" + "cvt.s.w %[temp11], %[temp11] \n\t" + "cvt.s.w %[temp13], %[temp13] \n\t" + "cvt.s.w %[temp15], %[temp15] \n\t" + + "mul.s %[temp1], %[temp1], %[mul] \n\t" + "mul.s %[temp3], %[temp3], %[mul] \n\t" + "mul.s %[temp5], %[temp5], %[mul] \n\t" + "mul.s %[temp7], %[temp7], %[mul] \n\t" + + "mul.s %[temp9], %[temp9], %[mul] \n\t" + "mul.s %[temp11], %[temp11], %[mul] \n\t" + "mul.s %[temp13], %[temp13], %[mul] \n\t" + "mul.s %[temp15], %[temp15], %[mul] \n\t" + + "swc1 %[temp1], 0(%[dst]) \n\t" /*dst[i] = src[i] * mul; */ + "swc1 %[temp3], 4(%[dst]) \n\t" /*dst[i+1] = src[i+1] * mul;*/ + "swc1 %[temp5], 8(%[dst]) \n\t" /*dst[i+2] = src[i+2] * mul;*/ + "swc1 %[temp7], 12(%[dst]) \n\t" /*dst[i+3] = src[i+3] * mul;*/ + + "swc1 %[temp9], 16(%[dst]) \n\t" /*dst[i+4] = src[i+4] * mul;*/ + "swc1 %[temp11], 20(%[dst]) \n\t" /*dst[i+5] = src[i+5] * mul;*/ + "swc1 %[temp13], 24(%[dst]) \n\t" /*dst[i+6] = src[i+6] * mul;*/ + "swc1 %[temp15], 28(%[dst]) \n\t" /*dst[i+7] = src[i+7] * mul;*/ + "addiu %[dst], 32 \n\t" + "bne %[src], %[src_end], i32tf_lp%= \n\t" + : [temp1]"=&f"(temp1), [temp11]"=&f"(temp11), + [temp13]"=&f"(temp13), [temp15]"=&f"(temp15), + [temp3]"=&f"(temp3), [temp5]"=&f"(temp5), + [temp7]"=&f"(temp7), [temp9]"=&f"(temp9), + [rpom1]"=&r"(rpom1), [rpom2]"=&r"(rpom2), + [rpom11]"=&r"(rpom11), [rpom21]"=&r"(rpom21), + [rpom12]"=&r"(rpom12), [rpom22]"=&r"(rpom22), + [rpom13]"=&r"(rpom13), [rpom23]"=&r"(rpom23), + [dst]"+r"(dst), [src]"+r"(src) + : [mul]"f"(mul), [src_end]"r"(src_end) + : "memory" + ); +} + +av_cold void 
ff_fmt_convert_init_mips(FmtConvertContext *c) +{ +#if HAVE_MIPSDSPR1 + c->float_to_int16_interleave = float_to_int16_interleave_mips; + c->float_to_int16 = float_to_int16_mips; +#endif + c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_mips; +}