; /*
; * SIMD optimized idct functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; * Copyright (c) 2014 James Almer
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"

SECTION .text

; void ff_hevc_idctHxW_dc_{8,10,12}_<opt>(int16_t *coeffs)
;
; DC-only inverse transform: reads coeffs[0], computes
;     dc = (coeffs[0] + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth)
; and overwrites the block with dc replicated into every 16-bit element.
;
; The rounding add and the shift are carried out on the sign-extended 32-bit
; value. Doing them on the 16-bit sub-register wraps around when coeffs[0] is
; close to INT16_MAX (e.g. 32767 + 65 for 8-bit) and flips the sign of the
; result. After the shift (always >= 3 bits here) the value fits in 16 bits
; again, so broadcasting the low word is lossless.
;
; Looped variant: each iteration stores 8 vectors (8 * mmsize bytes), so the
; function writes %2 * 8 * mmsize bytes in total.
; %1 = HxW
; %2 = number of loops
; %3 = bitdepth
%macro IDCT_DC 3
cglobal hevc_idct%1x%1_dc_%3, 1, 2, 1, coeff, tmp
    movsx             tmpd, word [coeffq]           ; sign-extend the DC coefficient
    add               tmpd, (1 << (14 - %3)) + 1    ; rounding offset, 32-bit so it cannot wrap
    sar               tmpd, (15 - %3)
    movd               xm0, tmpd
    SPLATW              m0, xm0                     ; replicate the low word across the vector
    DEFINE_ARGS coeff, cnt                          ; tmp is dead from here on; reuse its register as the counter
    mov               cntd, %2
.loop:
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
    mova [coeffq+mmsize*4], m0
    mova [coeffq+mmsize*5], m0
    mova [coeffq+mmsize*6], m0
    mova [coeffq+mmsize*7], m0
    add             coeffq, mmsize*8
    dec               cntd
    jg .loop
    RET
%endmacro

; Unrolled variant of IDCT_DC for blocks small enough to need no loop:
; 4 vector stores, or 8 when mmsize == 16. Same rounding as IDCT_DC.
; %1 = HxW
; %2 = bitdepth
%macro IDCT_DC_NL 2 ; No loop
cglobal hevc_idct%1x%1_dc_%2, 1, 2, 1, coeff, tmp
    movsx             tmpd, word [coeffq]           ; sign-extend the DC coefficient
    add               tmpd, (1 << (14 - %2)) + 1    ; rounding offset, 32-bit so it cannot wrap
    sar               tmpd, (15 - %2)
    movd               xm0, tmpd
    SPLATW              m0, xm0                     ; replicate the low word across the vector
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
%if mmsize == 16
    mova [coeffq+mmsize*4], m0
    mova [coeffq+mmsize*5], m0
    mova [coeffq+mmsize*6], m0
    mova [coeffq+mmsize*7], m0
%endif
    RET
%endmacro

; 8-bit
INIT_MMX mmxext
IDCT_DC_NL  4,      8
IDCT_DC     8,  2,  8

INIT_XMM sse2
IDCT_DC_NL  8,      8
IDCT_DC    16,  4,  8
IDCT_DC    32, 16,  8

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
IDCT_DC    16,  2,  8
IDCT_DC    32,  8,  8
%endif ;HAVE_AVX2_EXTERNAL

; 10-bit
INIT_MMX mmxext
IDCT_DC_NL  4,     10
IDCT_DC     8,  2, 10

INIT_XMM sse2
IDCT_DC_NL  8,     10
IDCT_DC    16,  4, 10
IDCT_DC    32, 16, 10

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
IDCT_DC    16,  2, 10
IDCT_DC    32,  8, 10
%endif ;HAVE_AVX2_EXTERNAL

; 12-bit
INIT_MMX mmxext
IDCT_DC_NL  4,     12
IDCT_DC     8,  2, 12

INIT_XMM sse2
IDCT_DC_NL  8,     12
IDCT_DC    16,  4, 12
IDCT_DC    32, 16, 12

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
IDCT_DC    16,  2, 12
IDCT_DC    32,  8, 12
%endif ;HAVE_AVX2_EXTERNAL