ffmpeg/libavcodec/x86/fmtconvert.asm

;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
;                                    int len);
;
; Converts len 32-bit integers from src to float, multiplies each of them by
; the scalar mul and stores the results to dst.
;
; Every loop iteration handles 8 elements (32 bytes) and the loop condition is
; only evaluated after an iteration has run; there is no tail handling.
; NOTE(review): len is therefore presumably required to be a positive multiple
; of 8 - confirm against the callers.
; NOTE(review): the results are stored with mova and the SSE2 path feeds
; memory operands to cvtdq2ps, so dst and src presumably have to be 16-byte
; aligned - confirm against the callers.
;
; The macro argument is the XMM register count handed to cglobal: the SSE
; path touches m0-m4, the SSE2 path only m0-m2.
;------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
;; On UNIX64 the float argument does not occupy a general-purpose register,
;; so only dst, src and len are declared and mul is expected in m0 already.
%if UNIX64
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
    ;; Win64 argument slots are positional, so mul (the 3rd argument) arrives
    ;; in xmm2; exchange the m0/m2 names so that m0 refers to it.
    SWAP 0, 2
%elif ARCH_X86_32
    ;; 32-bit: fetch mul from its argument memory slot
    movss   m0, mulm
%endif
    ;; broadcast the multiplier to all four lanes of m0
    SPLATD  m0
    ;; element count -> byte count (4 bytes per element); the 32-bit shift
    ;; also zero-extends the int argument into lenq on 64-bit targets
    shl     lend, 2
    ;; point src and dst past the end of the data and index them with a
    ;; negative byte offset that counts up towards zero
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
.loop:
%if cpuflag(sse2)
    ;; convert 2 x 4 packed int32 to float
    cvtdq2ps  m1, [srcq+lenq   ]
    cvtdq2ps  m2, [srcq+lenq+16]
%else
    ;; cvtpi2ps converts only two int32 at a time into the low half of an
    ;; XMM register; movlhps then joins two low halves into a full vector
    cvtpi2ps  m1, [srcq+lenq   ]
    cvtpi2ps  m3, [srcq+lenq+ 8]
    cvtpi2ps  m2, [srcq+lenq+16]
    cvtpi2ps  m4, [srcq+lenq+24]
    movlhps   m1, m3
    movlhps   m2, m4
%endif
    ;; scale by the broadcast multiplier and store 8 floats
    mulps     m1, m0
    mulps     m2, m0
    mova  [dstq+lenq   ], m1
    mova  [dstq+lenq+16], m2
    ;; advance by 8 elements; the offset stays negative while data remains
    add     lenq, 32
    jl .loop
%if notcpuflag(sse2)
    ;; Intel's documentation states that cvtpi2ps switches to MMX state even
    ;; if the source is a memory location. This is possibly an error in the
    ;; documentation, since every tested CPU disagrees with it. Use emms
    ;; anyway: the vast majority of machines will use the SSE2 variant, so
    ;; the extra instruction here costs next to nothing.
    emms
%endif
    RET
%endmacro

;; Instantiate the SSE variant (5 XMM registers) and the SSE2 variant (3).
INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3

;------------------------------------------------------------------------------
; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,
;                                    const float *mul, int len);
;
; Converts len 32-bit integers from src to float and stores them to dst,
; scaling each group of 8 consecutive elements by its own multiplier: the
; loop reads one float from mul per iteration and advances mul by one float
; for every 8 elements processed.
;
; The c argument is declared in cglobal but never referenced by the code.
;
; The loop condition is only evaluated after an iteration has run and there
; is no tail handling.
; NOTE(review): len is therefore presumably required to be a positive multiple
; of 8 - confirm against the callers.
; NOTE(review): the results are stored with mova and the SSE2 path feeds
; memory operands to cvtdq2ps, so dst and src presumably have to be 16-byte
; aligned - confirm against the callers.
;
; Both variants request 5 XMM registers from cglobal, although the SSE2 path
; only touches m0-m2.
;------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_ARRAY8 0
cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len
    ;; element count -> byte count (4 bytes per element); the 32-bit shift
    ;; also zero-extends the int argument into lenq on 64-bit targets
    shl     lend, 2
    ;; point src and dst past the end of the data and index them with a
    ;; negative byte offset that counts up towards zero
    add     srcq, lenq
    add     dstq, lenq
    neg     lenq
.loop:
    ;; load the multiplier for this group of 8 and broadcast it to all lanes
    movss     m0, [mulq]
    SPLATD    m0
%if cpuflag(sse2)
    ;; convert 2 x 4 packed int32 to float
    cvtdq2ps  m1, [srcq+lenq   ]
    cvtdq2ps  m2, [srcq+lenq+16]
%else
    ;; cvtpi2ps converts only two int32 at a time into the low half of an
    ;; XMM register; movlhps then joins two low halves into a full vector
    cvtpi2ps  m1, [srcq+lenq   ]
    cvtpi2ps  m3, [srcq+lenq+ 8]
    cvtpi2ps  m2, [srcq+lenq+16]
    cvtpi2ps  m4, [srcq+lenq+24]
    movlhps   m1, m3
    movlhps   m2, m4
%endif
    ;; scale by the broadcast multiplier and store 8 floats
    mulps     m1, m0
    mulps     m2, m0
    mova  [dstq+lenq   ], m1
    mova  [dstq+lenq+16], m2
    ;; next multiplier (one float) and next 8 elements; the offset stays
    ;; negative while data remains
    add     mulq, 4
    add     lenq, 32
    jl .loop
%if notcpuflag(sse2)
    ;; Intel's documentation states that cvtpi2ps switches to MMX state even
    ;; if the source is a memory location. This is possibly an error in the
    ;; documentation, since every tested CPU disagrees with it. Use emms
    ;; anyway: the vast majority of machines will use the SSE2 variant, so
    ;; the extra instruction here costs next to nothing.
    emms
%endif
    RET
%endmacro

;; Instantiate the SSE and SSE2 variants.
INIT_XMM sse
INT32_TO_FLOAT_FMUL_ARRAY8
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_ARRAY8
Separate format conversion DSP functions from DSPContext. This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae) 2011-01-30 16:06:46 +01:00			`;******************************************************************************`
			`;* x86 optimized Format Conversion Utils`
			`;* Copyright (c) 2008 Loren Merritt`
			`;*`
			`;* This file is part of FFmpeg.`
			`;*`
			`;* FFmpeg is free software; you can redistribute it and/or`
			`;* modify it under the terms of the GNU Lesser General Public`
			`;* License as published by the Free Software Foundation; either`
			`;* version 2.1 of the License, or (at your option) any later version.`
			`;*`
			`;* FFmpeg is distributed in the hope that it will be useful,`
			`;* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`;* Lesser General Public License for more details.`
			`;*`
			`;* You should have received a copy of the GNU Lesser General Public`
			`;* License along with FFmpeg; if not, write to the Free Software`
Fix FSF address copy paste error in some license headers. 2011-05-14 21:32:31 +02:00			`;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA`
Separate format conversion DSP functions from DSPContext. This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae) 2011-01-30 16:06:46 +01:00			`;******************************************************************************`

Move x264asm to libavutil. Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2011-06-05 16:19:16 +02:00			`%include "libavutil/x86/x86util.asm"`
Separate format conversion DSP functions from DSPContext. This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae) 2011-01-30 16:06:46 +01:00
x86inc: Drop SECTION_TEXT macro The .text section is already 16-byte aligned by default on all supported platforms so `SECTION_TEXT` isn't any different from `SECTION .text`. 2015-05-27 21:38:14 +02:00			`SECTION .text`
Separate format conversion DSP functions from DSPContext. This will be beneficial for use with the audio conversion API without requiring it to depend on all of dsputil. Signed-off-by: Mans Rullgard <mans@mansr.com> (cherry picked from commit c73d99e672329c8f2df290736ffc474c360ac4ae) 2011-01-30 16:06:46 +01:00
x86: Make function prototype comments in assembly code consistent This helps grepping for functions, among other things. 2014-01-28 20:35:58 +01:00			`;------------------------------------------------------------------------------`
			`; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,`
			`; int len);`
			`;------------------------------------------------------------------------------`
x86: fmtconvert: port to cpuflags 2012-07-15 15:42:17 +02:00			`%macro INT32_TO_FLOAT_FMUL_SCALAR 1`
config.asm: change %ifdef directives to %if directives. This allows combining multiple conditionals in a single statement. 2012-01-23 10:45:58 +01:00			`%if UNIX64`
x86: fmtconvert: port to cpuflags 2012-07-15 15:42:17 +02:00			`cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 05:52:03 +02:00			`%else`
x86: fmtconvert: port to cpuflags 2012-07-15 15:42:17 +02:00			`cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len`
fmtconvert: fix int32_to_float_fmul_scalar() for windows x86_64 The calling convention only allows 4 non-stack parameter, with each float or int register being skipped if not used. fixes Bug 64 2011-11-01 21:57:41 +01:00			`%endif`
config.asm: change %ifdef directives to %if directives. This allows combining multiple conditionals in a single statement. 2012-01-23 10:45:58 +01:00			`%if WIN64`
fmtconvert: fix int32_to_float_fmul_scalar() for windows x86_64 The calling convention only allows 4 non-stack parameter, with each float or int register being skipped if not used. fixes Bug 64 2011-11-01 21:57:41 +01:00			`SWAP 0, 2`
config.asm: change %ifdef directives to %if directives. This allows combining multiple conditionals in a single statement. 2012-01-23 10:45:58 +01:00			`%elif ARCH_X86_32`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 05:52:03 +02:00			`movss m0, mulm`
			`%endif`
			`SPLATD m0`
x86: zero extend the 32-bit length in int32_to_float_fmul_scalar implicitly This reverts commit 5dfe4edad63971d669ae456b0bc40ef9364cca80. 2015-12-22 22:45:42 +01:00			`shl lend, 2`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 05:52:03 +02:00			`add srcq, lenq`
			`add dstq, lenq`
			`neg lenq`
			`.loop:`
x86: fmtconvert: port to cpuflags 2012-07-15 15:42:17 +02:00			`%if cpuflag(sse2)`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 05:52:03 +02:00			`cvtdq2ps m1, [srcq+lenq ]`
			`cvtdq2ps m2, [srcq+lenq+16]`
			`%else`
			`cvtpi2ps m1, [srcq+lenq ]`
			`cvtpi2ps m3, [srcq+lenq+ 8]`
			`cvtpi2ps m2, [srcq+lenq+16]`
			`cvtpi2ps m4, [srcq+lenq+24]`
			`movlhps m1, m3`
			`movlhps m2, m4`
			`%endif`
			`mulps m1, m0`
			`mulps m2, m0`
			`mova [dstq+lenq ], m1`
			`mova [dstq+lenq+16], m2`
			`add lenq, 32`
			`jl .loop`
x86: use emms after ff_int32_to_float_fmul_scalar_sse Intel's Instruction Set Reference (as of September 2015) clearly states that cvtpi2ps switches to MMX state. Actual CPUs do not switch if the source is a memory location. The Instruction Set Reference from 1999 (Order Number 243191) describes this behaviour but all later versions I've seen have make no distinction whether MMX registers or memory is used as source. The documentation for the matching SSE2 instruction to convert to double (cvtpi2pd) was fixed (see the valgrind bug https://bugs.kde.org/show_bug.cgi?id=210264). It will take time to get a clarification and fixes in place. In the meantime it makes sense to change ff_int32_to_float_fmul_scalar_sse to be correct according to the documentation. The vast majority of users will have SSE2 so a change to the SSE version has little effect. Fixes fate-checkasm on x86 valgrind targets. Valgrind 'bug' reported as https://bugs.kde.org/show_bug.cgi?id=357059 2015-12-29 12:08:38 +01:00			`%if notcpuflag(sse2)`
			`;; cvtpi2ps switches to MMX even if the source is a memory location`
			`;; possible an error in documentation since every tested CPU disagrees with`
			`;; that. Use emms anyway since the vast majority of machines will use the`
			`;; SSE2 variant`
			`emms`
			`%endif`
			`RET`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 05:52:03 +02:00			`%endmacro`

x86: fmtconvert: port to cpuflags 2012-07-15 15:42:17 +02:00			`INIT_XMM sse`
			`INT32_TO_FLOAT_FMUL_SCALAR 5`
			`INIT_XMM sse2`
			`INT32_TO_FLOAT_FMUL_SCALAR 3`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 05:52:03 +02:00
x86/fmtconvert: add ff_int32_to_float_fmul_array8_{sse,sse2} About two times faster than the c wrapper. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 2014-09-27 01:46:41 +02:00			`;------------------------------------------------------------------------------`
			`; void ff_int32_to_float_fmul_array8(FmtConvertContext *c, float *dst, const int32_t *src,`
			`; const float *mul, int len);`
			`;------------------------------------------------------------------------------`
			`%macro INT32_TO_FLOAT_FMUL_ARRAY8 0`
			`cglobal int32_to_float_fmul_array8, 5, 5, 5, c, dst, src, mul, len`
avcodec/x86/fmtconvert: Fix operand size in ff_int32_to_float_fmul_array8_sse* Fixes acodec-dca2 fate failure Signed-off-by: Michael Niedermayer <michaelni@gmx.at> 2014-09-28 18:56:54 +02:00			`shl lend, 2`
x86/fmtconvert: add ff_int32_to_float_fmul_array8_{sse,sse2} About two times faster than the c wrapper. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 2014-09-27 01:46:41 +02:00			`add srcq, lenq`
			`add dstq, lenq`
			`neg lenq`
			`.loop:`
			`movss m0, [mulq]`
			`SPLATD m0`
			`%if cpuflag(sse2)`
			`cvtdq2ps m1, [srcq+lenq ]`
			`cvtdq2ps m2, [srcq+lenq+16]`
			`%else`
			`cvtpi2ps m1, [srcq+lenq ]`
			`cvtpi2ps m3, [srcq+lenq+ 8]`
			`cvtpi2ps m2, [srcq+lenq+16]`
			`cvtpi2ps m4, [srcq+lenq+24]`
			`movlhps m1, m3`
			`movlhps m2, m4`
			`%endif`
			`mulps m1, m0`
			`mulps m2, m0`
			`mova [dstq+lenq ], m1`
			`mova [dstq+lenq+16], m2`
			`add mulq, 4`
			`add lenq, 32`
			`jl .loop`
avcodec/x86/fmtconvert: Add emms to int32_to_float_fmul_array8_sse() this should fix checkasm on x86_64-archlinux-gcc-valgrind Signed-off-by: Michael Niedermayer <michael@niedermayer.cc> 2016-01-15 17:08:37 +01:00			`%if notcpuflag(sse2)`
			`;; cvtpi2ps switches to MMX even if the source is a memory location`
			`;; possible an error in documentation since every tested CPU disagrees with`
			`;; that. Use emms anyway since the vast majority of machines will use the`
			`;; SSE2 variant`
			`emms`
			`%endif`
			`RET`
x86/fmtconvert: add ff_int32_to_float_fmul_array8_{sse,sse2} About two times faster than the c wrapper. Reviewed-by: Michael Niedermayer <michaelni@gmx.at> Signed-off-by: James Almer <jamrial@gmail.com> 2014-09-27 01:46:41 +02:00			`%endmacro`

			`INIT_XMM sse`
			`INT32_TO_FLOAT_FMUL_ARRAY8`
			`INIT_XMM sse2`
			`INT32_TO_FLOAT_FMUL_ARRAY8`
fmtconvert: port int32_to_float_fmul_scalar() x86 inline asm to yasm 2011-10-10 05:52:03 +02:00