ffmpeg/libavcodec/x86/hevc_mc.asm

852 lines
18 KiB
NASM
Raw Normal View History

2015-07-24 18:42:08 +02:00
;*****************************************************************************
;* x86-optimized HEVC MC
;* Copyright 2015 Anton Khirnov
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
2015-07-24 18:42:08 +02:00
pw_1023: times 8 dw 1023
cextern hevc_qpel_coeffs
cextern hevc_qpel_coeffs8
cextern hevc_epel_coeffs
cextern hevc_epel_coeffs8
cextern pw_8
cextern pw_16
cextern pw_32
cextern pw_64
SECTION .text
; %1: width
; %2: bit depth
%macro COMMON_DEFS 2
%assign blocksize 8
%assign nb_blocks ((%1 + blocksize - 1) / blocksize)
%define last_block_truncated (blocksize * nb_blocks > %1)
%if %2 > 8
%define LOAD_BLOCK movu
%define LOAD_HALFBLOCK movq
%assign pixelsize 2
%else
%define LOAD_BLOCK movq
%define LOAD_HALFBLOCK movd
%assign pixelsize 1
%endif
%define STORE_BLOCK mova
%define STORE_HALFBLOCK movq
%endmacro
; %1: block index
%macro BLOCK_DEFS 1
%if last_block_truncated && %1 == nb_blocks - 1
%define block_truncated 1
%define LOAD LOAD_HALFBLOCK
%define STORE STORE_HALFBLOCK
%else
%define block_truncated 0
%define LOAD LOAD_BLOCK
%define STORE STORE_BLOCK
%endif
%endmacro
; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
; pixel *src, ptrdiff_t srcstride,
; int height, int mx, int my, int *mcbuffer)
; %1: block width
; %2: bit depth
; %3: log2 of height unroll
%macro GET_PIXELS 3
cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused
%assign shift 14 - %2
COMMON_DEFS %1, %2
%if pixelsize == 1
pxor m0, m0
%endif
shr heightd, %3
.loop:
%assign i 0
%rep (1 << %3)
%assign j 0
%rep nb_blocks
BLOCK_DEFS j
LOAD m1, [srcq + j * pixelsize * blocksize]
%if pixelsize == 1
punpcklbw m1, m0
%endif
psllw m1, shift
STORE [dstq + j * 2 * blocksize], m1
%assign j (j + 1)
%endrep
add dstq, dststrideq
add srcq, srcstrideq
%assign i (i + 1)
%endrep
dec heightd
jg .loop
RET
%endmacro
INIT_XMM sse2
GET_PIXELS 4, 8, 1
GET_PIXELS 8, 8, 1
GET_PIXELS 12, 8, 3
GET_PIXELS 16, 8, 2
GET_PIXELS 24, 8, 3
GET_PIXELS 32, 8, 3
GET_PIXELS 48, 8, 3
GET_PIXELS 64, 8, 3
GET_PIXELS 4, 10, 1
GET_PIXELS 8, 10, 1
GET_PIXELS 12, 10, 3
GET_PIXELS 16, 10, 2
GET_PIXELS 24, 10, 3
GET_PIXELS 32, 10, 3
GET_PIXELS 48, 10, 3
GET_PIXELS 64, 10, 3
; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
; uint8_t *src, ptrdiff_t srcstride,
; int height, int mx, int my, int *mcbuffer)
; 8-bit qpel interpolation
; %1: block width
; %2: 0 - horizontal; 1 - vertical
%macro QPEL_8 2
%if %2
%define postfix v
%define mvfrac myq
%define coeffsaddr r5q
%define pixstride srcstrideq
%define pixstride3 r5q
%define src_m3 r6q
%else
%define postfix h
%define mvfrac mxq
%define coeffsaddr r6q
%define pixstride 1
%define pixstride3 3
%define src_m3 (srcq - 3)
%endif
COMMON_DEFS %1, 8
cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my
and mvfrac, 0x3
dec mvfrac
shl mvfrac, 4
lea coeffsaddr, [hevc_qpel_coeffs8]
mova m0, [coeffsaddr + mvfrac]
SPLATW m1, m0, 1
SPLATW m2, m0, 2
SPLATW m3, m0, 3
SPLATW m0, m0, 0
%if %2
lea pixstride3, [srcstrideq + 2 * srcstrideq]
mov src_m3, srcq
sub src_m3, pixstride3
%endif
.loop
%assign i 0
%rep nb_blocks
BLOCK_DEFS i
LOAD m4, [src_m3 + i * blocksize]
LOAD m5, [src_m3 + i * blocksize + 1 * pixstride]
punpcklbw m4, m5
pmaddubsw m4, m0
LOAD m5, [src_m3 + i * blocksize + 2 * pixstride]
LOAD m6, [srcq + i * blocksize]
punpcklbw m5, m6
pmaddubsw m5, m1
paddsw m4, m5
LOAD m5, [srcq + i * blocksize + 1 * pixstride]
LOAD m6, [srcq + i * blocksize + 2 * pixstride]
punpcklbw m5, m6
pmaddubsw m5, m2
paddsw m4, m5
LOAD m5, [srcq + i * blocksize + pixstride3]
LOAD m6, [srcq + i * blocksize + 4 * pixstride]
punpcklbw m5, m6
pmaddubsw m5, m3
paddsw m4, m5
STORE [dstq + i * 2 * blocksize], m4
%assign i (i + 1)
%endrep
add dstq, dststrideq
add srcq, srcstrideq
%if %2
add src_m3, srcstrideq
%endif
dec heightd
jg .loop
RET
%endmacro
INIT_XMM ssse3
QPEL_8 4, 0
QPEL_8 8, 0
QPEL_8 12, 0
QPEL_8 16, 0
QPEL_8 24, 0
QPEL_8 32, 0
QPEL_8 48, 0
QPEL_8 64, 0
QPEL_8 4, 1
QPEL_8 8, 1
QPEL_8 12, 1
QPEL_8 16, 1
QPEL_8 24, 1
QPEL_8 32, 1
QPEL_8 48, 1
QPEL_8 64, 1
; 16-bit qpel interpolation
; %1: block width
; %2: shift applied to the result
; %3: 0 - horizontal; 1 - vertical
%macro QPEL_16 3
%if %3
%define mvfrac myq
%define pixstride srcstrideq
%define pixstride3 sstride3q
%define src_m3 srcm3q
%else
%define mvfrac mxq
%define pixstride 2
%define pixstride3 6
%define src_m3 (srcq - 6)
%endif
COMMON_DEFS %1, 16
and mvfrac, 0x3
dec mvfrac
shl mvfrac, 4
lea coeffsregq, [hevc_qpel_coeffs]
mova m0, [coeffsregq + mvfrac]
pshufd m1, m0, 0x55
pshufd m2, m0, 0xaa
pshufd m3, m0, 0xff
pshufd m0, m0, 0x00
%if %3
lea sstride3q, [srcstrideq + 2 * srcstrideq]
mov srcm3q, srcq
sub srcm3q, sstride3q
%endif
.loop
%assign i 0
%rep nb_blocks
BLOCK_DEFS i
LOAD m4, [src_m3 + i * 2 * blocksize]
LOAD m5, [src_m3 + i * 2 * blocksize + 1 * pixstride]
LOAD m6, [src_m3 + i * 2 * blocksize + 2 * pixstride]
LOAD m7, [srcq + i * 2 * blocksize + 0 * pixstride]
LOAD m8, [srcq + i * 2 * blocksize + 1 * pixstride]
LOAD m9, [srcq + i * 2 * blocksize + 2 * pixstride]
LOAD m10, [srcq + i * 2 * blocksize + pixstride3]
LOAD m11, [srcq + i * 2 * blocksize + 4 * pixstride]
punpcklwd m12, m4, m5
pmaddwd m12, m0
punpcklwd m13, m6, m7
pmaddwd m13, m1
paddd m12, m13
punpcklwd m13, m8, m9
pmaddwd m13, m2
paddd m12, m13
punpcklwd m13, m10, m11
pmaddwd m13, m3
paddd m12, m13
psrad m12, %2
%if block_truncated == 0
punpckhwd m4, m5
pmaddwd m4, m0
punpckhwd m6, m7
pmaddwd m6, m1
paddd m4, m6
punpckhwd m8, m9
pmaddwd m8, m2
paddd m4, m8
punpckhwd m10, m11
pmaddwd m10, m3
paddd m4, m10
psrad m4, %2
%endif
packssdw m12, m4
STORE [dstq + i * 2 * blocksize], m12
%assign i (i + 1)
%endrep
add dstq, dststrideq
add srcq, srcstrideq
%if %3
add srcm3q, srcstrideq
%endif
dec heightd
jg .loop
RET
%endmacro
%if ARCH_X86_64
%macro QPEL_H_10 1
cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg
QPEL_16 %1, 2, 0
%endmacro
INIT_XMM avx
QPEL_H_10 4
QPEL_H_10 8
QPEL_H_10 12
QPEL_H_10 16
QPEL_H_10 24
QPEL_H_10 32
QPEL_H_10 48
QPEL_H_10 64
%macro QPEL_V_10 1
cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
QPEL_16 %1, 2, 1
%endmacro
INIT_XMM avx
QPEL_V_10 4
QPEL_V_10 8
QPEL_V_10 12
QPEL_V_10 16
QPEL_V_10 24
QPEL_V_10 32
QPEL_V_10 48
QPEL_V_10 64
; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
; uint8_t *src, ptrdiff_t srcstride,
; int height, int mx, int my, int *mcbuffer)
%macro QPEL_HV 1
cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
QPEL_16 %1, 6, 1
%endmacro
INIT_XMM avx
QPEL_HV 4
QPEL_HV 8
QPEL_HV 12
QPEL_HV 16
QPEL_HV 24
QPEL_HV 32
QPEL_HV 48
QPEL_HV 64
%endif ; ARCH_X86_64
; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
; uint8_t *src, ptrdiff_t srcstride,
; int height, int mx, int my, int *mcbuffer)
; 8-bit epel interpolation
; %1: block width
; %2: 0 - horizontal; 1 - vertical
%macro EPEL_8 2
%if %2
%define postfix v
%define mvfrac myq
%define coeffsaddr r5q
%define pixstride srcstrideq
%define pixstride3 r5q
%else
%define postfix h
%define mvfrac mxq
%define coeffsaddr r6q
%define pixstride 1
%define pixstride3 3
%endif
COMMON_DEFS %1, 8
cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my
and mvfrac, 0x7
dec mvfrac
shl mvfrac, 4
lea coeffsaddr, [hevc_epel_coeffs8]
movq m0, [coeffsaddr + mvfrac]
SPLATW m1, m0, 1
SPLATW m0, m0, 0
%if %2
lea pixstride3, [srcstrideq + 2 * srcstrideq]
%endif
sub srcq, pixstride
.loop
%assign i 0
%rep nb_blocks
BLOCK_DEFS i
LOAD m2, [srcq + i * blocksize + 0 * pixstride]
LOAD m3, [srcq + i * blocksize + 1 * pixstride]
LOAD m4, [srcq + i * blocksize + 2 * pixstride]
LOAD m5, [srcq + i * blocksize + pixstride3]
punpcklbw m2, m3
punpcklbw m4, m5
pmaddubsw m2, m0
pmaddubsw m4, m1
paddsw m2, m4
STORE [dstq + i * 2 * blocksize], m2
%assign i (i + 1)
%endrep
add dstq, dststrideq
add srcq, srcstrideq
dec heightd
jg .loop
RET
%endmacro
INIT_XMM ssse3
EPEL_8 4, 0
EPEL_8 8, 0
EPEL_8 12, 0
EPEL_8 16, 0
EPEL_8 24, 0
EPEL_8 32, 0
EPEL_8 4, 1
EPEL_8 8, 1
EPEL_8 12, 1
EPEL_8 16, 1
EPEL_8 24, 1
EPEL_8 32, 1
%macro EPEL_16 3
%if %3
%define mvfrac myq
%define pixstride srcstrideq
%define pixstride3 sstride3q
%else
%define mvfrac mxq
%define pixstride 2
%define pixstride3 6
%endif
COMMON_DEFS %1, 16
and mvfrac, 0x7
dec mvfrac
shl mvfrac, 5
lea coeffsregq, [hevc_epel_coeffs]
mova m0, [coeffsregq + mvfrac]
pshufd m1, m0, 0x55
pshufd m0, m0, 0x00
%if %3
lea sstride3q, [srcstrideq + 2 * srcstrideq]
%endif
sub srcq, pixstride
.loop
%assign i 0
%rep nb_blocks
BLOCK_DEFS i
LOAD m2, [srcq + i * 2 * blocksize + 0 * pixstride]
LOAD m3, [srcq + i * 2 * blocksize + 1 * pixstride]
LOAD m4, [srcq + i * 2 * blocksize + 2 * pixstride]
LOAD m5, [srcq + i * 2 * blocksize + pixstride3]
punpcklwd m6, m2, m3
punpcklwd m7, m4, m5
pmaddwd m6, m0
pmaddwd m7, m1
paddd m6, m7
psrad m6, %2
%if block_truncated == 0
punpckhwd m2, m3
punpckhwd m4, m5
pmaddwd m2, m0
pmaddwd m4, m1
paddd m2, m4
psrad m2, %2
%endif
packssdw m6, m2
STORE [dstq + i * 2 * blocksize], m6
%assign i (i + 1)
%endrep
add dstq, dststrideq
add srcq, srcstrideq
dec heightd
jg .loop
RET
%endmacro
%if ARCH_X86_64
%macro EPEL_H_10 1
cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
EPEL_16 %1, 2, 0
%endmacro
INIT_XMM avx
EPEL_H_10 4
EPEL_H_10 8
EPEL_H_10 12
EPEL_H_10 16
EPEL_H_10 24
EPEL_H_10 32
%macro EPEL_V_10 1
cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
EPEL_16 %1, 2, 1
%endmacro
INIT_XMM avx
EPEL_V_10 4
EPEL_V_10 8
EPEL_V_10 12
EPEL_V_10 16
EPEL_V_10 24
EPEL_V_10 32
; hevc_epel_hv_<w>_8(int16_t *dst, ptrdiff_t dststride,
; int16_t *src, ptrdiff_t srcstride,
; int height, int mx, int my, int *mcbuffer)
%macro EPEL_HV 1
cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
EPEL_16 %1, 6, 1
%endmacro
INIT_XMM avx
EPEL_HV 4
EPEL_HV 8
EPEL_HV 12
EPEL_HV 16
EPEL_HV 24
EPEL_HV 32
%endif ; ARCH_X86_64
; hevc_put_unweighted_pred_<w>_<d>(pixel *dst, ptrdiff_t dststride,
; int16_t *src, ptrdiff_t srcstride,
; int height)
%macro AVG 5
%if %3
%if %4 == 4
movq %5, %2
paddsw %1, %5
%else
paddsw %1, %2
%endif
%endif
%endmacro
; %1: 0 - one source; 1 - two sources
; %2: width
; %3: bit depth
%macro PUT_PRED 3
%if %1
cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height
%else
cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
%endif
%assign shift 14 + %1 - %3
%assign offset (1 << (shift - 1))
%define offset_data pw_ %+ offset
mova m0, [offset_data]
%if %3 > 8
%define STORE_BLOCK movu
%define STORE_HALF movq
%assign pixel_max ((1 << %3) - 1)
%define pw_pixel_max pw_ %+ pixel_max
pxor m1, m1
mova m2, [pw_pixel_max]
%else
%define STORE_BLOCK movq
%define STORE_HALF movd
%endif
.loop
%assign i 0
%rep (%2 + 7) / 8
%if (i + 1) * 8 > %2
%define LOAD movq
%define STORE STORE_HALF
%else
%define LOAD mova
%define STORE STORE_BLOCK
%endif
LOAD m3, [srcq + 16 * i]
AVG m3, [src2q + 16 * i], %1, %3 - i * 8, m4
paddsw m3, m0
psraw m3, shift
%if %3 == 8
packuswb m3, m3
STORE [dstq + 8 * i], m3
%else
CLIPW m3, m1, m2
STORE [dstq + 16 * i], m3
%endif
%assign i (i + 1)
%endrep
add dstq, dststrideq
add srcq, srcstrideq
%if %1
add src2q, srcstrideq
%endif
dec heightd
jg .loop
RET
%endmacro
INIT_XMM sse2
PUT_PRED 0, 4, 8
PUT_PRED 1, 4, 8
PUT_PRED 0, 8, 8
PUT_PRED 1, 8, 8
PUT_PRED 0, 12, 8
PUT_PRED 1, 12, 8
PUT_PRED 0, 16, 8
PUT_PRED 1, 16, 8
PUT_PRED 0, 24, 8
PUT_PRED 1, 24, 8
PUT_PRED 0, 32, 8
PUT_PRED 1, 32, 8
PUT_PRED 0, 48, 8
PUT_PRED 1, 48, 8
PUT_PRED 0, 64, 8
PUT_PRED 1, 64, 8
PUT_PRED 0, 4, 10
PUT_PRED 1, 4, 10
PUT_PRED 0, 8, 10
PUT_PRED 1, 8, 10
PUT_PRED 0, 12, 10
PUT_PRED 1, 12, 10
PUT_PRED 0, 16, 10
PUT_PRED 1, 16, 10
PUT_PRED 0, 24, 10
PUT_PRED 1, 24, 10
PUT_PRED 0, 32, 10
PUT_PRED 1, 32, 10
PUT_PRED 0, 48, 10
PUT_PRED 1, 48, 10
PUT_PRED 0, 64, 10
PUT_PRED 1, 64, 10
%macro PUT_WEIGHTED_PRED 3
%if %1
cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
%else
cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
%endif
and denomd, 0xff
movsx weight0d, weight0w
movsx offset0d, offset0w
%if %1
movsx weight1d, weight1w
movsx offset1d, offset1w
%endif
add denomd, 14 + %1 - %3
movd m0, denomd
%if %3 > 8
%assign pixel_max ((1 << %3) - 1)
%define pw_pixel_max pw_ %+ pixel_max
pxor m4, m4
mova m5, [pw_pixel_max]
shl offset0d, %3 - 8
%if %1
shl offset1d, %3 - 8
%endif
%endif
%if %1
lea offset0d, [offset0d + offset1d + 1]
%else
lea offset0d, [2 * offset0d + 1]
%endif
movd m1, offset0d
SPLATD m1
pslld m1, m0
psrad m1, 1
movd m2, weight0d
SPLATD m2
%if %1
movd m3, weight1d
SPLATD m3
%endif
.loop
%assign i 0
%rep (%2 + 3) / 4
pmovsxwd m6, [src0q + 8 * i]
pmulld m6, m2
%if %1
pmovsxwd m7, [src1q + 8 * i]
pmulld m7, m3
paddd m6, m7
%endif
paddd m6, m1
psrad m6, m0
packssdw m6, m6
%if %3 > 8
CLIPW m6, m4, m5
movq [dstq + 8 * i], m6
%else
packuswb m6, m6
movd [dstq + 4 * i], m6
%endif
%assign i (i + 1)
%endrep
add dstq, dststrideq
add src0q, srcstrideq
%if %1
add src1q, srcstrideq
%endif
dec heightd
jg .loop
RET
%endmacro
%if ARCH_X86_64
INIT_XMM sse4
PUT_WEIGHTED_PRED 0, 4, 8
PUT_WEIGHTED_PRED 1, 4, 8
PUT_WEIGHTED_PRED 0, 8, 8
PUT_WEIGHTED_PRED 1, 8, 8
PUT_WEIGHTED_PRED 0, 12, 8
PUT_WEIGHTED_PRED 1, 12, 8
PUT_WEIGHTED_PRED 0, 16, 8
PUT_WEIGHTED_PRED 1, 16, 8
PUT_WEIGHTED_PRED 0, 24, 8
PUT_WEIGHTED_PRED 1, 24, 8
PUT_WEIGHTED_PRED 0, 32, 8
PUT_WEIGHTED_PRED 1, 32, 8
PUT_WEIGHTED_PRED 0, 48, 8
PUT_WEIGHTED_PRED 1, 48, 8
PUT_WEIGHTED_PRED 0, 64, 8
PUT_WEIGHTED_PRED 1, 64, 8
PUT_WEIGHTED_PRED 0, 4, 10
PUT_WEIGHTED_PRED 1, 4, 10
PUT_WEIGHTED_PRED 0, 8, 10
PUT_WEIGHTED_PRED 1, 8, 10
PUT_WEIGHTED_PRED 0, 12, 10
PUT_WEIGHTED_PRED 1, 12, 10
PUT_WEIGHTED_PRED 0, 16, 10
PUT_WEIGHTED_PRED 1, 16, 10
PUT_WEIGHTED_PRED 0, 24, 10
PUT_WEIGHTED_PRED 1, 24, 10
PUT_WEIGHTED_PRED 0, 32, 10
PUT_WEIGHTED_PRED 1, 32, 10
PUT_WEIGHTED_PRED 0, 48, 10
PUT_WEIGHTED_PRED 1, 48, 10
PUT_WEIGHTED_PRED 0, 64, 10
PUT_WEIGHTED_PRED 1, 64, 10
%endif ; ARCH_X86_64