Merge commit '8438b3f09f6b225d0886cc385117c38eb44ca0c1'
* commit '8438b3f09f6b225d0886cc385117c38eb44ca0c1': aarch64: h264 idct NEON assembler optimizations Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
commit
fb1c786a9d
@ -1,5 +1,7 @@
|
||||
OBJS-$(CONFIG_H264CHROMA) += aarch64/h264chroma_init_aarch64.o
|
||||
OBJS-$(CONFIG_H264DSP) += aarch64/h264dsp_init_aarch64.o
|
||||
OBJS-$(CONFIG_RV40_DECODER) += aarch64/rv40dsp_init_aarch64.o
|
||||
OBJS-$(CONFIG_VC1_DECODER) += aarch64/vc1dsp_init_aarch64.o
|
||||
|
||||
NEON-OBJS-$(CONFIG_H264CHROMA) += aarch64/h264cmc_neon.o
|
||||
NEON-OBJS-$(CONFIG_H264DSP) += aarch64/h264idct_neon.o
|
||||
|
62
libavcodec/aarch64/h264dsp_init_aarch64.c
Normal file
62
libavcodec/aarch64/h264dsp_init_aarch64.c
Normal file
@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "libavutil/attributes.h"
|
||||
#include "libavutil/cpu.h"
|
||||
#include "libavutil/aarch64/cpu.h"
|
||||
#include "libavcodec/h264dsp.h"
|
||||
|
||||
void ff_h264_idct_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||
void ff_h264_idct_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
|
||||
int16_t *block, int stride,
|
||||
const uint8_t nnzc[6*8]);
|
||||
void ff_h264_idct_add16intra_neon(uint8_t *dst, const int *block_offset,
|
||||
int16_t *block, int stride,
|
||||
const uint8_t nnzc[6*8]);
|
||||
void ff_h264_idct_add8_neon(uint8_t **dest, const int *block_offset,
|
||||
int16_t *block, int stride,
|
||||
const uint8_t nnzc[6*8]);
|
||||
|
||||
void ff_h264_idct8_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||
void ff_h264_idct8_dc_add_neon(uint8_t *dst, int16_t *block, int stride);
|
||||
void ff_h264_idct8_add4_neon(uint8_t *dst, const int *block_offset,
|
||||
int16_t *block, int stride,
|
||||
const uint8_t nnzc[6*8]);
|
||||
|
||||
av_cold void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
|
||||
const int chroma_format_idc)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags) && bit_depth == 8) {
|
||||
c->h264_idct_add = ff_h264_idct_add_neon;
|
||||
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
|
||||
c->h264_idct_add16 = ff_h264_idct_add16_neon;
|
||||
c->h264_idct_add16intra = ff_h264_idct_add16intra_neon;
|
||||
if (chroma_format_idc <= 1)
|
||||
c->h264_idct_add8 = ff_h264_idct_add8_neon;
|
||||
c->h264_idct8_add = ff_h264_idct8_add_neon;
|
||||
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_neon;
|
||||
c->h264_idct8_add4 = ff_h264_idct8_add4_neon;
|
||||
}
|
||||
}
|
408
libavcodec/aarch64/h264idct_neon.S
Normal file
408
libavcodec/aarch64/h264idct_neon.S
Normal file
@ -0,0 +1,408 @@
|
||||
/*
|
||||
* Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
|
||||
* Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
#include "neon.S"
|
||||
|
||||
function ff_h264_idct_add_neon, export=1
|
||||
ld1 {v0.4H, v1.4H, v2.4H, v3.4H}, [x1]
|
||||
sxtw x2, w2
|
||||
movi v30.8H, #0
|
||||
|
||||
add v4.4H, v0.4H, v2.4H
|
||||
sshr v16.4H, v1.4H, #1
|
||||
st1 {v30.8H}, [x1], #16
|
||||
sshr v17.4H, v3.4H, #1
|
||||
st1 {v30.8H}, [x1], #16
|
||||
sub v5.4H, v0.4H, v2.4H
|
||||
add v6.4H, v1.4H, v17.4H
|
||||
sub v7.4H, v16.4H, v3.4H
|
||||
add v0.4H, v4.4H, v6.4H
|
||||
add v1.4H, v5.4H, v7.4H
|
||||
sub v2.4H, v4.4H, v6.4H
|
||||
sub v3.4H, v5.4H, v7.4H
|
||||
|
||||
transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7
|
||||
|
||||
add v4.4H, v0.4H, v3.4H
|
||||
ld1 {v18.S}[0], [x0], x2
|
||||
sshr v16.4H, v2.4H, #1
|
||||
sshr v17.4H, v1.4H, #1
|
||||
ld1 {v19.S}[1], [x0], x2
|
||||
sub v5.4H, v0.4H, v3.4H
|
||||
ld1 {v18.S}[1], [x0], x2
|
||||
add v6.4H, v16.4H, v1.4H
|
||||
ins v4.D[1], v5.D[0]
|
||||
sub v7.4H, v2.4H, v17.4H
|
||||
ld1 {v19.S}[0], [x0], x2
|
||||
ins v6.D[1], v7.D[0]
|
||||
sub x0, x0, x2, lsl #2
|
||||
add v0.8H, v4.8H, v6.8H
|
||||
sub v1.8H, v4.8H, v6.8H
|
||||
|
||||
srshr v0.8H, v0.8H, #6
|
||||
srshr v1.8H, v1.8H, #6
|
||||
|
||||
uaddw v0.8H, v0.8H, v18.8B
|
||||
uaddw v1.8H, v1.8H, v19.8B
|
||||
|
||||
sqxtun v0.8B, v0.8H
|
||||
sqxtun v1.8B, v1.8H
|
||||
|
||||
st1 {v0.S}[0], [x0], x2
|
||||
st1 {v1.S}[1], [x0], x2
|
||||
st1 {v0.S}[1], [x0], x2
|
||||
st1 {v1.S}[0], [x0], x2
|
||||
|
||||
sub x1, x1, #32
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_h264_idct_dc_add_neon, export=1
|
||||
sxtw x2, w2
|
||||
mov w3, #0
|
||||
ld1r {v2.8H}, [x1]
|
||||
strh w3, [x1]
|
||||
srshr v2.8H, v2.8H, #6
|
||||
ld1 {v0.S}[0], [x0], x2
|
||||
ld1 {v0.S}[1], [x0], x2
|
||||
uaddw v3.8H, v2.8H, v0.8B
|
||||
ld1 {v1.S}[0], [x0], x2
|
||||
ld1 {v1.S}[1], [x0], x2
|
||||
uaddw v4.8H, v2.8H, v1.8B
|
||||
sqxtun v0.8B, v3.8H
|
||||
sqxtun v1.8B, v4.8H
|
||||
sub x0, x0, x2, lsl #2
|
||||
st1 {v0.S}[0], [x0], x2
|
||||
st1 {v0.S}[1], [x0], x2
|
||||
st1 {v1.S}[0], [x0], x2
|
||||
st1 {v1.S}[1], [x0], x2
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_h264_idct_add16_neon, export=1
|
||||
mov x12, x30
|
||||
mov x6, x0 // dest
|
||||
mov x5, x1 // block_offset
|
||||
mov x1, x2 // block
|
||||
mov w9, w3 // stride
|
||||
movrel x7, scan8
|
||||
mov x10, #16
|
||||
movrel x13, ff_h264_idct_dc_add_neon
|
||||
movrel x14, ff_h264_idct_add_neon
|
||||
1: mov w2, w9
|
||||
ldrb w3, [x7], #1
|
||||
ldrsw x0, [x5], #4
|
||||
ldrb w3, [x4, w3, uxtw]
|
||||
subs w3, w3, #1
|
||||
b.lt 2f
|
||||
ldrsh w3, [x1]
|
||||
add x0, x0, x6
|
||||
ccmp w3, #0, #4, eq
|
||||
csel x15, x13, x14, ne
|
||||
blr x15
|
||||
2: subs x10, x10, #1
|
||||
add x1, x1, #32
|
||||
b.ne 1b
|
||||
ret x12
|
||||
endfunc
|
||||
|
||||
function ff_h264_idct_add16intra_neon, export=1
|
||||
mov x12, x30
|
||||
mov x6, x0 // dest
|
||||
mov x5, x1 // block_offset
|
||||
mov x1, x2 // block
|
||||
mov w9, w3 // stride
|
||||
movrel x7, scan8
|
||||
mov x10, #16
|
||||
movrel x13, ff_h264_idct_dc_add_neon
|
||||
movrel x14, ff_h264_idct_add_neon
|
||||
1: mov w2, w9
|
||||
ldrb w3, [x7], #1
|
||||
ldrsw x0, [x5], #4
|
||||
ldrb w3, [x4, w3, uxtw]
|
||||
add x0, x0, x6
|
||||
cmp w3, #0
|
||||
ldrsh w3, [x1]
|
||||
csel x15, x13, x14, eq
|
||||
ccmp w3, #0, #0, eq
|
||||
b.eq 2f
|
||||
blr x15
|
||||
2: subs x10, x10, #1
|
||||
add x1, x1, #32
|
||||
b.ne 1b
|
||||
ret x12
|
||||
endfunc
|
||||
|
||||
function ff_h264_idct_add8_neon, export=1
|
||||
sub sp, sp, #0x40
|
||||
stp x19, x20, [sp]
|
||||
mov x12, x30
|
||||
ldp x6, x15, [x0] // dest[0], dest[1]
|
||||
add x5, x1, #16*4 // block_offset
|
||||
add x9, x2, #16*32 // block
|
||||
mov w19, w3 // stride
|
||||
movrel x13, ff_h264_idct_dc_add_neon
|
||||
movrel x14, ff_h264_idct_add_neon
|
||||
movrel x7, scan8+16
|
||||
mov x10, #0
|
||||
mov x11, #16
|
||||
1: mov w2, w19
|
||||
ldrb w3, [x7, x10] // scan8[i]
|
||||
ldrsw x0, [x5, x10, lsl #2] // block_offset[i]
|
||||
ldrb w3, [x4, w3, uxtw] // nnzc[ scan8[i] ]
|
||||
add x0, x0, x6 // block_offset[i] + dst[j-1]
|
||||
add x1, x9, x10, lsl #5 // block + i * 16
|
||||
cmp w3, #0
|
||||
ldrsh w3, [x1] // block[i*16]
|
||||
csel x20, x13, x14, eq
|
||||
ccmp w3, #0, #0, eq
|
||||
b.eq 2f
|
||||
blr x20
|
||||
2: add x10, x10, #1
|
||||
cmp x10, #4
|
||||
csel x10, x11, x10, eq // mov x10, #16
|
||||
csel x6, x15, x6, eq
|
||||
cmp x10, #20
|
||||
b.lt 1b
|
||||
ldp x19, x20, [sp]
|
||||
add sp, sp, #0x40
|
||||
ret x12
|
||||
endfunc
|
||||
|
||||
.macro idct8x8_cols pass
|
||||
.if \pass == 0
|
||||
va .req v18
|
||||
vb .req v30
|
||||
sshr v18.8H, v26.8H, #1
|
||||
add v16.8H, v24.8H, v28.8H
|
||||
ld1 {v30.8H, v31.8H}, [x1]
|
||||
st1 {v19.8H}, [x1], #16
|
||||
st1 {v19.8H}, [x1], #16
|
||||
sub v17.8H, v24.8H, v28.8H
|
||||
sshr v19.8H, v30.8H, #1
|
||||
sub v18.8H, v18.8H, v30.8H
|
||||
add v19.8H, v19.8H, v26.8H
|
||||
.else
|
||||
va .req v30
|
||||
vb .req v18
|
||||
sshr v30.8H, v26.8H, #1
|
||||
sshr v19.8H, v18.8H, #1
|
||||
add v16.8H, v24.8H, v28.8H
|
||||
sub v17.8H, v24.8H, v28.8H
|
||||
sub v30.8H, v30.8H, v18.8H
|
||||
add v19.8H, v19.8H, v26.8H
|
||||
.endif
|
||||
add v26.8H, v17.8H, va.8H
|
||||
sub v28.8H, v17.8H, va.8H
|
||||
add v24.8H, v16.8H, v19.8H
|
||||
sub vb.8H, v16.8H, v19.8H
|
||||
sub v16.8H, v29.8H, v27.8H
|
||||
add v17.8H, v31.8H, v25.8H
|
||||
sub va.8H, v31.8H, v25.8H
|
||||
add v19.8H, v29.8H, v27.8H
|
||||
sub v16.8H, v16.8H, v31.8H
|
||||
sub v17.8H, v17.8H, v27.8H
|
||||
add va.8H, va.8H, v29.8H
|
||||
add v19.8H, v19.8H, v25.8H
|
||||
sshr v25.8H, v25.8H, #1
|
||||
sshr v27.8H, v27.8H, #1
|
||||
sshr v29.8H, v29.8H, #1
|
||||
sshr v31.8H, v31.8H, #1
|
||||
sub v16.8H, v16.8H, v31.8H
|
||||
sub v17.8H, v17.8H, v27.8H
|
||||
add va.8H, va.8H, v29.8H
|
||||
add v19.8H, v19.8H, v25.8H
|
||||
sshr v25.8H, v16.8H, #2
|
||||
sshr v27.8H, v17.8H, #2
|
||||
sshr v29.8H, va.8H, #2
|
||||
sshr v31.8H, v19.8H, #2
|
||||
sub v19.8H, v19.8H, v25.8H
|
||||
sub va.8H, v27.8H, va.8H
|
||||
add v17.8H, v17.8H, v29.8H
|
||||
add v16.8H, v16.8H, v31.8H
|
||||
.if \pass == 0
|
||||
sub v31.8H, v24.8H, v19.8H
|
||||
add v24.8H, v24.8H, v19.8H
|
||||
add v25.8H, v26.8H, v18.8H
|
||||
sub v18.8H, v26.8H, v18.8H
|
||||
add v26.8H, v28.8H, v17.8H
|
||||
add v27.8H, v30.8H, v16.8H
|
||||
sub v29.8H, v28.8H, v17.8H
|
||||
sub v28.8H, v30.8H, v16.8H
|
||||
.else
|
||||
sub v31.8H, v24.8H, v19.8H
|
||||
add v24.8H, v24.8H, v19.8H
|
||||
add v25.8H, v26.8H, v30.8H
|
||||
sub v30.8H, v26.8H, v30.8H
|
||||
add v26.8H, v28.8H, v17.8H
|
||||
sub v29.8H, v28.8H, v17.8H
|
||||
add v27.8H, v18.8H, v16.8H
|
||||
sub v28.8H, v18.8H, v16.8H
|
||||
.endif
|
||||
.unreq va
|
||||
.unreq vb
|
||||
.endm
|
||||
|
||||
function ff_h264_idct8_add_neon, export=1
|
||||
movi v19.8H, #0
|
||||
ld1 {v24.8H, v25.8H}, [x1]
|
||||
st1 {v19.8H}, [x1], #16
|
||||
st1 {v19.8H}, [x1], #16
|
||||
ld1 {v26.8H, v27.8H}, [x1]
|
||||
st1 {v19.8H}, [x1], #16
|
||||
st1 {v19.8H}, [x1], #16
|
||||
ld1 {v28.8H, v29.8H}, [x1]
|
||||
st1 {v19.8H}, [x1], #16
|
||||
st1 {v19.8H}, [x1], #16
|
||||
|
||||
idct8x8_cols 0
|
||||
transpose_8x8H v24, v25, v26, v27, v28, v29, v18, v31, v6, v7
|
||||
idct8x8_cols 1
|
||||
|
||||
mov x3, x0
|
||||
srshr v24.8H, v24.8H, #6
|
||||
ld1 {v0.8B}, [x0], x2
|
||||
srshr v25.8H, v25.8H, #6
|
||||
ld1 {v1.8B}, [x0], x2
|
||||
srshr v26.8H, v26.8H, #6
|
||||
ld1 {v2.8B}, [x0], x2
|
||||
srshr v27.8H, v27.8H, #6
|
||||
ld1 {v3.8B}, [x0], x2
|
||||
srshr v28.8H, v28.8H, #6
|
||||
ld1 {v4.8B}, [x0], x2
|
||||
srshr v29.8H, v29.8H, #6
|
||||
ld1 {v5.8B}, [x0], x2
|
||||
srshr v30.8H, v30.8H, #6
|
||||
ld1 {v6.8B}, [x0], x2
|
||||
srshr v31.8H, v31.8H, #6
|
||||
ld1 {v7.8B}, [x0], x2
|
||||
uaddw v24.8H, v24.8H, v0.8B
|
||||
uaddw v25.8H, v25.8H, v1.8B
|
||||
uaddw v26.8H, v26.8H, v2.8B
|
||||
sqxtun v0.8B, v24.8H
|
||||
uaddw v27.8H, v27.8H, v3.8B
|
||||
sqxtun v1.8B, v25.8H
|
||||
uaddw v28.8H, v28.8H, v4.8B
|
||||
sqxtun v2.8B, v26.8H
|
||||
st1 {v0.8B}, [x3], x2
|
||||
uaddw v29.8H, v29.8H, v5.8B
|
||||
sqxtun v3.8B, v27.8H
|
||||
st1 {v1.8B}, [x3], x2
|
||||
uaddw v30.8H, v30.8H, v6.8B
|
||||
sqxtun v4.8B, v28.8H
|
||||
st1 {v2.8B}, [x3], x2
|
||||
uaddw v31.8H, v31.8H, v7.8B
|
||||
sqxtun v5.8B, v29.8H
|
||||
st1 {v3.8B}, [x3], x2
|
||||
sqxtun v6.8B, v30.8H
|
||||
sqxtun v7.8B, v31.8H
|
||||
st1 {v4.8B}, [x3], x2
|
||||
st1 {v5.8B}, [x3], x2
|
||||
st1 {v6.8B}, [x3], x2
|
||||
st1 {v7.8B}, [x3], x2
|
||||
|
||||
sub x1, x1, #128
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_h264_idct8_dc_add_neon, export=1
|
||||
mov w3, #0
|
||||
sxtw x2, w2
|
||||
ld1r {v31.8H}, [x1]
|
||||
strh w3, [x1]
|
||||
ld1 {v0.8B}, [x0], x2
|
||||
srshr v31.8H, v31.8H, #6
|
||||
ld1 {v1.8B}, [x0], x2
|
||||
ld1 {v2.8B}, [x0], x2
|
||||
uaddw v24.8H, v31.8H, v0.8B
|
||||
ld1 {v3.8B}, [x0], x2
|
||||
uaddw v25.8H, v31.8H, v1.8B
|
||||
ld1 {v4.8B}, [x0], x2
|
||||
uaddw v26.8H, v31.8H, v2.8B
|
||||
ld1 {v5.8B}, [x0], x2
|
||||
uaddw v27.8H, v31.8H, v3.8B
|
||||
ld1 {v6.8B}, [x0], x2
|
||||
uaddw v28.8H, v31.8H, v4.8B
|
||||
ld1 {v7.8B}, [x0], x2
|
||||
uaddw v29.8H, v31.8H, v5.8B
|
||||
uaddw v30.8H, v31.8H, v6.8B
|
||||
uaddw v31.8H, v31.8H, v7.8B
|
||||
sqxtun v0.8B, v24.8H
|
||||
sqxtun v1.8B, v25.8H
|
||||
sqxtun v2.8B, v26.8H
|
||||
sqxtun v3.8B, v27.8H
|
||||
sub x0, x0, x2, lsl #3
|
||||
st1 {v0.8B}, [x0], x2
|
||||
sqxtun v4.8B, v28.8H
|
||||
st1 {v1.8B}, [x0], x2
|
||||
sqxtun v5.8B, v29.8H
|
||||
st1 {v2.8B}, [x0], x2
|
||||
sqxtun v6.8B, v30.8H
|
||||
st1 {v3.8B}, [x0], x2
|
||||
sqxtun v7.8B, v31.8H
|
||||
st1 {v4.8B}, [x0], x2
|
||||
st1 {v5.8B}, [x0], x2
|
||||
st1 {v6.8B}, [x0], x2
|
||||
st1 {v7.8B}, [x0], x2
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_h264_idct8_add4_neon, export=1
|
||||
mov x12, x30
|
||||
mov x6, x0
|
||||
mov x5, x1
|
||||
mov x1, x2
|
||||
mov w2, w3
|
||||
movrel x7, scan8
|
||||
mov w10, #16
|
||||
movrel x13, ff_h264_idct8_dc_add_neon
|
||||
movrel x14, ff_h264_idct8_add_neon
|
||||
1: ldrb w9, [x7], #4
|
||||
ldrsw x0, [x5], #16
|
||||
ldrb w9, [x4, w9, UXTW]
|
||||
subs w9, w9, #1
|
||||
b.lt 2f
|
||||
ldrsh w11, [x1]
|
||||
add x0, x6, x0
|
||||
ccmp w11, #0, #4, eq
|
||||
csel x15, x13, x14, ne
|
||||
blr x15
|
||||
2: subs w10, w10, #4
|
||||
add x1, x1, #128
|
||||
b.ne 1b
|
||||
ret x12
|
||||
endfunc
|
||||
|
||||
const scan8
|
||||
.byte 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
|
||||
.byte 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
|
||||
.byte 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
|
||||
.byte 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
|
||||
.byte 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
|
||||
.byte 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
|
||||
.byte 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
|
||||
.byte 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
|
||||
.byte 4+11*8, 5+11*8, 4+12*8, 5+12*8
|
||||
.byte 6+11*8, 7+11*8, 6+12*8, 7+12*8
|
||||
.byte 4+13*8, 5+13*8, 4+14*8, 5+14*8
|
||||
.byte 6+13*8, 7+13*8, 6+14*8, 7+14*8
|
||||
endconst
|
61
libavcodec/aarch64/neon.S
Normal file
61
libavcodec/aarch64/neon.S
Normal file
@ -0,0 +1,61 @@
|
||||
/*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
.macro transpose_4x4H r0, r1, r2, r3, r4, r5, r6, r7
|
||||
trn1 \r4\().4H, \r0\().4H, \r1\().4H
|
||||
trn2 \r5\().4H, \r0\().4H, \r1\().4H
|
||||
trn1 \r7\().4H, \r3\().4H, \r2\().4H
|
||||
trn2 \r6\().4H, \r3\().4H, \r2\().4H
|
||||
trn1 \r0\().2S, \r4\().2S, \r7\().2S
|
||||
trn2 \r3\().2S, \r4\().2S, \r7\().2S
|
||||
trn1 \r1\().2S, \r5\().2S, \r6\().2S
|
||||
trn2 \r2\().2S, \r5\().2S, \r6\().2S
|
||||
.endm
|
||||
|
||||
.macro transpose_8x8H r0, r1, r2, r3, r4, r5, r6, r7, r8, r9
|
||||
trn1 \r8\().8H, \r0\().8H, \r1\().8H
|
||||
trn2 \r9\().8H, \r0\().8H, \r1\().8H
|
||||
trn1 \r1\().8H, \r2\().8H, \r3\().8H
|
||||
trn2 \r3\().8H, \r2\().8H, \r3\().8H
|
||||
trn1 \r0\().8H, \r4\().8H, \r5\().8H
|
||||
trn2 \r5\().8H, \r4\().8H, \r5\().8H
|
||||
trn1 \r2\().8H, \r6\().8H, \r7\().8H
|
||||
trn2 \r7\().8H, \r6\().8H, \r7\().8H
|
||||
|
||||
trn1 \r4\().4S, \r0\().4S, \r2\().4S
|
||||
trn2 \r2\().4S, \r0\().4S, \r2\().4S
|
||||
trn1 \r6\().4S, \r5\().4S, \r7\().4S
|
||||
trn2 \r7\().4S, \r5\().4S, \r7\().4S
|
||||
trn1 \r5\().4S, \r9\().4S, \r3\().4S
|
||||
trn2 \r9\().4S, \r9\().4S, \r3\().4S
|
||||
trn1 \r3\().4S, \r8\().4S, \r1\().4S
|
||||
trn2 \r8\().4S, \r8\().4S, \r1\().4S
|
||||
|
||||
trn1 \r0\().2D, \r3\().2D, \r4\().2D
|
||||
trn2 \r4\().2D, \r3\().2D, \r4\().2D
|
||||
|
||||
trn1 \r1\().2D, \r5\().2D, \r6\().2D
|
||||
trn2 \r5\().2D, \r5\().2D, \r6\().2D
|
||||
|
||||
trn2 \r6\().2D, \r8\().2D, \r2\().2D
|
||||
trn1 \r2\().2D, \r8\().2D, \r2\().2D
|
||||
|
||||
trn1 \r3\().2D, \r9\().2D, \r7\().2D
|
||||
trn2 \r7\().2D, \r9\().2D, \r7\().2D
|
||||
|
||||
.endm
|
@ -180,6 +180,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
|
||||
}
|
||||
c->h264_find_start_code_candidate = h264_find_start_code_candidate_c;
|
||||
|
||||
if (ARCH_AARCH64) ff_h264dsp_init_aarch64(c, bit_depth, chroma_format_idc);
|
||||
if (ARCH_ARM) ff_h264dsp_init_arm(c, bit_depth, chroma_format_idc);
|
||||
if (ARCH_PPC) ff_h264dsp_init_ppc(c, bit_depth, chroma_format_idc);
|
||||
if (ARCH_X86) ff_h264dsp_init_x86(c, bit_depth, chroma_format_idc);
|
||||
|
@ -118,6 +118,8 @@ typedef struct H264DSPContext {
|
||||
|
||||
void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
|
||||
const int chroma_format_idc);
|
||||
void ff_h264dsp_init_aarch64(H264DSPContext *c, const int bit_depth,
|
||||
const int chroma_format_idc);
|
||||
void ff_h264dsp_init_arm(H264DSPContext *c, const int bit_depth,
|
||||
const int chroma_format_idc);
|
||||
void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth,
|
||||
|
Loading…
x
Reference in New Issue
Block a user