Better ARM support for mplayer/ffmpeg, ported from atty fork
While playing with some new hardware, I found that it is running a forked mplayer -- and it looks like they're following the GPL. The maintainer's page is here: http://atty.jp/?Zaurus/mplayer Unfortunately, it's mostly in Japanese, so it's hard to figure out any details. Their code looks quite interesting (at least to those of us with ARM CPUs). The patches I've attached are the patches from atty.jp with a couple of modifications of my own: - ported to current CVS - reverted their removal of SNOW support from ffmpeg - cleaned up their bswap mess - removed DOS-style line breaks from various files. Patch by Bernhard Rosenkraenzer (bero, arklinux org). Originally committed as revision 4311 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
c66a443401
commit
6ad1fa5a49
@ -316,8 +316,11 @@ endif
|
||||
|
||||
# armv4l specific stuff
|
||||
ifeq ($(TARGET_ARCH_ARMV4L),yes)
|
||||
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o
|
||||
ASM_OBJS += armv4l/jrevdct_arm.o armv4l/simple_idct_arm.o armv4l/dsputil_arm_s.o
|
||||
OBJS += armv4l/dsputil_arm.o armv4l/mpegvideo_arm.o
|
||||
ifeq ($(TARGET_IWMMXT),yes)
|
||||
OBJS += armv4l/dsputil_iwmmxt.o armv4l/mpegvideo_iwmmxt.o
|
||||
endif
|
||||
endif
|
||||
|
||||
# sun mediaLib specific stuff
|
||||
@ -327,6 +330,12 @@ OBJS += mlib/dsputil_mlib.o
|
||||
CFLAGS += $(MLIB_INC)
|
||||
endif
|
||||
|
||||
# Intel IPP specific stuff
|
||||
# currently only works when libavcodec is used in mplayer
|
||||
ifeq ($(HAVE_IPP),yes)
|
||||
CFLAGS += $(IPP_INC)
|
||||
endif
|
||||
|
||||
# alpha specific stuff
|
||||
ifeq ($(TARGET_ARCH_ALPHA),yes)
|
||||
OBJS += alpha/dsputil_alpha.o alpha/mpegvideo_alpha.o \
|
||||
|
@ -18,6 +18,13 @@
|
||||
*/
|
||||
|
||||
#include "../dsputil.h"
|
||||
#ifdef HAVE_IPP
|
||||
#include "ipp.h"
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_IWMMXT
|
||||
extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
|
||||
#endif
|
||||
|
||||
extern void j_rev_dct_ARM(DCTELEM *data);
|
||||
extern void simple_idct_ARM(DCTELEM *data);
|
||||
@ -26,6 +33,146 @@ extern void simple_idct_ARM(DCTELEM *data);
|
||||
static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
|
||||
static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
|
||||
|
||||
void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
|
||||
void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
|
||||
void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
|
||||
void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
|
||||
|
||||
void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
|
||||
void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
|
||||
void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
|
||||
|
||||
void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
|
||||
/* 16-byte-wide variant built from the 8-byte routine: put_pixels8_x2_arm is
 * run on the column at offset 0 and then on the column at offset 8. */
static void put_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++) {
        put_pixels8_x2_arm(block + 8 * half, pixels + 8 * half, line_size, h);
    }
}
|
||||
|
||||
/* 16-byte-wide variant built from the 8-byte routine: put_pixels8_y2_arm is
 * run on the column at offset 0 and then on the column at offset 8. */
static void put_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++) {
        put_pixels8_y2_arm(block + 8 * half, pixels + 8 * half, line_size, h);
    }
}
|
||||
|
||||
/* 16-byte-wide variant built from the 8-byte routine: put_pixels8_xy2_arm is
 * run on the column at offset 0 and then on the column at offset 8. */
static void put_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++) {
        put_pixels8_xy2_arm(block + 8 * half, pixels + 8 * half, line_size, h);
    }
}
|
||||
|
||||
/* 16-byte-wide variant built from the 8-byte routine:
 * put_no_rnd_pixels8_x2_arm is run on the column at offset 0 and then on the
 * column at offset 8. */
static void put_no_rnd_pixels16_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++) {
        put_no_rnd_pixels8_x2_arm(block + 8 * half, pixels + 8 * half, line_size, h);
    }
}
|
||||
|
||||
/* 16-byte-wide variant built from the 8-byte routine:
 * put_no_rnd_pixels8_y2_arm is run on the column at offset 0 and then on the
 * column at offset 8. */
static void put_no_rnd_pixels16_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++) {
        put_no_rnd_pixels8_y2_arm(block + 8 * half, pixels + 8 * half, line_size, h);
    }
}
|
||||
|
||||
/* 16-byte-wide variant built from the 8-byte routine:
 * put_no_rnd_pixels8_xy2_arm is run on the column at offset 0 and then on the
 * column at offset 8. */
static void put_no_rnd_pixels16_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    int half;

    for (half = 0; half < 2; half++) {
        put_no_rnd_pixels8_xy2_arm(block + 8 * half, pixels + 8 * half, line_size, h);
    }
}
|
||||
|
||||
static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size)
|
||||
{
|
||||
asm volatile (
|
||||
"mov r10, #8 \n\t"
|
||||
|
||||
"1: \n\t"
|
||||
|
||||
/* load dest */
|
||||
"ldr r4, [%1] \n\t"
|
||||
/* block[0] and block[1]*/
|
||||
"ldrsh r5, [%0] \n\t"
|
||||
"ldrsh r7, [%0, #2] \n\t"
|
||||
"and r6, r4, #0xFF \n\t"
|
||||
"and r8, r4, #0xFF00 \n\t"
|
||||
"add r6, r5, r6 \n\t"
|
||||
"add r8, r7, r8, lsr #8 \n\t"
|
||||
"mvn r5, r5 \n\t"
|
||||
"mvn r7, r7 \n\t"
|
||||
"tst r6, #0x100 \n\t"
|
||||
"movne r6, r5, lsr #24 \n\t"
|
||||
"tst r8, #0x100 \n\t"
|
||||
"movne r8, r7, lsr #24 \n\t"
|
||||
"mov r9, r6 \n\t"
|
||||
"ldrsh r5, [%0, #4] \n\t" /* moved form [A] */
|
||||
"orr r9, r9, r8, lsl #8 \n\t"
|
||||
/* block[2] and block[3] */
|
||||
/* [A] */
|
||||
"ldrsh r7, [%0, #6] \n\t"
|
||||
"and r6, r4, #0xFF0000 \n\t"
|
||||
"and r8, r4, #0xFF000000 \n\t"
|
||||
"add r6, r5, r6, lsr #16 \n\t"
|
||||
"add r8, r7, r8, lsr #24 \n\t"
|
||||
"mvn r5, r5 \n\t"
|
||||
"mvn r7, r7 \n\t"
|
||||
"tst r6, #0x100 \n\t"
|
||||
"movne r6, r5, lsr #24 \n\t"
|
||||
"tst r8, #0x100 \n\t"
|
||||
"movne r8, r7, lsr #24 \n\t"
|
||||
"orr r9, r9, r6, lsl #16 \n\t"
|
||||
"ldr r4, [%1, #4] \n\t" /* moved form [B] */
|
||||
"orr r9, r9, r8, lsl #24 \n\t"
|
||||
/* store dest */
|
||||
"ldrsh r5, [%0, #8] \n\t" /* moved form [C] */
|
||||
"str r9, [%1] \n\t"
|
||||
|
||||
/* load dest */
|
||||
/* [B] */
|
||||
/* block[4] and block[5] */
|
||||
/* [C] */
|
||||
"ldrsh r7, [%0, #10] \n\t"
|
||||
"and r6, r4, #0xFF \n\t"
|
||||
"and r8, r4, #0xFF00 \n\t"
|
||||
"add r6, r5, r6 \n\t"
|
||||
"add r8, r7, r8, lsr #8 \n\t"
|
||||
"mvn r5, r5 \n\t"
|
||||
"mvn r7, r7 \n\t"
|
||||
"tst r6, #0x100 \n\t"
|
||||
"movne r6, r5, lsr #24 \n\t"
|
||||
"tst r8, #0x100 \n\t"
|
||||
"movne r8, r7, lsr #24 \n\t"
|
||||
"mov r9, r6 \n\t"
|
||||
"ldrsh r5, [%0, #12] \n\t" /* moved from [D] */
|
||||
"orr r9, r9, r8, lsl #8 \n\t"
|
||||
/* block[6] and block[7] */
|
||||
/* [D] */
|
||||
"ldrsh r7, [%0, #14] \n\t"
|
||||
"and r6, r4, #0xFF0000 \n\t"
|
||||
"and r8, r4, #0xFF000000 \n\t"
|
||||
"add r6, r5, r6, lsr #16 \n\t"
|
||||
"add r8, r7, r8, lsr #24 \n\t"
|
||||
"mvn r5, r5 \n\t"
|
||||
"mvn r7, r7 \n\t"
|
||||
"tst r6, #0x100 \n\t"
|
||||
"movne r6, r5, lsr #24 \n\t"
|
||||
"tst r8, #0x100 \n\t"
|
||||
"movne r8, r7, lsr #24 \n\t"
|
||||
"orr r9, r9, r6, lsl #16 \n\t"
|
||||
"add %0, %0, #16 \n\t" /* moved from [E] */
|
||||
"orr r9, r9, r8, lsl #24 \n\t"
|
||||
"subs r10, r10, #1 \n\t" /* moved from [F] */
|
||||
/* store dest */
|
||||
"str r9, [%1, #4] \n\t"
|
||||
|
||||
/* [E] */
|
||||
/* [F] */
|
||||
"add %1, %1, %2 \n\t"
|
||||
"bne 1b \n\t"
|
||||
:
|
||||
: "r"(block),
|
||||
"r"(dest),
|
||||
"r"(line_size)
|
||||
: "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" );
|
||||
}
|
||||
|
||||
/* XXX: those functions should be suppressed ASAP when all IDCTs are
|
||||
converted */
|
||||
static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
|
||||
@ -48,6 +195,34 @@ static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
|
||||
simple_idct_ARM (block);
|
||||
ff_add_pixels_clamped(block, dest, line_size);
|
||||
}
|
||||
/* Hands the coefficient block to IPP's ippiDCT8x8Inv_Video_16s_C1I.
 * The body is empty when HAVE_IPP is not defined. */
static void simple_idct_ipp(DCTELEM *block)
{
#if defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s_C1I(block);
#endif
}
|
||||
/* Passes block, dest and line_size to IPP's ippiDCT8x8Inv_Video_16s8u_C1R.
 * The body is empty when HAVE_IPP is not defined. */
static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
{
#if defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
#endif
}
|
||||
|
||||
#ifdef HAVE_IWMMXT
|
||||
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
|
||||
#endif
|
||||
|
||||
/* Runs IPP's ippiDCT8x8Inv_Video_16s_C1I on block, then adds the result to
 * dest with the iWMMXt routine when HAVE_IWMMXT is defined and with
 * add_pixels_clamped_ARM otherwise.
 * The body is empty when HAVE_IPP is not defined. */
static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
{
#if defined(HAVE_IPP)
    ippiDCT8x8Inv_Video_16s_C1I(block);
#  if !defined(HAVE_IWMMXT)
    add_pixels_clamped_ARM(block, dest, line_size);
#  else
    add_pixels_clamped_iwmmxt(block, dest, line_size);
#  endif
#endif
}
|
||||
|
||||
void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
|
||||
{
|
||||
@ -56,7 +231,11 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
|
||||
ff_put_pixels_clamped = c->put_pixels_clamped;
|
||||
ff_add_pixels_clamped = c->add_pixels_clamped;
|
||||
|
||||
#ifdef HAVE_IPP
|
||||
if(idct_algo==FF_IDCT_ARM){
|
||||
#else
|
||||
if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_ARM){
|
||||
#endif
|
||||
c->idct_put= j_rev_dct_ARM_put;
|
||||
c->idct_add= j_rev_dct_ARM_add;
|
||||
c->idct = j_rev_dct_ARM;
|
||||
@ -66,5 +245,37 @@ void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
|
||||
c->idct_add= simple_idct_ARM_add;
|
||||
c->idct = simple_idct_ARM;
|
||||
c->idct_permutation_type= FF_NO_IDCT_PERM;
|
||||
#ifdef HAVE_IPP
|
||||
} else if (idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_IPP){
|
||||
#else
|
||||
} else if (idct_algo==FF_IDCT_IPP){
|
||||
#endif
|
||||
c->idct_put= simple_idct_ipp_put;
|
||||
c->idct_add= simple_idct_ipp_add;
|
||||
c->idct = simple_idct_ipp;
|
||||
c->idct_permutation_type= FF_NO_IDCT_PERM;
|
||||
}
|
||||
|
||||
/* c->put_pixels_tab[0][0] = put_pixels16_arm; */ // NG!
|
||||
c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK!
|
||||
c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK!
|
||||
/* c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; /\* NG *\/ */
|
||||
/* c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; // ?(not used) */
|
||||
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK
|
||||
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK
|
||||
/* c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; //NG */
|
||||
c->put_pixels_tab[1][0] = put_pixels8_arm; //OK
|
||||
c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK
|
||||
/* c->put_pixels_tab[1][2] = put_pixels8_y2_arm; //NG */
|
||||
/* c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; //NG */
|
||||
c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK
|
||||
c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK
|
||||
c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
|
||||
/* c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;//NG */
|
||||
|
||||
#if 1
|
||||
#ifdef HAVE_IWMMXT
|
||||
dsputil_init_iwmmxt(c, avctx);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
694
libavcodec/armv4l/dsputil_arm_s.S
Normal file
694
libavcodec/armv4l/dsputil_arm_s.S
Normal file
@ -0,0 +1,694 @@
|
||||
@
|
||||
@ ARMv4L optimized DSP utils
|
||||
@ Copyright (c) 2004 AGAWA Koji <i (AT) atty (DOT) jp>
|
||||
@
|
||||
@ This library is free software; you can redistribute it and/or
|
||||
@ modify it under the terms of the GNU Lesser General Public
|
||||
@ License as published by the Free Software Foundation; either
|
||||
@ version 2 of the License, or (at your option) any later version.
|
||||
@
|
||||
@ This library is distributed in the hope that it will be useful,
|
||||
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
@ Lesser General Public License for more details.
|
||||
@
|
||||
@ You should have received a copy of the GNU Lesser General Public
|
||||
@ License along with this library; if not, write to the Free Software
|
||||
@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
@
|
||||
|
||||
.macro ADJ_ALIGN_QUADWORD_D shift, Rd0, Rd1, Rd2, Rd3, Rn0, Rn1, Rn2, Rn3, Rn4
|
||||
mov \Rd0, \Rn0, lsr #(\shift * 8)
|
||||
mov \Rd1, \Rn1, lsr #(\shift * 8)
|
||||
mov \Rd2, \Rn2, lsr #(\shift * 8)
|
||||
mov \Rd3, \Rn3, lsr #(\shift * 8)
|
||||
orr \Rd0, \Rd0, \Rn1, lsl #(32 - \shift * 8)
|
||||
orr \Rd1, \Rd1, \Rn2, lsl #(32 - \shift * 8)
|
||||
orr \Rd2, \Rd2, \Rn3, lsl #(32 - \shift * 8)
|
||||
orr \Rd3, \Rd3, \Rn4, lsl #(32 - \shift * 8)
|
||||
.endm
|
||||
.macro ADJ_ALIGN_DOUBLEWORD shift, R0, R1, R2
|
||||
mov \R0, \R0, lsr #(\shift * 8)
|
||||
orr \R0, \R0, \R1, lsl #(32 - \shift * 8)
|
||||
mov \R1, \R1, lsr #(\shift * 8)
|
||||
orr \R1, \R1, \R2, lsl #(32 - \shift * 8)
|
||||
.endm
|
||||
.macro ADJ_ALIGN_DOUBLEWORD_D shift, Rdst0, Rdst1, Rsrc0, Rsrc1, Rsrc2
|
||||
mov \Rdst0, \Rsrc0, lsr #(\shift * 8)
|
||||
mov \Rdst1, \Rsrc1, lsr #(\shift * 8)
|
||||
orr \Rdst0, \Rdst0, \Rsrc1, lsl #(32 - (\shift * 8))
|
||||
orr \Rdst1, \Rdst1, \Rsrc2, lsl #(32 - (\shift * 8))
|
||||
.endm
|
||||
|
||||
.macro RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
|
||||
@ Rd = (Rn | Rm) - (((Rn ^ Rm) & ~0x01010101) >> 1)
|
||||
@ Rmask = 0xFEFEFEFE
|
||||
@ Rn = destroy
|
||||
eor \Rd0, \Rn0, \Rm0
|
||||
eor \Rd1, \Rn1, \Rm1
|
||||
orr \Rn0, \Rn0, \Rm0
|
||||
orr \Rn1, \Rn1, \Rm1
|
||||
and \Rd0, \Rd0, \Rmask
|
||||
and \Rd1, \Rd1, \Rmask
|
||||
sub \Rd0, \Rn0, \Rd0, lsr #1
|
||||
sub \Rd1, \Rn1, \Rd1, lsr #1
|
||||
.endm
|
||||
|
||||
.macro NO_RND_AVG32 Rd0, Rd1, Rn0, Rn1, Rm0, Rm1, Rmask
|
||||
@ Rd = (Rn & Rm) + (((Rn ^ Rm) & ~0x01010101) >> 1)
|
||||
@ Rmask = 0xFEFEFEFE
|
||||
@ Rn = destroy
|
||||
eor \Rd0, \Rn0, \Rm0
|
||||
eor \Rd1, \Rn1, \Rm1
|
||||
and \Rn0, \Rn0, \Rm0
|
||||
and \Rn1, \Rn1, \Rm1
|
||||
and \Rd0, \Rd0, \Rmask
|
||||
and \Rd1, \Rd1, \Rmask
|
||||
add \Rd0, \Rn0, \Rd0, lsr #1
|
||||
add \Rd1, \Rn1, \Rd1, lsr #1
|
||||
.endm
|
||||
|
||||
@ ----------------------------------------------------------------
|
||||
.align 8
|
||||
.global put_pixels16_arm
|
||||
put_pixels16_arm:
|
||||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
@ block = word aligned, pixels = unaligned
|
||||
pld [r1]
|
||||
stmfd sp!, {r4-r11, lr} @ R14 is also called LR
|
||||
adr r5, 5f
|
||||
ands r4, r1, #3
|
||||
bic r1, r1, #3
|
||||
add r5, r5, r4, lsl #2
|
||||
ldrne pc, [r5]
|
||||
1:
|
||||
ldmia r1, {r4-r7}
|
||||
add r1, r1, r2
|
||||
stmia r0, {r4-r7}
|
||||
pld [r1]
|
||||
subs r3, r3, #1
|
||||
add r0, r0, r2
|
||||
bne 1b
|
||||
ldmfd sp!, {r4-r11, pc}
|
||||
.align 8
|
||||
2:
|
||||
ldmia r1, {r4-r8}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_QUADWORD_D 1, r9, r10, r11, r12, r4, r5, r6, r7, r8
|
||||
pld [r1]
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r9-r12}
|
||||
add r0, r0, r2
|
||||
bne 2b
|
||||
ldmfd sp!, {r4-r11, pc}
|
||||
.align 8
|
||||
3:
|
||||
ldmia r1, {r4-r8}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_QUADWORD_D 2, r9, r10, r11, r12, r4, r5, r6, r7, r8
|
||||
pld [r1]
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r9-r12}
|
||||
add r0, r0, r2
|
||||
bne 3b
|
||||
ldmfd sp!, {r4-r11, pc}
|
||||
.align 8
|
||||
4:
|
||||
ldmia r1, {r4-r8}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_QUADWORD_D 3, r9, r10, r11, r12, r4, r5, r6, r7, r8
|
||||
pld [r1]
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r9-r12}
|
||||
add r0, r0, r2
|
||||
bne 4b
|
||||
ldmfd sp!, {r4-r11,pc}
|
||||
.align 8
|
||||
5:
|
||||
.word 1b
|
||||
.word 2b
|
||||
.word 3b
|
||||
.word 4b
|
||||
|
||||
@ ----------------------------------------------------------------
|
||||
.align 8
|
||||
.global put_pixels8_arm
|
||||
put_pixels8_arm:
|
||||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
@ block = word aligned, pixels = unaligned
|
||||
pld [r1]
|
||||
stmfd sp!, {r4-r5,lr} @ R14 is also called LR
|
||||
adr r5, 5f
|
||||
ands r4, r1, #3
|
||||
bic r1, r1, #3
|
||||
add r5, r5, r4, lsl #2
|
||||
ldrne pc, [r5]
|
||||
1:
|
||||
ldmia r1, {r4-r5}
|
||||
add r1, r1, r2
|
||||
subs r3, r3, #1
|
||||
pld [r1]
|
||||
stmia r0, {r4-r5}
|
||||
add r0, r0, r2
|
||||
bne 1b
|
||||
ldmfd sp!, {r4-r5,pc}
|
||||
.align 8
|
||||
2:
|
||||
ldmia r1, {r4-r5, r12}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r12
|
||||
pld [r1]
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r4-r5}
|
||||
add r0, r0, r2
|
||||
bne 2b
|
||||
ldmfd sp!, {r4-r5,pc}
|
||||
.align 8
|
||||
3:
|
||||
ldmia r1, {r4-r5, r12}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r12
|
||||
pld [r1]
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r4-r5}
|
||||
add r0, r0, r2
|
||||
bne 3b
|
||||
ldmfd sp!, {r4-r5,pc}
|
||||
.align 8
|
||||
4:
|
||||
ldmia r1, {r4-r5, r12}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r12
|
||||
pld [r1]
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r4-r5}
|
||||
add r0, r0, r2
|
||||
bne 4b
|
||||
ldmfd sp!, {r4-r5,pc}
|
||||
.align 8
|
||||
5:
|
||||
.word 1b
|
||||
.word 2b
|
||||
.word 3b
|
||||
.word 4b
|
||||
|
||||
@ ----------------------------------------------------------------
|
||||
.align 8
|
||||
.global put_pixels8_x2_arm
|
||||
put_pixels8_x2_arm:
|
||||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
@ block = word aligned, pixels = unaligned
|
||||
pld [r1]
|
||||
stmfd sp!, {r4-r10,lr} @ R14 is also called LR
|
||||
adr r5, 5f
|
||||
ands r4, r1, #3
|
||||
ldr r12, [r5]
|
||||
add r5, r5, r4, lsl #2
|
||||
bic r1, r1, #3
|
||||
ldrne pc, [r5]
|
||||
1:
|
||||
ldmia r1, {r4-r5, r10}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
|
||||
pld [r1]
|
||||
RND_AVG32 r8, r9, r4, r5, r6, r7, r12
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r8-r9}
|
||||
add r0, r0, r2
|
||||
bne 1b
|
||||
ldmfd sp!, {r4-r10,pc}
|
||||
.align 8
|
||||
2:
|
||||
ldmia r1, {r4-r5, r10}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
|
||||
ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
|
||||
pld [r1]
|
||||
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r4-r5}
|
||||
add r0, r0, r2
|
||||
bne 2b
|
||||
ldmfd sp!, {r4-r10,pc}
|
||||
.align 8
|
||||
3:
|
||||
ldmia r1, {r4-r5, r10}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
|
||||
ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
|
||||
pld [r1]
|
||||
RND_AVG32 r4, r5, r6, r7, r8, r9, r12
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r4-r5}
|
||||
add r0, r0, r2
|
||||
bne 3b
|
||||
ldmfd sp!, {r4-r10,pc}
|
||||
.align 8
|
||||
4:
|
||||
ldmia r1, {r4-r5, r10}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
|
||||
pld [r1]
|
||||
RND_AVG32 r8, r9, r6, r7, r5, r10, r12
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r8-r9}
|
||||
add r0, r0, r2
|
||||
bne 4b
|
||||
ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
|
||||
.align 8
|
||||
5:
|
||||
.word 0xFEFEFEFE
|
||||
.word 2b
|
||||
.word 3b
|
||||
.word 4b
|
||||
|
||||
.align 8
|
||||
.global put_no_rnd_pixels8_x2_arm
|
||||
put_no_rnd_pixels8_x2_arm:
|
||||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
@ block = word aligned, pixels = unaligned
|
||||
pld [r1]
|
||||
stmfd sp!, {r4-r10,lr} @ R14 is also called LR
|
||||
adr r5, 5f
|
||||
ands r4, r1, #3
|
||||
ldr r12, [r5]
|
||||
add r5, r5, r4, lsl #2
|
||||
bic r1, r1, #3
|
||||
ldrne pc, [r5]
|
||||
1:
|
||||
ldmia r1, {r4-r5, r10}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
|
||||
pld [r1]
|
||||
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r8-r9}
|
||||
add r0, r0, r2
|
||||
bne 1b
|
||||
ldmfd sp!, {r4-r10,pc}
|
||||
.align 8
|
||||
2:
|
||||
ldmia r1, {r4-r5, r10}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_DOUBLEWORD_D 1, r6, r7, r4, r5, r10
|
||||
ADJ_ALIGN_DOUBLEWORD_D 2, r8, r9, r4, r5, r10
|
||||
pld [r1]
|
||||
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r4-r5}
|
||||
add r0, r0, r2
|
||||
bne 2b
|
||||
ldmfd sp!, {r4-r10,pc}
|
||||
.align 8
|
||||
3:
|
||||
ldmia r1, {r4-r5, r10}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r4, r5, r10
|
||||
ADJ_ALIGN_DOUBLEWORD_D 3, r8, r9, r4, r5, r10
|
||||
pld [r1]
|
||||
NO_RND_AVG32 r4, r5, r6, r7, r8, r9, r12
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r4-r5}
|
||||
add r0, r0, r2
|
||||
bne 3b
|
||||
ldmfd sp!, {r4-r10,pc}
|
||||
.align 8
|
||||
4:
|
||||
ldmia r1, {r4-r5, r10}
|
||||
add r1, r1, r2
|
||||
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r4, r5, r10
|
||||
pld [r1]
|
||||
NO_RND_AVG32 r8, r9, r6, r7, r5, r10, r12
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r8-r9}
|
||||
add r0, r0, r2
|
||||
bne 4b
|
||||
ldmfd sp!, {r4-r10,pc} @@ update PC with LR content.
|
||||
.align 8
|
||||
5:
|
||||
.word 0xFEFEFEFE
|
||||
.word 2b
|
||||
.word 3b
|
||||
.word 4b
|
||||
|
||||
|
||||
@ ----------------------------------------------------------------
|
||||
.align 8
|
||||
.global put_pixels8_y2_arm
|
||||
put_pixels8_y2_arm:
|
||||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
@ block = word aligned, pixels = unaligned
|
||||
pld [r1]
|
||||
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
|
||||
adr r5, 5f
|
||||
ands r4, r1, #3
|
||||
mov r3, r3, lsr #1
|
||||
ldr r12, [r5]
|
||||
add r5, r5, r4, lsl #2
|
||||
bic r1, r1, #3
|
||||
ldrne pc, [r5]
|
||||
1:
|
||||
ldmia r1, {r4-r5}
|
||||
add r1, r1, r2
|
||||
6: ldmia r1, {r6-r7}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
RND_AVG32 r8, r9, r4, r5, r6, r7, r12
|
||||
ldmia r1, {r4-r5}
|
||||
add r1, r1, r2
|
||||
stmia r0, {r8-r9}
|
||||
add r0, r0, r2
|
||||
pld [r1]
|
||||
RND_AVG32 r8, r9, r6, r7, r4, r5, r12
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r8-r9}
|
||||
add r0, r0, r2
|
||||
bne 6b
|
||||
ldmfd sp!, {r4-r11,pc}
|
||||
.align 8
|
||||
2:
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
|
||||
6: ldmia r1, {r7-r9}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
|
||||
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
|
||||
subs r3, r3, #1
|
||||
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
bne 6b
|
||||
ldmfd sp!, {r4-r11,pc}
|
||||
.align 8
|
||||
3:
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
|
||||
6: ldmia r1, {r7-r9}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
|
||||
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
|
||||
subs r3, r3, #1
|
||||
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
bne 6b
|
||||
ldmfd sp!, {r4-r11,pc}
|
||||
.align 8
|
||||
4:
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
|
||||
6: ldmia r1, {r7-r9}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
|
||||
RND_AVG32 r10, r11, r4, r5, r7, r8, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
|
||||
subs r3, r3, #1
|
||||
RND_AVG32 r10, r11, r7, r8, r4, r5, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
bne 6b
|
||||
ldmfd sp!, {r4-r11,pc}
|
||||
|
||||
.align 8
|
||||
5:
|
||||
.word 0xFEFEFEFE
|
||||
.word 2b
|
||||
.word 3b
|
||||
.word 4b
|
||||
|
||||
.align 8
|
||||
.global put_no_rnd_pixels8_y2_arm
|
||||
put_no_rnd_pixels8_y2_arm:
|
||||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
@ block = word aligned, pixels = unaligned
|
||||
pld [r1]
|
||||
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
|
||||
adr r5, 5f
|
||||
ands r4, r1, #3
|
||||
mov r3, r3, lsr #1
|
||||
ldr r12, [r5]
|
||||
add r5, r5, r4, lsl #2
|
||||
bic r1, r1, #3
|
||||
ldrne pc, [r5]
|
||||
1:
|
||||
ldmia r1, {r4-r5}
|
||||
add r1, r1, r2
|
||||
6: ldmia r1, {r6-r7}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
NO_RND_AVG32 r8, r9, r4, r5, r6, r7, r12
|
||||
ldmia r1, {r4-r5}
|
||||
add r1, r1, r2
|
||||
stmia r0, {r8-r9}
|
||||
add r0, r0, r2
|
||||
pld [r1]
|
||||
NO_RND_AVG32 r8, r9, r6, r7, r4, r5, r12
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r8-r9}
|
||||
add r0, r0, r2
|
||||
bne 6b
|
||||
ldmfd sp!, {r4-r11,pc}
|
||||
.align 8
|
||||
2:
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
|
||||
6: ldmia r1, {r7-r9}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 1, r7, r8, r9
|
||||
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 1, r4, r5, r6
|
||||
subs r3, r3, #1
|
||||
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
bne 6b
|
||||
ldmfd sp!, {r4-r11,pc}
|
||||
.align 8
|
||||
3:
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
|
||||
6: ldmia r1, {r7-r9}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 2, r7, r8, r9
|
||||
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 2, r4, r5, r6
|
||||
subs r3, r3, #1
|
||||
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
bne 6b
|
||||
ldmfd sp!, {r4-r11,pc}
|
||||
.align 8
|
||||
4:
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
|
||||
6: ldmia r1, {r7-r9}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 3, r7, r8, r9
|
||||
NO_RND_AVG32 r10, r11, r4, r5, r7, r8, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
ldmia r1, {r4-r6}
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
ADJ_ALIGN_DOUBLEWORD 3, r4, r5, r6
|
||||
subs r3, r3, #1
|
||||
NO_RND_AVG32 r10, r11, r7, r8, r4, r5, r12
|
||||
stmia r0, {r10-r11}
|
||||
add r0, r0, r2
|
||||
bne 6b
|
||||
ldmfd sp!, {r4-r11,pc}
|
||||
.align 8
|
||||
5:
|
||||
.word 0xFEFEFEFE
|
||||
.word 2b
|
||||
.word 3b
|
||||
.word 4b
|
||||
|
||||
@ ----------------------------------------------------------------
|
||||
.macro RND_XY2_IT align, rnd
|
||||
@ l1= (a & 0x03030303) + (b & 0x03030303) ?(+ 0x02020202)
|
||||
@ h1= ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2)
|
||||
.if \align == 0
|
||||
ldmia r1, {r6-r8}
|
||||
.elseif \align == 3
|
||||
ldmia r1, {r5-r7}
|
||||
.else
|
||||
ldmia r1, {r8-r10}
|
||||
.endif
|
||||
add r1, r1, r2
|
||||
pld [r1]
|
||||
.if \align == 0
|
||||
ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r6, r7, r8
|
||||
.elseif \align == 1
|
||||
ADJ_ALIGN_DOUBLEWORD_D 1, r4, r5, r8, r9, r10
|
||||
ADJ_ALIGN_DOUBLEWORD_D 2, r6, r7, r8, r9, r10
|
||||
.elseif \align == 2
|
||||
ADJ_ALIGN_DOUBLEWORD_D 2, r4, r5, r8, r9, r10
|
||||
ADJ_ALIGN_DOUBLEWORD_D 3, r6, r7, r8, r9, r10
|
||||
.elseif \align == 3
|
||||
ADJ_ALIGN_DOUBLEWORD_D 3, r4, r5, r5, r6, r7
|
||||
.endif
|
||||
ldr r14, [r12, #0] @ 0x03030303
|
||||
tst r3, #1
|
||||
and r8, r4, r14
|
||||
and r9, r5, r14
|
||||
and r10, r6, r14
|
||||
and r11, r7, r14
|
||||
.if \rnd == 1
|
||||
ldreq r14, [r12, #16] @ 0x02020202
|
||||
.else
|
||||
ldreq r14, [r12, #28] @ 0x01010101
|
||||
.endif
|
||||
add r8, r8, r10
|
||||
add r9, r9, r11
|
||||
addeq r8, r8, r14
|
||||
addeq r9, r9, r14
|
||||
ldr r14, [r12, #20] @ 0xFCFCFCFC >> 2
|
||||
and r4, r14, r4, lsr #2
|
||||
and r5, r14, r5, lsr #2
|
||||
and r6, r14, r6, lsr #2
|
||||
and r7, r14, r7, lsr #2
|
||||
add r10, r4, r6
|
||||
add r11, r5, r7
|
||||
.endm
|
||||
|
||||
.macro RND_XY2_EXPAND align, rnd
|
||||
RND_XY2_IT \align, \rnd
|
||||
6: stmfd sp!, {r8-r11}
|
||||
RND_XY2_IT \align, \rnd
|
||||
ldmfd sp!, {r4-r7}
|
||||
add r4, r4, r8
|
||||
add r5, r5, r9
|
||||
add r6, r6, r10
|
||||
add r7, r7, r11
|
||||
ldr r14, [r12, #24] @ 0x0F0F0F0F
|
||||
and r4, r14, r4, lsr #2
|
||||
and r5, r14, r5, lsr #2
|
||||
add r4, r4, r6
|
||||
add r5, r5, r7
|
||||
subs r3, r3, #1
|
||||
stmia r0, {r4-r5}
|
||||
add r0, r0, r2
|
||||
bne 6b
|
||||
ldmfd sp!, {r4-r11,pc}
|
||||
.endm
|
||||
|
||||
.align 8
|
||||
.global put_pixels8_xy2_arm
|
||||
put_pixels8_xy2_arm:
|
||||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
@ block = word aligned, pixels = unaligned
|
||||
pld [r1]
|
||||
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
|
||||
adrl r12, 5f
|
||||
ands r4, r1, #3
|
||||
add r5, r12, r4, lsl #2
|
||||
bic r1, r1, #3
|
||||
ldrne pc, [r5]
|
||||
1:
|
||||
RND_XY2_EXPAND 0, 1
|
||||
|
||||
.align 8
|
||||
2:
|
||||
RND_XY2_EXPAND 1, 1
|
||||
|
||||
.align 8
|
||||
3:
|
||||
RND_XY2_EXPAND 2, 1
|
||||
|
||||
.align 8
|
||||
4:
|
||||
RND_XY2_EXPAND 3, 1
|
||||
|
||||
5:
|
||||
.word 0x03030303
|
||||
.word 2b
|
||||
.word 3b
|
||||
.word 4b
|
||||
.word 0x02020202
|
||||
.word 0xFCFCFCFC >> 2
|
||||
.word 0x0F0F0F0F
|
||||
.word 0x01010101
|
||||
|
||||
.align 8
|
||||
.global put_no_rnd_pixels8_xy2_arm
|
||||
put_no_rnd_pixels8_xy2_arm:
|
||||
@ void func(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
||||
@ block = word aligned, pixels = unaligned
|
||||
pld [r1]
|
||||
stmfd sp!, {r4-r11,lr} @ R14 is also called LR
|
||||
adrl r12, 5f
|
||||
ands r4, r1, #3
|
||||
add r5, r12, r4, lsl #2
|
||||
bic r1, r1, #3
|
||||
ldrne pc, [r5]
|
||||
1:
|
||||
RND_XY2_EXPAND 0, 0
|
||||
|
||||
.align 8
|
||||
2:
|
||||
RND_XY2_EXPAND 1, 0
|
||||
|
||||
.align 8
|
||||
3:
|
||||
RND_XY2_EXPAND 2, 0
|
||||
|
||||
.align 8
|
||||
4:
|
||||
RND_XY2_EXPAND 3, 0
|
||||
|
||||
5:
|
||||
.word 0x03030303
|
||||
.word 2b
|
||||
.word 3b
|
||||
.word 4b
|
||||
.word 0x02020202
|
||||
.word 0xFCFCFCFC >> 2
|
||||
.word 0x0F0F0F0F
|
||||
.word 0x01010101
|
168
libavcodec/armv4l/dsputil_iwmmxt.c
Normal file
168
libavcodec/armv4l/dsputil_iwmmxt.c
Normal file
@ -0,0 +1,168 @@
|
||||
/*
|
||||
* iWMMXt optimized DSP utils
|
||||
* Copyright (c) 2004 AGAWA Koji
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with this library; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*/
|
||||
|
||||
#include "../dsputil.h"
|
||||
|
||||
/*
 * dsputil_iwmmxt_rnd.h is included twice as a template. Before each pass
 * three macros are set up and afterwards all three are removed again:
 *   DEF(x, y)      builds the function name by token pasting
 *   SET_RND(regd)  moves a constant into r12 and issues tbcsth on regd
 *   WAVG2B         mnemonic string of the averaging instruction to use
 */

/* Pass 1: names of the form x_no_rnd_y_iwmmxt, constant 1, "wavg2b". */
#define DEF(x, y) x ## _no_rnd_ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #1 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2b"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
#undef WAVG2B

/* Pass 2: names of the form x_y_iwmmxt, constant 2, "wavg2br". */
#define DEF(x, y) x ## _ ## y ##_iwmmxt
#define SET_RND(regd) __asm__ __volatile__ ("mov r12, #2 \n\t tbcsth " #regd ", r12":::"r12");
#define WAVG2B "wavg2br"
#include "dsputil_iwmmxt_rnd.h"
#undef DEF
#undef SET_RND
/* The macro defined above is WAVG2B (its value is "wavg2br"); undefine that
 * name so it does not leak into the rest of the file. */
#undef WAVG2B
|
||||
|
||||
// need scheduling
|
||||
#define OP(AVG) \
|
||||
asm volatile ( \
|
||||
/* alignment */ \
|
||||
"and r12, %[pixels], #7 \n\t" \
|
||||
"bic %[pixels], %[pixels], #7 \n\t" \
|
||||
"tmcr wcgr1, r12 \n\t" \
|
||||
\
|
||||
"wldrd wr0, [%[pixels]] \n\t" \
|
||||
"wldrd wr1, [%[pixels], #8] \n\t" \
|
||||
"add %[pixels], %[pixels], %[line_size] \n\t" \
|
||||
"walignr1 wr4, wr0, wr1 \n\t" \
|
||||
\
|
||||
"1: \n\t" \
|
||||
\
|
||||
"wldrd wr2, [%[pixels]] \n\t" \
|
||||
"wldrd wr3, [%[pixels], #8] \n\t" \
|
||||
"add %[pixels], %[pixels], %[line_size] \n\t" \
|
||||
"pld [%[pixels]] \n\t" \
|
||||
"walignr1 wr5, wr2, wr3 \n\t" \
|
||||
AVG " wr6, wr4, wr5 \n\t" \
|
||||
"wstrd wr6, [%[block]] \n\t" \
|
||||
"add %[block], %[block], %[line_size] \n\t" \
|
||||
\
|
||||
"wldrd wr0, [%[pixels]] \n\t" \
|
||||
"wldrd wr1, [%[pixels], #8] \n\t" \
|
||||
"add %[pixels], %[pixels], %[line_size] \n\t" \
|
||||
"walignr1 wr4, wr0, wr1 \n\t" \
|
||||
"pld [%[pixels]] \n\t" \
|
||||
AVG " wr6, wr4, wr5 \n\t" \
|
||||
"wstrd wr6, [%[block]] \n\t" \
|
||||
"add %[block], %[block], %[line_size] \n\t" \
|
||||
\
|
||||
"subs %[h], %[h], #2 \n\t" \
|
||||
"bne 1b \n\t" \
|
||||
: [block]"+r"(block), [pixels]"+r"(pixels), [h]"+r"(h) \
|
||||
: [line_size]"r"(line_size) \
|
||||
: "memory", "r12");
|
||||
void put_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
|
||||
{
|
||||
OP("wavg2br");
|
||||
}
|
||||
void put_no_rnd_pixels8_y2_iwmmxt(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
|
||||
{
|
||||
OP("wavg2b");
|
||||
}
|
||||
#undef OP
|
||||
|
||||
/**
 * Add a block of 16-bit coefficients to an 8-pixel-wide, 8-row pixel area,
 * saturating the sums back to bytes, and store the result in place.
 *
 * Two pixel rows are processed per loop pass (four passes in total); each
 * pass consumes 32 bytes of coefficients.
 *
 * @param block     coefficients, read sequentially with 64-bit loads
 * @param pixels    top-left of the pixel area, updated in place
 * @param line_size distance in bytes between pixel rows
 */
void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    /* Both row pointers step over two rows per pass. */
    const int two_rows = line_size << 1;
    /* Second row of each pair; pixels itself tracks the first. */
    uint8_t *lower_row = pixels + line_size;

    __asm__ __volatile__ (
        "mov r12, #4 \n\t"                              /* pass counter */
        "1: \n\t"
        "pld [%[pixels], %[line_size2]] \n\t"           /* prefetch the next row pair */
        "pld [%[pixels2], %[line_size2]] \n\t"
        "wldrd wr4, [%[pixels]] \n\t"                   /* 8 pixels, upper row */
        "wldrd wr5, [%[pixels2]] \n\t"                  /* 8 pixels, lower row */
        "pld [%[block], #32] \n\t"
        "wunpckelub wr6, wr4 \n\t"                      /* widen bytes to halfwords ... */
        "wldrd wr0, [%[block]] \n\t"                    /* ... interleaved with coefficient loads */
        "wunpckehub wr7, wr4 \n\t"
        "wldrd wr1, [%[block], #8] \n\t"
        "wunpckelub wr8, wr5 \n\t"
        "wldrd wr2, [%[block], #16] \n\t"
        "wunpckehub wr9, wr5 \n\t"
        "wldrd wr3, [%[block], #24] \n\t"
        "add %[block], %[block], #32 \n\t"
        "waddhss wr10, wr0, wr6 \n\t"                   /* coefficient + pixel, per halfword */
        "waddhss wr11, wr1, wr7 \n\t"
        "waddhss wr12, wr2, wr8 \n\t"
        "waddhss wr13, wr3, wr9 \n\t"
        "wpackhus wr14, wr10, wr11 \n\t"                /* pack back to bytes */
        "wpackhus wr15, wr12, wr13 \n\t"
        "wstrd wr14, [%[pixels]] \n\t"
        "add %[pixels], %[pixels], %[line_size2] \n\t"
        "subs r12, r12, #1 \n\t"
        "wstrd wr15, [%[pixels2]] \n\t"
        "add %[pixels2], %[pixels2], %[line_size2] \n\t"
        "bne 1b \n\t"
        : [block]"+r"(block), [pixels]"+r"(pixels), [pixels2]"+r"(lower_row)
        : [line_size2]"r"(two_rows)
        : "cc", "memory", "r12");
}
|
||||
|
||||
/*
 * Do-nothing function taking (block, pixels, line_size, h); every argument
 * is ignored and nothing is read or written.
 */
static void nop(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    (void)block;
    (void)pixels;
    (void)line_size;
    (void)h;
}
|
||||
|
||||
/**
 * Point the pixel-operation entries of a DSPContext at the iWMMXt routines.
 *
 * Going by the names of the functions installed, the first table index
 * selects the block width (0: 16 pixels, 1: 8 pixels) and the second the
 * variant (0: plain, 1: x2, 2: y2, 3: xy2).
 *
 * @param c     context whose function pointers are overwritten
 * @param avctx not used by this function
 */
void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx)
{
    c->add_pixels_clamped = add_pixels_clamped_iwmmxt;

    /* put */
    c->put_pixels_tab[0][0] = put_pixels16_iwmmxt;
    c->put_pixels_tab[0][1] = put_pixels16_x2_iwmmxt;
    c->put_pixels_tab[0][2] = put_pixels16_y2_iwmmxt;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_iwmmxt;
    c->put_pixels_tab[1][0] = put_pixels8_iwmmxt;
    c->put_pixels_tab[1][1] = put_pixels8_x2_iwmmxt;
    c->put_pixels_tab[1][2] = put_pixels8_y2_iwmmxt;
    c->put_pixels_tab[1][3] = put_pixels8_xy2_iwmmxt;

    /* put, no_rnd: the plain entries reuse the ordinary put functions */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_iwmmxt;
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_iwmmxt;
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_iwmmxt;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_iwmmxt;
    c->put_no_rnd_pixels_tab[1][0] = put_pixels8_iwmmxt;
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_iwmmxt;
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_iwmmxt;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_iwmmxt;

    /* avg */
    c->avg_pixels_tab[0][0] = avg_pixels16_iwmmxt;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_iwmmxt;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_iwmmxt;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_iwmmxt;
    c->avg_pixels_tab[1][0] = avg_pixels8_iwmmxt;
    c->avg_pixels_tab[1][1] = avg_pixels8_x2_iwmmxt;
    c->avg_pixels_tab[1][2] = avg_pixels8_y2_iwmmxt;
    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_iwmmxt;

    /*
     * avg, no_rnd
     * NOTE(review): [0][0] installs avg_pixels16_iwmmxt while [1][0] installs
     * avg_no_rnd_pixels8_iwmmxt -- confirm whether the 16-pixel entry is
     * meant to be the no_rnd variant as well.
     */
    c->avg_no_rnd_pixels_tab[0][0] = avg_pixels16_iwmmxt;
    c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_iwmmxt;
    c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_iwmmxt;
    c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_iwmmxt;
    c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_iwmmxt;
    c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_iwmmxt;
    c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_iwmmxt;
    c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_iwmmxt;
}
|
1093
libavcodec/armv4l/dsputil_iwmmxt_rnd.h
Normal file
1093
libavcodec/armv4l/dsputil_iwmmxt_rnd.h
Normal file
File diff suppressed because it is too large
Load Diff
@ -21,6 +21,13 @@
|
||||
#include "../mpegvideo.h"
|
||||
#include "../avcodec.h"
|
||||
|
||||
#ifdef HAVE_IWMMXT
|
||||
extern void MPV_common_init_iwmmxt(MpegEncContext *s);
|
||||
#endif
|
||||
|
||||
void MPV_common_init_armv4l(MpegEncContext *s)
|
||||
{
|
||||
#ifdef HAVE_IWMMXT
|
||||
MPV_common_init_iwmmxt(s);
|
||||
#endif
|
||||
}
|
||||
|
97
libavcodec/armv4l/mpegvideo_iwmmxt.c
Normal file
97
libavcodec/armv4l/mpegvideo_iwmmxt.c
Normal file
@ -0,0 +1,97 @@
|
||||
#include "../dsputil.h"
|
||||
#include "../mpegvideo.h"
|
||||
#include "../avcodec.h"
|
||||
|
||||
static void dct_unquantize_h263_intra_iwmmxt(MpegEncContext *s,
|
||||
DCTELEM *block, int n, int qscale)
|
||||
{
|
||||
int level, qmul, qadd;
|
||||
int nCoeffs;
|
||||
DCTELEM *block_orig = block;
|
||||
|
||||
assert(s->block_last_index[n]>=0);
|
||||
|
||||
qmul = qscale << 1;
|
||||
|
||||
if (!s->h263_aic) {
|
||||
if (n < 4)
|
||||
level = block[0] * s->y_dc_scale;
|
||||
else
|
||||
level = block[0] * s->c_dc_scale;
|
||||
qadd = (qscale - 1) | 1;
|
||||
}else{
|
||||
qadd = 0;
|
||||
level = block[0];
|
||||
}
|
||||
if(s->ac_pred)
|
||||
nCoeffs=63;
|
||||
else
|
||||
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
|
||||
|
||||
__asm__ __volatile__ (
|
||||
/* "movd %1, %%mm6 \n\t" //qmul */
|
||||
/* "packssdw %%mm6, %%mm6 \n\t" */
|
||||
/* "packssdw %%mm6, %%mm6 \n\t" */
|
||||
"tbcsth wr6, %[qmul] \n\t"
|
||||
/* "movd %2, %%mm5 \n\t" //qadd */
|
||||
/* "packssdw %%mm5, %%mm5 \n\t" */
|
||||
/* "packssdw %%mm5, %%mm5 \n\t" */
|
||||
"tbcsth wr5, %[qadd] \n\t"
|
||||
"wzero wr7 \n\t" /* "pxor %%mm7, %%mm7 \n\t" */
|
||||
"wzero wr4 \n\t" /* "pxor %%mm4, %%mm4 \n\t" */
|
||||
"wsubh wr7, wr5, wr7 \n\t" /* "psubw %%mm5, %%mm7 \n\t" */
|
||||
"1: \n\t"
|
||||
"wldrd wr2, [%[block]] \n\t" /* "movq (%0, %3), %%mm0 \n\t" */
|
||||
"wldrd wr3, [%[block], #8] \n\t" /* "movq 8(%0, %3), %%mm1 \n\t" */
|
||||
"wmulsl wr0, wr6, wr2 \n\t" /* "pmullw %%mm6, %%mm0 \n\t" */
|
||||
"wmulsl wr1, wr6, wr3 \n\t" /* "pmullw %%mm6, %%mm1 \n\t" */
|
||||
/* "movq (%0, %3), %%mm2 \n\t" */
|
||||
/* "movq 8(%0, %3), %%mm3 \n\t" */
|
||||
"wcmpgtsh wr2, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 */
|
||||
"wcmpgtsh wr3, wr4, wr2 \n\t" /* "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 */
|
||||
"wxor wr0, wr2, wr0 \n\t" /* "pxor %%mm2, %%mm0 \n\t" */
|
||||
"wxor wr1, wr3, wr1 \n\t" /* "pxor %%mm3, %%mm1 \n\t" */
|
||||
"waddh wr0, wr7, wr0 \n\t" /* "paddw %%mm7, %%mm0 \n\t" */
|
||||
"waddh wr1, wr7, wr1 \n\t" /* "paddw %%mm7, %%mm1 \n\t" */
|
||||
"wxor wr2, wr0, wr2 \n\t" /* "pxor %%mm0, %%mm2 \n\t" */
|
||||
"wxor wr3, wr1, wr3 \n\t" /* "pxor %%mm1, %%mm3 \n\t" */
|
||||
"wcmpeqh wr0, wr7, wr0 \n\t" /* "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 */
|
||||
"wcmpeqh wr1, wr7, wr1 \n\t" /* "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 */
|
||||
"wandn wr0, wr2, wr0 \n\t" /* "pandn %%mm2, %%mm0 \n\t" */
|
||||
"wandn wr1, wr3, wr1 \n\t" /* "pandn %%mm3, %%mm1 \n\t" */
|
||||
"wstrd wr0, [%[block]] \n\t" /* "movq %%mm0, (%0, %3) \n\t" */
|
||||
"wstrd wr1, [%[block], #8] \n\t" /* "movq %%mm1, 8(%0, %3) \n\t" */
|
||||
"add %[block], %[block], #16 \n\t" /* "addl $16, %3 \n\t" */
|
||||
"subs %[i], %[i], #1 \n\t"
|
||||
"bne 1b \n\t" /* "jng 1b \n\t" */
|
||||
:[block]"+r"(block)
|
||||
:[i]"r"((nCoeffs + 8) / 8), [qmul]"r"(qmul), [qadd]"r"(qadd)
|
||||
:"memory");
|
||||
|
||||
block_orig[0] = level;
|
||||
}
|
||||
|
||||
#if 0
/*
 * Compiled out.  Inter-block dequantisation that hands the whole job to
 * ippiQuantInvInter_Compact_H263_16s_I(), passing the coefficient count
 * (last index + 1) and qscale.
 */
static void dct_unquantize_h263_inter_iwmmxt(MpegEncContext *s,
                                             DCTELEM *block, int n, int qscale)
{
    int nCoeffs;

    assert(s->block_last_index[n]>=0);

    /* With AC prediction all 64 coefficients are processed. */
    nCoeffs = s->ac_pred ? 63
                         : s->inter_scantable.raster_end[ s->block_last_index[n] ];

    ippiQuantInvInter_Compact_H263_16s_I(block, nCoeffs+1, qscale);
}
#endif
|
||||
|
||||
/**
 * Install the iWMMXt dequantiser in an MpegEncContext.
 *
 * Only the H.263 intra hook is replaced; the inter hook assignment is
 * compiled out together with its implementation.
 *
 * @param s context whose dct_unquantize_h263_intra pointer is overwritten
 */
void MPV_common_init_iwmmxt(MpegEncContext *s)
{
    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_iwmmxt;
#if 0
    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_iwmmxt;
#endif
}
|
@ -1180,6 +1180,7 @@ typedef struct AVCodecContext {
|
||||
#define FF_IDCT_SIMPLEARM 10
|
||||
#define FF_IDCT_H264 11
|
||||
#define FF_IDCT_VP3 12
|
||||
#define FF_IDCT_IPP 13
/* Misspelled name (FP_ instead of the FF_ prefix used by the other IDCT ids), kept as an alias. */
#define FP_IDCT_IPP FF_IDCT_IPP
|
||||
|
||||
/**
|
||||
* slice count.
|
||||
|
@ -94,10 +94,23 @@ static always_inline uint16_t bswap_16(uint16_t x){
|
||||
return (x>>8) | (x<<8);
|
||||
}
|
||||
|
||||
#ifdef ARCH_ARM
|
||||
static always_inline uint32_t bswap_32(uint32_t x){
|
||||
uint32_t t;
|
||||
__asm__ (
|
||||
"eor %1, %0, %0, ror #16 \n\t"
|
||||
"bic %1, %1, #0xFF0000 \n\t"
|
||||
"mov %0, %0, ror #8 \n\t"
|
||||
"eor %0, %0, %1, lsr #8 \n\t"
|
||||
: "+r"(x), "+r"(t));
|
||||
return x;
|
||||
}
|
||||
#else
|
||||
static always_inline uint32_t bswap_32(uint32_t x){
|
||||
x= ((x<<8)&0xFF00FF00) | ((x>>8)&0x00FF00FF);
|
||||
return (x>>16) | (x<<16);
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline uint64_t bswap_64(uint64_t x)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user