x86 dsputil: provide SSE2/SSSE3 versions of bswap_buf
While pshufb allows emulating bswap on XMM registers for SSSE3, more
shuffling is needed for SSE2. Alignment is critical, so specific code
paths are provided for this case.

For the huffyuv sequence "angels_480-huffyuvcompress.avi":
  C (using bswap instruction): ~ 55k cycles
  SSE2:                        ~ 40k cycles
  SSSE3 using unaligned loads: ~ 35k cycles
  SSSE3 using aligned loads:   ~ 30k cycles

Signed-off-by: Diego Biurrun <diego@biurrun.de>
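For reference, the two strategies can be sketched with C intrinsics (illustrative code, not part of the commit; the function names are invented). SSE2 has no byte-granularity shuffle, so each dword is byte-reversed in two steps: pshuflw/pshufhw swap its 16-bit words, then 8-bit shifts plus an OR swap the bytes within each word. SSSE3's pshufb performs the whole permutation in one instruction:

#include <emmintrin.h>   /* SSE2  */
#include <tmmintrin.h>   /* SSSE3 */

/* SSE2: swap the 16-bit words within each dword, then the bytes
 * within each word. Mirrors the pshuflw/pshufhw + psllw/psrlw/por
 * sequence used in the asm below. */
static __m128i bswap32x4_sse2(__m128i x)
{
    x = _mm_shufflelo_epi16(x, _MM_SHUFFLE(2, 3, 0, 1));
    x = _mm_shufflehi_epi16(x, _MM_SHUFFLE(2, 3, 0, 1));
    return _mm_or_si128(_mm_slli_epi16(x, 8), _mm_srli_epi16(x, 8));
}

/* SSSE3: a single pshufb with the pb_bswap32 pattern
 * (3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12);
 * _mm_set_epi8 lists the bytes from 15 down to 0. */
static __m128i bswap32x4_ssse3(__m128i x)
{
    const __m128i mask = _mm_set_epi8(12, 13, 14, 15,  8,  9, 10, 11,
                                       4,  5,  6,  7,  0,  1,  2,  3);
    return _mm_shuffle_epi8(x, mask);
}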
@@ -2313,6 +2313,9 @@ void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                       const int16_t *window, unsigned int len);
 
+void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
+void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
+
 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2798,6 +2801,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
                 c->apply_window_int16 = ff_apply_window_int16_sse2;
             }
         }
+        c->bswap_buf = ff_bswap32_buf_sse2;
 #endif
     }
     if (mm_flags & AV_CPU_FLAG_SSSE3) {
@@ -2810,6 +2814,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
         if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
             c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
         }
+        c->bswap_buf = ff_bswap32_buf_ssse3;
 #endif
     }
 
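The "C (using bswap instruction)" baseline from the benchmark above is the scalar bswap_buf these hooks override. Schematically (a minimal sketch of the idea, not the exact code in dsputil.c):

#include <stdint.h>

/* Scalar reference: one byte swap per element. On x86, libavutil's
 * av_bswap32() compiles to a single bswap instruction; it is expanded
 * here to keep the sketch self-contained. The SIMD versions must
 * produce identical output for any w, including misaligned src and
 * w % 8 != 0 tails. */
static uint32_t bswap32(uint32_t x)
{
    return (x >> 24) | ((x >> 8) & 0x0000ff00) |
           ((x << 8) & 0x00ff0000) | (x << 24);
}

static void bswap_buf_c(uint32_t *dst, const uint32_t *src, int w)
{
    int i;
    for (i = 0; i < w; i++)
        dst[i] = bswap32(src[i]);
}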
@@ -30,6 +30,7 @@ pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
 pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
 pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
 pd_16384: times 4 dd 16384
+pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
 SECTION_TEXT
 
@@ -1178,3 +1179,125 @@ INIT_XMM sse
 BUTTERFLIES_FLOAT_INTERLEAVE
 INIT_YMM avx
 BUTTERFLIES_FLOAT_INTERLEAVE
+
+INIT_XMM sse2
+; %1 = aligned/unaligned
+%macro BSWAP_LOOPS_SSE2 1
+    mov      r3, r2
+    sar      r2, 3
+    jz       .left4_%1
+.loop8_%1:
+    mov%1    m0, [r1 + 0]
+    mov%1    m1, [r1 + 16]
+    pshuflw  m0, m0, 10110001b ; swap the 16-bit words of each dword
+    pshuflw  m1, m1, 10110001b
+    pshufhw  m0, m0, 10110001b
+    pshufhw  m1, m1, 10110001b
+    mova     m2, m0
+    mova     m3, m1
+    psllw    m0, 8             ; swap the bytes within each 16-bit word
+    psllw    m1, 8
+    psrlw    m2, 8
+    psrlw    m3, 8
+    por      m2, m0
+    por      m3, m1
+    mova     [r0 + 0], m2
+    mova     [r0 + 16], m3
+    add      r1, 32
+    add      r0, 32
+    dec      r2
+    jnz      .loop8_%1
+.left4_%1:
+    mov      r2, r3
+    and      r3, 4
+    jz       .left
+    mov%1    m0, [r1]
+    pshuflw  m0, m0, 10110001b
+    pshufhw  m0, m0, 10110001b
+    mova     m2, m0
+    psllw    m0, 8
+    psrlw    m2, 8
+    por      m2, m0
+    mova     [r0], m2
+    add      r1, 16
+    add      r0, 16
+%endmacro
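+; Worked example for one dword, x = 0x11223344 (bswap = 0x44332211):
+;   pshuflw/pshufhw with imm 10110001b = word order 2,3,0,1, i.e. they
+;   swap the two 16-bit words of each dword: 0x11223344 -> 0x33441122
+;   psllw/psrlw by 8 plus por then swap the two bytes of each word:
+;                                            0x33441122 -> 0x44332211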
+
+; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
+; Only src alignment is dispatched on; stores are always mova, so dst
+; must be 16-byte aligned.
+cglobal bswap32_buf, 3,4,5
+    mov      r3, r1
+    and      r3, 15
+    jz       .start_align
+    BSWAP_LOOPS_SSE2 u
+    jmp      .left
+.start_align:
+    BSWAP_LOOPS_SSE2 a
+.left:
+    and      r2, 3
+    jz       .end
+.loop2:
+    mov      r3d, [r1]         ; w & 3 tail: scalar bswap per dword
+    bswap    r3d
+    mov      [r0], r3d
+    add      r1, 4
+    add      r0, 4
+    dec      r2
+    jnz      .loop2
+.end:
+    RET
+
+; %1 = aligned/unaligned
+%macro BSWAP_LOOPS_SSSE3 1
+    mov      r3, r2
+    sar      r2, 3
+    jz       .left4_%1
+.loop8_%1:
+    mov%1    m0, [r1 + 0]
+    mov%1    m1, [r1 + 16]
+    pshufb   m0, m2            ; one byte shuffle does the whole swap
+    pshufb   m1, m2
+    mova     [r0 + 0], m0
+    mova     [r0 + 16], m1
+    add      r0, 32
+    add      r1, 32
+    dec      r2
+    jnz      .loop8_%1
+.left4_%1:
+    mov      r2, r3
+    and      r3, 4
+    jz       .left2
+    mov%1    m0, [r1]
+    pshufb   m0, m2
+    mova     [r0], m0
+    add      r1, 16
+    add      r0, 16
+%endmacro
+
+INIT_XMM ssse3
+; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
+cglobal bswap32_buf, 3,4,3
+    mov      r3, r1
+    mova     m2, [pb_bswap32]  ; per-dword byte-reversal mask
+    and      r3, 15
+    jz       .start_align
+    BSWAP_LOOPS_SSSE3 u
+    jmp      .left2
+.start_align:
+    BSWAP_LOOPS_SSSE3 a
+.left2:
+    mov      r3, r2
+    and      r2, 2
+    jz       .left1
+    movq     m0, [r1]          ; two dwords left: swap one qword
+    pshufb   m0, m2
+    movq     [r0], m0
+    add      r1, 8
+    add      r0, 8
+.left1:
+    and      r3, 1
+    jz       .end
+    mov      r2d, [r1]         ; final dword via scalar bswap
+    bswap    r2d
+    mov      [r0], r2d
+.end:
+    RET
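
To sanity-check the pb_bswap32 semantics, the pshufb path can be mirrored with intrinsics and compared against scalar byte swaps. A minimal, hypothetical test (not part of the commit):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <tmmintrin.h>

int main(void)
{
    uint32_t src[4] = { 0x11223344, 0xdeadbeef, 0x00000001, 0xcafebabe };
    uint32_t simd[4], ref[4];
    int i;

    /* Same byte order as pb_bswap32 (_mm_set_epi8 lists bytes 15..0). */
    const __m128i mask = _mm_set_epi8(12, 13, 14, 15,  8,  9, 10, 11,
                                       4,  5,  6,  7,  0,  1,  2,  3);
    __m128i v = _mm_loadu_si128((const __m128i *)src);
    _mm_storeu_si128((__m128i *)simd, _mm_shuffle_epi8(v, mask));

    for (i = 0; i < 4; i++)
        ref[i] = (src[i] >> 24) | ((src[i] >> 8) & 0x0000ff00) |
                 ((src[i] << 8) & 0x00ff0000) | (src[i] << 24);

    printf("%s\n", memcmp(simd, ref, sizeof ref) ? "MISMATCH" : "ok");
    return 0;
}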