md5-sparcv9.pl: avoid %asi modifications, improve short input performance
by 20-30%.
This commit is contained in:
parent
aea4126e4e
commit
d17b59e49f
@ -12,7 +12,7 @@
|
||||
# MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
|
||||
# code generated by Sun C 5.2.
|
||||
|
||||
# SPARC T4 MD5 hardware achieves 3.24 cycles per byte, which is 2.1x
|
||||
# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
|
||||
# faster than software. Multi-process benchmark saturates at 12x
|
||||
# single-process result on 8-core processor, or ~11GBps per 2.85GHz
|
||||
# socket.
|
||||
@ -221,15 +221,15 @@ md5_block_asm_data_order:
|
||||
be .Lsoftware
|
||||
nop
|
||||
|
||||
rd %asi, %g5
|
||||
wr %g0, 0x88, %asi ! ASI_PRIMARY_LITTLE
|
||||
|
||||
lda [%o0 + 0x00] %asi, %f0 ! load context
|
||||
lda [%o0 + 0x04] %asi, %f1
|
||||
mov 4, %g1
|
||||
andcc %o1, 0x7, %g0
|
||||
lda [%o0 + 0x08] %asi, %f2
|
||||
lda [%o0 + %g0]0x88, %f0 ! load context
|
||||
lda [%o0 + %g1]0x88, %f1
|
||||
add %o0, 8, %o0
|
||||
lda [%o0 + %g0]0x88, %f2
|
||||
lda [%o0 + %g1]0x88, %f3
|
||||
bne,pn %icc, .Lhwunaligned
|
||||
lda [%o0 + 0x0c] %asi, %f3
|
||||
sub %o0, 8, %o0
|
||||
|
||||
.Lhw_loop:
|
||||
ldd [%o1 + 0x00], %f8
|
||||
@ -250,12 +250,13 @@ md5_block_asm_data_order:
|
||||
nop
|
||||
|
||||
.Lhwfinish:
|
||||
sta %f0, [%o0 + 0x00] %asi ! store context
|
||||
sta %f1, [%o0 + 0x04] %asi
|
||||
sta %f2, [%o0 + 0x08] %asi
|
||||
sta %f3, [%o0 + 0x0c] %asi
|
||||
sta %f0, [%o0 + %g0]0x88 ! store context
|
||||
sta %f1, [%o0 + %g1]0x88
|
||||
add %o0, 8, %o0
|
||||
sta %f2, [%o0 + %g0]0x88
|
||||
sta %f3, [%o0 + %g1]0x88
|
||||
retl
|
||||
wr %g5, 0x0, %asi ! restore %asi
|
||||
nop
|
||||
|
||||
.align 8
|
||||
.Lhwunaligned:
|
||||
|
Loading…
x
Reference in New Issue
Block a user