md5-sparcv9.pl: avoid %asi modifications, improve short input performance

by 20-30%.
2012-10-14 16:51:27 +00:00
parent aea4126e4e
commit d17b59e49f
1 changed files with 14 additions and 13 deletions
--- a/crypto/md5/asm/md5-sparcv9.pl
+++ b/crypto/md5/asm/md5-sparcv9.pl
@@ -12,7 +12,7 @@
 # MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
 # code generated by Sun C 5.2.
-# SPARC T4 MD5 hardware achieves 3.24 cycles per byte, which is 2.1x
+# SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
 # faster than software. Multi-process benchmark saturates at 12x
 # single-process result on 8-core processor, or ~11GBps per 2.85GHz
 # socket.
@@ -221,15 +221,15 @@ md5_block_asm_data_order:
 	be	.Lsoftware
 	nop
- 	rd	%asi, %g5
+	mov	4, %g1
 	wr	%g0, 0x88, %asi		! ASI_PRIMARY_LITTLE
 	lda	[%o0 + 0x00] %asi, %f0	! load context
 	lda	[%o0 + 0x04] %asi, %f1
 	andcc	%o1, 0x7, %g0
-	lda	[%o0 + 0x08] %asi, %f2
+	lda	[%o0 + %g0]0x88, %f0		! load context
 	lda	[%o0 + %g1]0x88, %f1
 	add	%o0, 8, %o0
 	lda	[%o0 + %g0]0x88, %f2
 	lda	[%o0 + %g1]0x88, %f3
 	bne,pn	%icc, .Lhwunaligned
-	 lda	[%o0 + 0x0c] %asi, %f3
+	sub	%o0, 8, %o0
 .Lhw_loop:
 	ldd	[%o1 + 0x00], %f8
@@ -250,12 +250,13 @@ md5_block_asm_data_order:
 	nop
 .Lhwfinish:
-	sta	%f0, [%o0 + 0x00] %asi	! store context
+	sta	%f0, [%o0 + %g0]0x88	! store context
-	sta	%f1, [%o0 + 0x04] %asi
+	sta	%f1, [%o0 + %g1]0x88
-	sta	%f2, [%o0 + 0x08] %asi
+	add	%o0, 8, %o0
-	sta	%f3, [%o0 + 0x0c] %asi
+	sta	%f2, [%o0 + %g0]0x88
 	sta	%f3, [%o0 + %g1]0x88
 	retl
-	 wr	%g5, 0x0, %asi		! restore %asi
+	nop
 .align	8
 .Lhwunaligned: