aes/asm/bsaes-x86_64.pl: Atom-specific optimization.
This commit is contained in:
parent
94d1f4b0f3
commit
558ff0f0c1
@ -38,8 +38,8 @@
|
|||||||
# Emilia's this(*) difference
|
# Emilia's this(*) difference
|
||||||
#
|
#
|
||||||
# Core 2 9.30 8.69 +7%
|
# Core 2 9.30 8.69 +7%
|
||||||
# Nehalem(**) 7.63 6.98 +9%
|
# Nehalem(**) 7.63 6.88 +11%
|
||||||
# Atom 17.1 17.4 -2%(***)
|
# Atom 17.1 16.4 +4%
|
||||||
#
|
#
|
||||||
# (*) Comparison is not completely fair, because "this" is ECB,
|
# (*) Comparison is not completely fair, because "this" is ECB,
|
||||||
# i.e. no extra processing such as counter values calculation
|
# i.e. no extra processing such as counter values calculation
|
||||||
@ -50,14 +50,6 @@
|
|||||||
# (**) Results were collected on Westmere, which is considered to
|
# (**) Results were collected on Westmere, which is considered to
|
||||||
# be equivalent to Nehalem for this code.
|
# be equivalent to Nehalem for this code.
|
||||||
#
|
#
|
||||||
# (***) Slowdown on Atom is rather strange per se, because original
|
|
||||||
# implementation has a number of 9+-byte instructions, which
|
|
||||||
# are bad for Atom front-end, and which I eliminated completely.
|
|
||||||
# In an attempt to address the deterioration, sbox() was tested in FP
|
|
||||||
# SIMD "domain" (movaps instead of movdqa, xorps instead of
|
|
||||||
# pxor, etc.). While it resulted in nominal 4% improvement on
|
|
||||||
# Atom, it hurt Westmere by more than a factor of 2.
|
|
||||||
#
|
|
||||||
# As for key schedule conversion subroutine. Interface to OpenSSL
|
# As for key schedule conversion subroutine. Interface to OpenSSL
|
||||||
# relies on per-invocation on-the-fly conversion. This naturally
|
# relies on per-invocation on-the-fly conversion. This naturally
|
||||||
# has impact on performance, especially for short inputs. Conversion
|
# has impact on performance, especially for short inputs. Conversion
|
||||||
@ -67,7 +59,7 @@
|
|||||||
# conversion conversion/8x block
|
# conversion conversion/8x block
|
||||||
# Core 2 240 0.22
|
# Core 2 240 0.22
|
||||||
# Nehalem 180 0.20
|
# Nehalem 180 0.20
|
||||||
# Atom 430 0.19
|
# Atom 430 0.20
|
||||||
#
|
#
|
||||||
# The ratio values mean that 128-byte blocks will be processed
|
# The ratio values mean that 128-byte blocks will be processed
|
||||||
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
|
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
|
||||||
@ -83,9 +75,9 @@
|
|||||||
# Add decryption procedure. Performance in CPU cycles spent to decrypt
|
# Add decryption procedure. Performance in CPU cycles spent to decrypt
|
||||||
# one byte out of 4096-byte buffer with 128-bit key is:
|
# one byte out of 4096-byte buffer with 128-bit key is:
|
||||||
#
|
#
|
||||||
# Core 2 9.83
|
# Core 2 9.98
|
||||||
# Nehalem 7.74
|
# Nehalem 7.80
|
||||||
# Atom 19.0
|
# Atom 17.9
|
||||||
#
|
#
|
||||||
# November 2011.
|
# November 2011.
|
||||||
#
|
#
|
||||||
@ -434,21 +426,21 @@ my $mask=pop;
|
|||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
pxor 0x00($key),@x[0]
|
pxor 0x00($key),@x[0]
|
||||||
pxor 0x10($key),@x[1]
|
pxor 0x10($key),@x[1]
|
||||||
pshufb $mask,@x[0]
|
|
||||||
pxor 0x20($key),@x[2]
|
pxor 0x20($key),@x[2]
|
||||||
pshufb $mask,@x[1]
|
|
||||||
pxor 0x30($key),@x[3]
|
pxor 0x30($key),@x[3]
|
||||||
pshufb $mask,@x[2]
|
pshufb $mask,@x[0]
|
||||||
|
pshufb $mask,@x[1]
|
||||||
pxor 0x40($key),@x[4]
|
pxor 0x40($key),@x[4]
|
||||||
pshufb $mask,@x[3]
|
|
||||||
pxor 0x50($key),@x[5]
|
pxor 0x50($key),@x[5]
|
||||||
pshufb $mask,@x[4]
|
pshufb $mask,@x[2]
|
||||||
|
pshufb $mask,@x[3]
|
||||||
pxor 0x60($key),@x[6]
|
pxor 0x60($key),@x[6]
|
||||||
pshufb $mask,@x[5]
|
|
||||||
pxor 0x70($key),@x[7]
|
pxor 0x70($key),@x[7]
|
||||||
|
pshufb $mask,@x[4]
|
||||||
|
pshufb $mask,@x[5]
|
||||||
pshufb $mask,@x[6]
|
pshufb $mask,@x[6]
|
||||||
lea 0x80($key),$key
|
|
||||||
pshufb $mask,@x[7]
|
pshufb $mask,@x[7]
|
||||||
|
lea 0x80($key),$key
|
||||||
___
|
___
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -820,18 +812,18 @@ _bsaes_encrypt8:
|
|||||||
movdqa 0x50($const), @XMM[8] # .LM0SR
|
movdqa 0x50($const), @XMM[8] # .LM0SR
|
||||||
pxor @XMM[9], @XMM[0] # xor with round0 key
|
pxor @XMM[9], @XMM[0] # xor with round0 key
|
||||||
pxor @XMM[9], @XMM[1]
|
pxor @XMM[9], @XMM[1]
|
||||||
pshufb @XMM[8], @XMM[0]
|
|
||||||
pxor @XMM[9], @XMM[2]
|
pxor @XMM[9], @XMM[2]
|
||||||
pshufb @XMM[8], @XMM[1]
|
|
||||||
pxor @XMM[9], @XMM[3]
|
pxor @XMM[9], @XMM[3]
|
||||||
pshufb @XMM[8], @XMM[2]
|
pshufb @XMM[8], @XMM[0]
|
||||||
|
pshufb @XMM[8], @XMM[1]
|
||||||
pxor @XMM[9], @XMM[4]
|
pxor @XMM[9], @XMM[4]
|
||||||
pshufb @XMM[8], @XMM[3]
|
|
||||||
pxor @XMM[9], @XMM[5]
|
pxor @XMM[9], @XMM[5]
|
||||||
pshufb @XMM[8], @XMM[4]
|
pshufb @XMM[8], @XMM[2]
|
||||||
|
pshufb @XMM[8], @XMM[3]
|
||||||
pxor @XMM[9], @XMM[6]
|
pxor @XMM[9], @XMM[6]
|
||||||
pshufb @XMM[8], @XMM[5]
|
|
||||||
pxor @XMM[9], @XMM[7]
|
pxor @XMM[9], @XMM[7]
|
||||||
|
pshufb @XMM[8], @XMM[4]
|
||||||
|
pshufb @XMM[8], @XMM[5]
|
||||||
pshufb @XMM[8], @XMM[6]
|
pshufb @XMM[8], @XMM[6]
|
||||||
pshufb @XMM[8], @XMM[7]
|
pshufb @XMM[8], @XMM[7]
|
||||||
_bsaes_encrypt8_bitslice:
|
_bsaes_encrypt8_bitslice:
|
||||||
@ -884,18 +876,18 @@ _bsaes_decrypt8:
|
|||||||
movdqa -0x30($const), @XMM[8] # .LM0ISR
|
movdqa -0x30($const), @XMM[8] # .LM0ISR
|
||||||
pxor @XMM[9], @XMM[0] # xor with round0 key
|
pxor @XMM[9], @XMM[0] # xor with round0 key
|
||||||
pxor @XMM[9], @XMM[1]
|
pxor @XMM[9], @XMM[1]
|
||||||
pshufb @XMM[8], @XMM[0]
|
|
||||||
pxor @XMM[9], @XMM[2]
|
pxor @XMM[9], @XMM[2]
|
||||||
pshufb @XMM[8], @XMM[1]
|
|
||||||
pxor @XMM[9], @XMM[3]
|
pxor @XMM[9], @XMM[3]
|
||||||
pshufb @XMM[8], @XMM[2]
|
pshufb @XMM[8], @XMM[0]
|
||||||
|
pshufb @XMM[8], @XMM[1]
|
||||||
pxor @XMM[9], @XMM[4]
|
pxor @XMM[9], @XMM[4]
|
||||||
pshufb @XMM[8], @XMM[3]
|
|
||||||
pxor @XMM[9], @XMM[5]
|
pxor @XMM[9], @XMM[5]
|
||||||
pshufb @XMM[8], @XMM[4]
|
pshufb @XMM[8], @XMM[2]
|
||||||
|
pshufb @XMM[8], @XMM[3]
|
||||||
pxor @XMM[9], @XMM[6]
|
pxor @XMM[9], @XMM[6]
|
||||||
pshufb @XMM[8], @XMM[5]
|
|
||||||
pxor @XMM[9], @XMM[7]
|
pxor @XMM[9], @XMM[7]
|
||||||
|
pshufb @XMM[8], @XMM[4]
|
||||||
|
pshufb @XMM[8], @XMM[5]
|
||||||
pshufb @XMM[8], @XMM[6]
|
pshufb @XMM[8], @XMM[6]
|
||||||
pshufb @XMM[8], @XMM[7]
|
pshufb @XMM[8], @XMM[7]
|
||||||
___
|
___
|
||||||
@ -1937,21 +1929,21 @@ $code.=<<___;
|
|||||||
movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
|
movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
|
||||||
pxor @XMM[9], @XMM[0] # xor with round0 key
|
pxor @XMM[9], @XMM[0] # xor with round0 key
|
||||||
pxor @XMM[9], @XMM[1]
|
pxor @XMM[9], @XMM[1]
|
||||||
pshufb @XMM[8], @XMM[0]
|
|
||||||
pxor @XMM[9], @XMM[2]
|
pxor @XMM[9], @XMM[2]
|
||||||
pshufb @XMM[8], @XMM[1]
|
|
||||||
pxor @XMM[9], @XMM[3]
|
pxor @XMM[9], @XMM[3]
|
||||||
pshufb @XMM[8], @XMM[2]
|
pshufb @XMM[8], @XMM[0]
|
||||||
|
pshufb @XMM[8], @XMM[1]
|
||||||
pxor @XMM[9], @XMM[4]
|
pxor @XMM[9], @XMM[4]
|
||||||
pshufb @XMM[8], @XMM[3]
|
|
||||||
pxor @XMM[9], @XMM[5]
|
pxor @XMM[9], @XMM[5]
|
||||||
pshufb @XMM[8], @XMM[4]
|
pshufb @XMM[8], @XMM[2]
|
||||||
|
pshufb @XMM[8], @XMM[3]
|
||||||
pxor @XMM[9], @XMM[6]
|
pxor @XMM[9], @XMM[6]
|
||||||
pshufb @XMM[8], @XMM[5]
|
|
||||||
pxor @XMM[9], @XMM[7]
|
pxor @XMM[9], @XMM[7]
|
||||||
|
pshufb @XMM[8], @XMM[4]
|
||||||
|
pshufb @XMM[8], @XMM[5]
|
||||||
pshufb @XMM[8], @XMM[6]
|
pshufb @XMM[8], @XMM[6]
|
||||||
lea .LBS0(%rip), %r11 # constants table
|
|
||||||
pshufb @XMM[8], @XMM[7]
|
pshufb @XMM[8], @XMM[7]
|
||||||
|
lea .LBS0(%rip), %r11 # constants table
|
||||||
mov %ebx,%r10d # pass rounds
|
mov %ebx,%r10d # pass rounds
|
||||||
|
|
||||||
call _bsaes_encrypt8_bitslice
|
call _bsaes_encrypt8_bitslice
|
||||||
|
Loading…
x
Reference in New Issue
Block a user