Bug-fix in CBC encrypt tail processing and commentary section update.
This commit is contained in:
parent
a963395a7b
commit
bac252a5e3
@ -6,7 +6,7 @@
|
|||||||
# forms are granted according to the OpenSSL license.
|
# forms are granted according to the OpenSSL license.
|
||||||
# ====================================================================
|
# ====================================================================
|
||||||
#
|
#
|
||||||
# Version 3.0.
|
# Version 3.1.
|
||||||
#
|
#
|
||||||
# You might fail to appreciate this module performance from the first
|
# You might fail to appreciate this module performance from the first
|
||||||
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
|
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
|
||||||
@ -46,23 +46,27 @@
|
|||||||
# Instruction Level Parallelism, and it indeed resulted in up to 15%
|
# Instruction Level Parallelism, and it indeed resulted in up to 15%
|
||||||
# better performance on most recent µ-archs...
|
# better performance on most recent µ-archs...
|
||||||
#
|
#
|
||||||
# Current ECB performance numbers for 128-bit key in cycles per byte
|
# Current ECB performance numbers for 128-bit key in CPU cycles per
|
||||||
# [measure commonly used by AES benchmarkers] are:
|
# processed byte [measure commonly used by AES benchmarkers] are:
|
||||||
#
|
#
|
||||||
# small footprint fully unrolled
|
# small footprint fully unrolled
|
||||||
# P4[-3] 23[24] 22[23]
|
# P4[-3] 23[24] 22[23]
|
||||||
# AMD K8 19 18
|
# AMD K8 19 18
|
||||||
# PIII 26(*) 23
|
# PIII 26 23
|
||||||
# Pentium 63(*) 52
|
# Pentium 63(*) 52
|
||||||
#
|
#
|
||||||
# (*) Performance difference between small footprint code and fully
|
# (*) Performance difference between small footprint code and fully
|
||||||
# unrolled in more commonly used CBC mode is not as big, 7% for
|
# unrolled in more commonly used CBC mode is not as big, 4% for
|
||||||
# PIII and 15% for Pentium, which I consider tolerable.
|
# Pentium. PIII's ~13% difference [in both cases in 3rd
|
||||||
|
# version] is considered tolerable...
|
||||||
#
|
#
|
||||||
# Third version adds AES_cbc_encrypt implementation, which resulted in
|
# Third version adds AES_cbc_encrypt implementation, which resulted in
|
||||||
# up to 40% performance improvement of CBC benchmark results [on most
|
# up to 40% performance improvement of CBC benchmark results. 40% was
|
||||||
# recent µ-archs]. CBC performance is virtually as good as ECB now and
|
# observed on P4 core, where "overall" improvement coefficient, i.e. if
|
||||||
# sometimes even better, because function prologues and epilogues are
|
# compared to PIC generated by GCC and in CBC mode, was observed to be
|
||||||
|
# as large as 4x:-) CBC performance is virtually identical to ECB now
|
||||||
|
# and on some platforms even better, e.g. 56 "small" cycles/byte on
|
||||||
|
# senior Pentium, because certain function prologues and epilogues are
|
||||||
# effectively taken out of the loop...
|
# effectively taken out of the loop...
|
||||||
|
|
||||||
push(@INC,"perlasm","../../perlasm");
|
push(@INC,"perlasm","../../perlasm");
|
||||||
@ -79,8 +83,9 @@ $acc="esi";
|
|||||||
|
|
||||||
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
|
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
|
||||||
# recent µ-archs], but ~5 times smaller!
|
# recent µ-archs], but ~5 times smaller!
|
||||||
# I favor compact code, because it minimizes
|
# I favor compact code to minimize cache
|
||||||
# cache contention...
|
# contention and in hope to "collect" 5% back
|
||||||
|
# in real-life applications...
|
||||||
$vertical_spin=0; # shift "vertically" defaults to 0, because of
|
$vertical_spin=0; # shift "vertically" defaults to 0, because of
|
||||||
# its proof-of-concept status...
|
# its proof-of-concept status...
|
||||||
|
|
||||||
@ -1296,12 +1301,18 @@ sub declast()
|
|||||||
&push ($key eq "edi" ? $key : ""); # push ivp
|
&push ($key eq "edi" ? $key : ""); # push ivp
|
||||||
&pushf ();
|
&pushf ();
|
||||||
&mov ($key,&wparam(1)); # load out
|
&mov ($key,&wparam(1)); # load out
|
||||||
&xor ($s0,$s0);
|
&mov ($s1,16);
|
||||||
&mov (&DWP(0,$key),$s0); # zero output
|
&sub ($s1,$s2);
|
||||||
&mov (&DWP(4,$key),$s0);
|
&cmp ($key,$acc); # compare with inp
|
||||||
&mov (&DWP(8,$key),$s0);
|
&je (&label("enc_in_place"));
|
||||||
&mov (&DWP(12,$key),$s0);
|
|
||||||
&data_word(0x90A4F3FC); # cld; rep movsb; nop # copy input
|
&data_word(0x90A4F3FC); # cld; rep movsb; nop # copy input
|
||||||
|
&jmp (&label("enc_skip_in_place"));
|
||||||
|
&set_label("enc_in_place");
|
||||||
|
&lea ($key,&DWP(0,$key,$s2));
|
||||||
|
&set_label("enc_skip_in_place");
|
||||||
|
&mov ($s2,$s1);
|
||||||
|
&xor ($s0,$s0);
|
||||||
|
&data_word(0x90AAF3FC); # cld; rep stosb; nop # zero tail
|
||||||
&popf ();
|
&popf ();
|
||||||
&pop ($key); # pop ivp
|
&pop ($key); # pop ivp
|
||||||
|
|
||||||
@ -1456,6 +1467,8 @@ sub declast()
|
|||||||
&pushf ();
|
&pushf ();
|
||||||
&data_word(0x90A4F3FC); # cld; rep movsb; nop # restore tail
|
&data_word(0x90A4F3FC); # cld; rep movsb; nop # restore tail
|
||||||
&popf ();
|
&popf ();
|
||||||
|
|
||||||
|
&align (4);
|
||||||
&set_label("dec_out");
|
&set_label("dec_out");
|
||||||
&stack_pop(5);
|
&stack_pop(5);
|
||||||
&function_end("AES_cbc_encrypt");
|
&function_end("AES_cbc_encrypt");
|
||||||
|
Loading…
Reference in New Issue
Block a user