Add 0.9.7 specific comments to RC4 assembler modules.
This commit is contained in:
parent
e6e1f4cb5e
commit
b7b46c9a87
@ -1,7 +1,7 @@
|
|||||||
#!/usr/local/bin/perl
|
#!/usr/local/bin/perl
|
||||||
|
|
||||||
# At some point it became apparent that the original SSLeay RC4
|
# At some point it became apparent that the original SSLeay RC4
|
||||||
# assembler implementation performs suboptimal on latest IA-32
|
# assembler implementation performs suboptimally on latest IA-32
|
||||||
# microarchitectures. After re-tuning performance has changed as
|
# microarchitectures. After re-tuning performance has changed as
|
||||||
# following:
|
# follows:
|
||||||
#
|
#
|
||||||
@ -15,10 +15,12 @@
|
|||||||
# In other words code performing further 13% faster on AMD
|
# In other words code performing further 13% faster on AMD
|
||||||
# would perform almost 2 times slower on Intel PIII...
|
# would perform almost 2 times slower on Intel PIII...
|
||||||
# For reference! This code delivers ~80% of rc4-amd64.pl
|
# For reference! This code delivers ~80% of rc4-amd64.pl
|
||||||
# performance on same Opteron machine.
|
# performance on the same Opteron machine.
|
||||||
# (**) This number requires compressed key schedule set up by
|
# (**) This number requires compressed key schedule set up by
|
||||||
# RC4_set_key, see commentary section in rc4_skey.c for
|
# RC4_set_key and therefore doesn't apply to 0.9.7 [option for
|
||||||
# further details.
|
# compressed key schedule is implemented in 0.9.8 and later,
|
||||||
|
# see commentary section in rc4_skey.c for further details].
|
||||||
|
#
|
||||||
# <appro@fy.chalmers.se>
|
# <appro@fy.chalmers.se>
|
||||||
|
|
||||||
push(@INC,"perlasm","../../perlasm");
|
push(@INC,"perlasm","../../perlasm");
|
||||||
@ -130,6 +132,8 @@ sub RC4
|
|||||||
&add( $d, 8);
|
&add( $d, 8);
|
||||||
|
|
||||||
# detect compressed schedule, see commentary section in rc4_skey.c...
|
# detect compressed schedule, see commentary section in rc4_skey.c...
|
||||||
|
# in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
|
||||||
|
# as compressed key schedule is set up in 0.9.8 and later.
|
||||||
&cmp(&DWP(256,$d),-1);
|
&cmp(&DWP(256,$d),-1);
|
||||||
&je(&label("RC4_CHAR"));
|
&je(&label("RC4_CHAR"));
|
||||||
|
|
||||||
@ -190,7 +194,8 @@ sub RC4
|
|||||||
&jmp(&label("finished"));
|
&jmp(&label("finished"));
|
||||||
|
|
||||||
&align(16);
|
&align(16);
|
||||||
# this is essentially Intel P4 specific codepath, see rc4_skey.c...
|
# this is essentially Intel P4 specific codepath, see rc4_skey.c,
|
||||||
|
# and is engaged in 0.9.8 and later context...
|
||||||
&set_label("RC4_CHAR");
|
&set_label("RC4_CHAR");
|
||||||
|
|
||||||
&lea ($ty,&DWP(0,$in,$ty));
|
&lea ($ty,&DWP(0,$in,$ty));
|
||||||
|
@ -30,7 +30,9 @@
|
|||||||
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
|
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
|
||||||
# compose blended code, which would perform even within 30% marginal
|
# compose blended code, which would perform even within a 30% margin
|
||||||
# on either AMD and Intel platforms, I implement both cases. See
|
# on both AMD and Intel platforms, I implement both cases. See
|
||||||
# rc4_skey.c for further details...
|
# rc4_skey.c for further details... This applies to 0.9.8 and later.
|
||||||
|
# In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes
|
||||||
|
# of code remain redundant.
|
||||||
|
|
||||||
$output=shift;
|
$output=shift;
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@
|
|||||||
// to input and output streams. Secondly, less obvious, it's possible
|
// to input and output streams. Secondly, less obvious, it's possible
|
||||||
// to pull up some references to elements of the key schedule itself.
|
// to pull up some references to elements of the key schedule itself.
|
||||||
// Fact is that such prior loads are not safe only for "degenerated"
|
// Fact is that such prior loads are not safe only for "degenerated"
|
||||||
// key schedule, when all elements equal to the same value, which is
|
// key schedule, when some elements are equal to the same value, which is
|
||||||
// never the case [key schedule setup routine makes sure it's not].
|
// never the case [key schedule setup routine makes sure it's not].
|
||||||
// Furthermore. In order to compress loop body to the minimum, I chose
|
// Furthermore. In order to compress loop body to the minimum, I chose
|
||||||
// to deploy deposit instruction, which substitutes for the whole
|
// to deploy deposit instruction, which substitutes for the whole
|
||||||
|
Loading…
x
Reference in New Issue
Block a user