e_padlock-x86*.pl: Nano-related update.
This commit is contained in:
parent
4cc2bbab67
commit
149ca7128c
@ -15,14 +15,21 @@
|
|||||||
# mode and ~75% in CBC mode. For aligned data improvement can be
|
# mode and ~75% in CBC mode. For aligned data improvement can be
|
||||||
# observed for short inputs only, e.g. 45% for 64-byte messages in
|
# observed for short inputs only, e.g. 45% for 64-byte messages in
|
||||||
# ECB mode, 20% in CBC. Difference in performance for aligned vs.
|
# ECB mode, 20% in CBC. Difference in performance for aligned vs.
|
||||||
# misaligned data depends on misalignment and is either ~1.8x or
|
# misaligned data depends on misalignment and is either ~1.8x or ~2.9x.
|
||||||
# ~2.9x. These are approximately same factors as for hardware support,
|
# These are approximately same factors as for hardware support, so
|
||||||
# so there is little reason to rely on the latter. It might actually
|
# there is little reason to rely on the latter. On the contrary, it
|
||||||
# hurt performance in mixture of aligned and misaligned buffers,
|
# might actually hurt performance in mixture of aligned and misaligned
|
||||||
# because a) if you choose to flip 'align' flag on per-buffer basis,
|
# buffers, because a) if you choose to flip 'align' flag in control
|
||||||
# then you'd have to reload key context; b) if you choose to set
|
# word on per-buffer basis, then you'd have to reload key context,
|
||||||
# 'align' flag permanently, it limits performance for aligned data
|
# which incurs penalty; b) if you choose to set 'align' flag
|
||||||
# to ~1/2. All results were collected on 1.5GHz C7.
|
# permanently, it limits performance even for aligned data to ~1/2.
|
||||||
|
# All above mentioned results were collected on 1.5GHz C7. Nano on the
|
||||||
|
# other hand handles unaligned data more gracefully. Depending on
|
||||||
|
# algorithm and how unaligned data is, hardware can be up to 70% more
|
||||||
|
# efficient than below software alignment procedures, nor does 'align'
|
||||||
|
# flag have effect on aligned performance [if it has any meaning at all].
|
||||||
|
# Therefore suggestion is to unconditionally set 'align' flag on Nano
|
||||||
|
# for optimal performance.
|
||||||
|
|
||||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||||
push(@INC,"${dir}","${dir}../../crypto/perlasm");
|
push(@INC,"${dir}","${dir}../../crypto/perlasm");
|
||||||
@ -362,7 +369,7 @@ my ($mode,$opcode) = @_;
|
|||||||
&ret ();
|
&ret ();
|
||||||
&function_end_B("padlock_sha1_oneshot");
|
&function_end_B("padlock_sha1_oneshot");
|
||||||
|
|
||||||
&function_begin_B("padlock_sha1");
|
&function_begin_B("padlock_sha1_blocks");
|
||||||
&push ("edi");
|
&push ("edi");
|
||||||
&push ("esi");
|
&push ("esi");
|
||||||
&mov ("eax",-1);
|
&mov ("eax",-1);
|
||||||
@ -373,7 +380,7 @@ my ($mode,$opcode) = @_;
|
|||||||
&pop ("esi");
|
&pop ("esi");
|
||||||
&pop ("edi");
|
&pop ("edi");
|
||||||
&ret ();
|
&ret ();
|
||||||
&function_end_B("padlock_sha1");
|
&function_end_B("padlock_sha1_blocks");
|
||||||
|
|
||||||
&function_begin_B("padlock_sha256_oneshot");
|
&function_begin_B("padlock_sha256_oneshot");
|
||||||
&push ("edi");
|
&push ("edi");
|
||||||
@ -397,7 +404,7 @@ my ($mode,$opcode) = @_;
|
|||||||
&ret ();
|
&ret ();
|
||||||
&function_end_B("padlock_sha256_oneshot");
|
&function_end_B("padlock_sha256_oneshot");
|
||||||
|
|
||||||
&function_begin_B("padlock_sha256");
|
&function_begin_B("padlock_sha256_blocks");
|
||||||
&push ("edi");
|
&push ("edi");
|
||||||
&push ("esi");
|
&push ("esi");
|
||||||
&mov ("eax",-1);
|
&mov ("eax",-1);
|
||||||
@ -408,7 +415,19 @@ my ($mode,$opcode) = @_;
|
|||||||
&pop ("esi");
|
&pop ("esi");
|
||||||
&pop ("edi");
|
&pop ("edi");
|
||||||
&ret ();
|
&ret ();
|
||||||
&function_end_B("padlock_sha256");
|
&function_end_B("padlock_sha256_blocks");
|
||||||
|
|
||||||
|
&function_begin_B("padlock_sha512_blocks");
|
||||||
|
&push ("edi");
|
||||||
|
&push ("esi");
|
||||||
|
&mov ("edi",&wparam(0));
|
||||||
|
&mov ("esi",&wparam(1));
|
||||||
|
&mov ("ecx",&wparam(2));
|
||||||
|
&data_byte(0xf3,0x0f,0xa6,0xe0); # rep xsha512
|
||||||
|
&pop ("esi");
|
||||||
|
&pop ("edi");
|
||||||
|
&ret ();
|
||||||
|
&function_end_B("padlock_sha512_blocks");
|
||||||
|
|
||||||
&asciz ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
|
&asciz ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
|
||||||
&align (16);
|
&align (16);
|
||||||
@ -417,7 +436,7 @@ my ($mode,$opcode) = @_;
|
|||||||
# Essentially this variable belongs in thread local storage.
|
# Essentially this variable belongs in thread local storage.
|
||||||
# Having this variable global on the other hand can only cause
|
# Having this variable global on the other hand can only cause
|
||||||
# few bogus key reloads [if any at all on single-CPU system],
|
# few bogus key reloads [if any at all on single-CPU system],
|
||||||
# so we accept the panalty...
|
# so we accept the penalty...
|
||||||
&set_label("padlock_saved_context",4);
|
&set_label("padlock_saved_context",4);
|
||||||
&data_word(0);
|
&data_word(0);
|
||||||
|
|
||||||
|
@ -151,15 +151,15 @@ padlock_sha1_oneshot:
|
|||||||
ret
|
ret
|
||||||
.size padlock_sha1_oneshot,.-padlock_sha1_oneshot
|
.size padlock_sha1_oneshot,.-padlock_sha1_oneshot
|
||||||
|
|
||||||
.globl padlock_sha1
|
.globl padlock_sha1_blocks
|
||||||
.type padlock_sha1,\@function,3
|
.type padlock_sha1_blocks,\@function,3
|
||||||
.align 16
|
.align 16
|
||||||
padlock_sha1:
|
padlock_sha1_blocks:
|
||||||
mov \$-1,%rax
|
mov \$-1,%rax
|
||||||
mov %rdx,%rcx
|
mov %rdx,%rcx
|
||||||
.byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
|
.byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
|
||||||
ret
|
ret
|
||||||
.size padlock_sha1,.-padlock_sha1
|
.size padlock_sha1_blocks,.-padlock_sha1_blocks
|
||||||
|
|
||||||
.globl padlock_sha256_oneshot
|
.globl padlock_sha256_oneshot
|
||||||
.type padlock_sha256_oneshot,\@function,3
|
.type padlock_sha256_oneshot,\@function,3
|
||||||
@ -171,15 +171,23 @@ padlock_sha256_oneshot:
|
|||||||
ret
|
ret
|
||||||
.size padlock_sha256_oneshot,.-padlock_sha256_oneshot
|
.size padlock_sha256_oneshot,.-padlock_sha256_oneshot
|
||||||
|
|
||||||
.globl padlock_sha256
|
.globl padlock_sha256_blocks
|
||||||
.type padlock_sha256,\@function,3
|
.type padlock_sha256_blocks,\@function,3
|
||||||
.align 16
|
.align 16
|
||||||
padlock_sha256:
|
padlock_sha256_blocks:
|
||||||
mov \$-1,%rax
|
mov \$-1,%rax
|
||||||
mov %rdx,%rcx
|
mov %rdx,%rcx
|
||||||
.byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
|
.byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
|
||||||
ret
|
ret
|
||||||
.size padlock_sha256,.-padlock_sha256
|
.size padlock_sha256_blocks,.-padlock_sha256_blocks
|
||||||
|
|
||||||
|
.globl padlock_sha512_blocks,\@function,3
|
||||||
|
.align 16
|
||||||
|
padlock_sha512_blocks:
|
||||||
|
mov %rdx,%rcx
|
||||||
|
.byte 0xf3,0x0f,0xa6,0xe0 # rep xsha512
|
||||||
|
ret
|
||||||
|
.size padlock_sha512_blocks,.-padlock_sha512_blocks
|
||||||
___
|
___
|
||||||
|
|
||||||
sub generate_mode {
|
sub generate_mode {
|
||||||
@ -207,6 +215,7 @@ padlock_${mode}_encrypt:
|
|||||||
xor %eax,%eax
|
xor %eax,%eax
|
||||||
xor %ebx,%ebx
|
xor %ebx,%ebx
|
||||||
testl \$`1<<5`,($ctx) # align bit in control word
|
testl \$`1<<5`,($ctx) # align bit in control word
|
||||||
|
jnz .L${mode}_aligned
|
||||||
test \$0x0f,$out
|
test \$0x0f,$out
|
||||||
setz %al # !out_misaligned
|
setz %al # !out_misaligned
|
||||||
test \$0x0f,$inp
|
test \$0x0f,$inp
|
||||||
|
Loading…
x
Reference in New Issue
Block a user