aes/asm/aesni-*.pl: fix CCM and further optimize it.

modes/ccm128.c: minor branch optimization.
Committed by Andy Polyakov on 2011-08-07 17:47:56 +00:00.
parent 8a8cc84f74
commit 267b481c47
3 changed files with 83 additions and 73 deletions

View File

@ -594,6 +594,7 @@ if ($PREFIX eq "aesni") {
&movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
&movdqu ($cmac,&QWP(0,$rounds)); # load cmac &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
&mov ($rounds,&DWP(240,$key));
# compose byte-swap control mask for pshufb on stack # compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(0,"esp"),0x0c0d0e0f);
@ -602,34 +603,29 @@ if ($PREFIX eq "aesni") {
&mov (&DWP(12,"esp"),0x00010203); &mov (&DWP(12,"esp"),0x00010203);
# compose counter increment vector on stack # compose counter increment vector on stack
&mov ($rounds,1); &mov ($rounds_,1);
&xor ($key_,$key_); &xor ($key_,$key_);
&mov (&DWP(16,"esp"),$rounds); &mov (&DWP(16,"esp"),$rounds_);
&mov (&DWP(20,"esp"),$key_); &mov (&DWP(20,"esp"),$key_);
&mov (&DWP(24,"esp"),$key_); &mov (&DWP(24,"esp"),$key_);
&mov (&DWP(28,"esp"),$key_); &mov (&DWP(28,"esp"),$key_);
&movdqa ($inout3,&QWP(0,"esp")); &shr ($rounds,1);
&pshufb ($ivec,$inout3); # keep iv in reverse order &lea ($key_,&DWP(0,$key));
&mov ($rounds,&DWP(240,$key));
&mov ($key_,$key);
&mov ($rounds_,$rounds);
&movdqa ($inout0,$ivec); &movdqa ($inout0,$ivec);
&mov ($rounds_,$rounds);
&movdqa ($inout3,&QWP(0,"esp"));
&set_label("ccm64_enc_outer"); &set_label("ccm64_enc_outer");
&movups ($in0,&QWP(0,$inp)); &$movekey ($rndkey0,&QWP(0,$key_));
&pshufb ($inout0,$inout3);
&mov ($key,$key_);
&mov ($rounds,$rounds_); &mov ($rounds,$rounds_);
&movups ($in0,&QWP(0,$inp));
&$movekey ($rndkey0,&QWP(0,$key));
&shr ($rounds,1);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($in0,$rndkey0);
&lea ($key,&DWP(32,$key));
&xorps ($inout0,$rndkey0); &xorps ($inout0,$rndkey0);
&xorps ($cmac,$in0); # cmac^=inp &$movekey ($rndkey1,&QWP(16,$key_));
&xorps ($rndkey0,$in0);
&lea ($key,&DWP(32,$key_));
&xorps ($cmac,$rndkey0); # cmac^=inp
&$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey0,&QWP(0,$key));
&set_label("ccm64_enc2_loop"); &set_label("ccm64_enc2_loop");
@ -642,18 +638,20 @@ if ($PREFIX eq "aesni") {
&aesenc ($cmac,$rndkey0); &aesenc ($cmac,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey0,&QWP(0,$key));
&jnz (&label("ccm64_enc2_loop")); &jnz (&label("ccm64_enc2_loop"));
&pshufb ($ivec,$inout3);
&aesenc ($inout0,$rndkey1); &aesenc ($inout0,$rndkey1);
&aesenc ($cmac,$rndkey1); &aesenc ($cmac,$rndkey1);
&paddq ($ivec,&QWP(16,"esp"));
&aesenclast ($inout0,$rndkey0); &aesenclast ($inout0,$rndkey0);
&aesenclast ($cmac,$rndkey0); &aesenclast ($cmac,$rndkey0);
&paddq ($ivec,&QWP(16,"esp"));
&dec ($len); &dec ($len);
&lea ($inp,&DWP(16,$inp)); &lea ($inp,&DWP(16,$inp));
&xorps ($in0,$inout0); # inp^=E(ivec) &xorps ($in0,$inout0); # inp^=E(ivec)
&movdqa ($inout0,$ivec); &movdqa ($inout0,$ivec);
&movups (&QWP(0,$out),$in0); &movups (&QWP(0,$out),$in0); # save output
&lea ($out,&DWP(16,$out)); &lea ($out,&DWP(16,$out));
&pshufb ($ivec,$inout3);
&jnz (&label("ccm64_enc_outer")); &jnz (&label("ccm64_enc_outer"));
&mov ("esp",&DWP(48,"esp")); &mov ("esp",&DWP(48,"esp"));
@ -675,6 +673,7 @@ if ($PREFIX eq "aesni") {
&movdqu ($ivec,&QWP(0,$rounds_)); # load ivec &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
&movdqu ($cmac,&QWP(0,$rounds)); # load cmac &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
&mov ($rounds,&DWP(240,$key));
# compose byte-swap control mask for pshufb on stack # compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f); &mov (&DWP(0,"esp"),0x0c0d0e0f);
@ -683,46 +682,45 @@ if ($PREFIX eq "aesni") {
&mov (&DWP(12,"esp"),0x00010203); &mov (&DWP(12,"esp"),0x00010203);
# compose counter increment vector on stack # compose counter increment vector on stack
&mov ($rounds,1); &mov ($rounds_,1);
&xor ($key_,$key_); &xor ($key_,$key_);
&mov (&DWP(16,"esp"),$rounds); &mov (&DWP(16,"esp"),$rounds_);
&mov (&DWP(20,"esp"),$key_); &mov (&DWP(20,"esp"),$key_);
&mov (&DWP(24,"esp"),$key_); &mov (&DWP(24,"esp"),$key_);
&mov (&DWP(28,"esp"),$key_); &mov (&DWP(28,"esp"),$key_);
&movdqa ($inout3,&QWP(0,"esp")); # bswap mask &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
&movdqa ($inout0,$ivec); &movdqa ($inout0,$ivec);
&pshufb ($ivec,$inout3); # keep iv in reverse order
&mov ($rounds,&DWP(240,$key));
&mov ($key_,$key); &mov ($key_,$key);
&mov ($rounds_,$rounds); &mov ($rounds_,$rounds);
&pshufb ($ivec,$inout3);
if ($inline) if ($inline)
{ &aesni_inline_generate1("enc"); } { &aesni_inline_generate1("enc"); }
else else
{ &call ("_aesni_encrypt1"); } { &call ("_aesni_encrypt1"); }
&set_label("ccm64_dec_outer");
&paddq ($ivec,&QWP(16,"esp"));
&movups ($in0,&QWP(0,$inp)); # load inp &movups ($in0,&QWP(0,$inp)); # load inp
&xorps ($in0,$inout0); &paddq ($ivec,&QWP(16,"esp"));
&movdqa ($inout0,$ivec); &pshufb ($ivec,$inout3);
&lea ($inp,&QWP(16,$inp)); &lea ($inp,&QWP(16,$inp));
&pshufb ($inout0,$inout3); &jmp (&label("ccm64_dec_outer"));
&mov ($key,$key_);
&set_label("ccm64_dec_outer",16);
&xorps ($in0,$inout0); # inp ^= E(ivec)
&movdqa ($inout0,$ivec);
&mov ($rounds,$rounds_); &mov ($rounds,$rounds_);
&movups (&QWP(0,$out),$in0); &movups (&QWP(0,$out),$in0); # save output
&lea ($out,&DWP(16,$out)); &lea ($out,&DWP(16,$out));
&sub ($len,1); &sub ($len,1);
&jz (&label("ccm64_dec_break")); &jz (&label("ccm64_dec_break"));
&$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey0,&QWP(0,$key_));
&shr ($rounds,1); &shr ($rounds,1);
&$movekey ($rndkey1,&QWP(16,$key)); &$movekey ($rndkey1,&QWP(16,$key_));
&xorps ($in0,$rndkey0); &xorps ($in0,$rndkey0);
&lea ($key,&DWP(32,$key)); &lea ($key,&DWP(32,$key_));
&xorps ($inout0,$rndkey0); &xorps ($inout0,$rndkey0);
&xorps ($cmac,$in0); # cmac^=out &xorps ($cmac,$in0); # cmac^=out
&$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey0,&QWP(0,$key));
@ -737,13 +735,18 @@ if ($PREFIX eq "aesni") {
&aesenc ($cmac,$rndkey0); &aesenc ($cmac,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key)); &$movekey ($rndkey0,&QWP(0,$key));
&jnz (&label("ccm64_dec2_loop")); &jnz (&label("ccm64_dec2_loop"));
&movups ($in0,&QWP(0,$inp)); # load inp
&paddq ($ivec,&QWP(16,"esp"));
&aesenc ($inout0,$rndkey1); &aesenc ($inout0,$rndkey1);
&aesenc ($cmac,$rndkey1); &aesenc ($cmac,$rndkey1);
&pshufb ($ivec,$inout3);
&lea ($inp,&QWP(16,$inp));
&aesenclast ($inout0,$rndkey0); &aesenclast ($inout0,$rndkey0);
&aesenclast ($cmac,$rndkey0); &aesenclast ($cmac,$rndkey0);
&jmp (&label("ccm64_dec_outer")); &jmp (&label("ccm64_dec_outer"));
&set_label("ccm64_dec_break",16); &set_label("ccm64_dec_break",16);
&mov ($key,$key_);
if ($inline) if ($inline)
{ &aesni_inline_generate1("enc",$cmac,$in0); } { &aesni_inline_generate1("enc",$cmac,$in0); }
else else

View File

@ -821,8 +821,8 @@ ___
{ {
my $cmac="%r9"; # 6th argument my $cmac="%r9"; # 6th argument
my $increment="%xmm8"; my $increment="%xmm6";
my $bswap_mask="%xmm9"; my $bswap_mask="%xmm7";
$code.=<<___; $code.=<<___;
.globl aesni_ccm64_encrypt_blocks .globl aesni_ccm64_encrypt_blocks
@ -839,30 +839,28 @@ $code.=<<___ if ($win64);
.Lccm64_enc_body: .Lccm64_enc_body:
___ ___
$code.=<<___; $code.=<<___;
mov 240($key),$rounds # key->rounds
movdqu ($ivp),$iv movdqu ($ivp),$iv
movdqu ($cmac),$inout1
movdqa .Lincrement64(%rip),$increment movdqa .Lincrement64(%rip),$increment
movdqa .Lbswap_mask(%rip),$bswap_mask movdqa .Lbswap_mask(%rip),$bswap_mask
pshufb $bswap_mask,$iv # keep iv in reverse order
mov 240($key),$rounds # key->rounds
mov $key,$key_
mov $rounds,$rnds_
movdqa $iv,$inout0
.Lccm64_enc_outer:
movups ($inp),$in0 # load inp
pshufb $bswap_mask,$inout0
mov $key_,$key
mov $rnds_,$rounds
$movkey ($key),$rndkey0
shr \$1,$rounds shr \$1,$rounds
$movkey 16($key),$rndkey1 lea 0($key),$key_
xorps $rndkey0,$in0 movdqu ($cmac),$inout1
lea 32($key),$key movdqa $iv,$inout0
xorps $rndkey0,$inout0 mov $rounds,$rnds_
xorps $inout1,$in0 # cmac^=inp jmp .Lccm64_enc_outer
.align 16
.Lccm64_enc_outer:
$movkey ($key_),$rndkey0
mov $rnds_,$rounds
movups ($inp),$in0 # load inp
xorps $rndkey0,$inout0 # counter
$movkey 16($key_),$rndkey1
xorps $in0,$rndkey0
lea 32($key_),$key
xorps $rndkey0,$inout1 # cmac^=inp
$movkey ($key),$rndkey0 $movkey ($key),$rndkey0
.Lccm64_enc2_loop: .Lccm64_enc2_loop:
@ -875,18 +873,20 @@ $code.=<<___;
aesenc $rndkey0,$inout1 aesenc $rndkey0,$inout1
$movkey 0($key),$rndkey0 $movkey 0($key),$rndkey0
jnz .Lccm64_enc2_loop jnz .Lccm64_enc2_loop
pshufb $bswap_mask,$iv
aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout1
paddq $increment,$iv
aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout0
aesenclast $rndkey0,$inout1 aesenclast $rndkey0,$inout1
paddq $increment,$iv
dec $len dec $len
lea 16($inp),$inp lea 16($inp),$inp
xorps $inout0,$in0 # inp ^= E(iv) xorps $inout0,$in0 # inp ^= E(iv)
movdqa $iv,$inout0 movdqa $iv,$inout0
movups $in0,($out) # save output movups $in0,($out) # save output
lea 16($out),$out lea 16($out),$out
pshufb $bswap_mask,$iv
jnz .Lccm64_enc_outer jnz .Lccm64_enc_outer
movups $inout1,($cmac) movups $inout1,($cmac)
@ -919,39 +919,40 @@ $code.=<<___ if ($win64);
.Lccm64_dec_body: .Lccm64_dec_body:
___ ___
$code.=<<___; $code.=<<___;
movdqu ($ivp),$iv mov 240($key),$rounds # key->rounds
movups ($ivp),$iv
movdqu ($cmac),$inout1 movdqu ($cmac),$inout1
movdqa .Lincrement64(%rip),$increment movdqa .Lincrement64(%rip),$increment
movdqa .Lbswap_mask(%rip),$bswap_mask movdqa .Lbswap_mask(%rip),$bswap_mask
mov 240($key),$rounds # key->rounds movaps $iv,$inout0
movdqa $iv,$inout0
pshufb $bswap_mask,$iv # keep iv in reverse order
mov $rounds,$rnds_ mov $rounds,$rnds_
mov $key,$key_ mov $key,$key_
pshufb $bswap_mask,$iv
___ ___
&aesni_generate1("enc",$key,$rounds); &aesni_generate1("enc",$key,$rounds);
$code.=<<___; $code.=<<___;
.Lccm64_dec_outer:
paddq $increment,$iv
movups ($inp),$in0 # load inp movups ($inp),$in0 # load inp
xorps $inout0,$in0 paddq $increment,$iv
movdqa $iv,$inout0 pshufb $bswap_mask,$iv
lea 16($inp),$inp lea 16($inp),$inp
pshufb $bswap_mask,$inout0 jmp .Lccm64_dec_outer
mov $key_,$key .align 16
.Lccm64_dec_outer:
xorps $inout0,$in0 # inp ^= E(iv)
movdqa $iv,$inout0
mov $rnds_,$rounds mov $rnds_,$rounds
movups $in0,($out) movups $in0,($out) # save output
lea 16($out),$out lea 16($out),$out
sub \$1,$len sub \$1,$len
jz .Lccm64_dec_break jz .Lccm64_dec_break
$movkey ($key),$rndkey0 $movkey ($key_),$rndkey0
shr \$1,$rounds shr \$1,$rounds
$movkey 16($key),$rndkey1 $movkey 16($key_),$rndkey1
xorps $rndkey0,$in0 xorps $rndkey0,$in0
lea 32($key),$key lea 32($key_),$key
xorps $rndkey0,$inout0 xorps $rndkey0,$inout0
xorps $in0,$inout1 # cmac^=out xorps $in0,$inout1 # cmac^=out
$movkey ($key),$rndkey0 $movkey ($key),$rndkey0
@ -966,15 +967,21 @@ $code.=<<___;
aesenc $rndkey0,$inout1 aesenc $rndkey0,$inout1
$movkey 0($key),$rndkey0 $movkey 0($key),$rndkey0
jnz .Lccm64_dec2_loop jnz .Lccm64_dec2_loop
movups ($inp),$in0 # load inp
paddq $increment,$iv
aesenc $rndkey1,$inout0 aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1 aesenc $rndkey1,$inout1
pshufb $bswap_mask,$iv
lea 16($inp),$inp
aesenclast $rndkey0,$inout0 aesenclast $rndkey0,$inout0
aesenclast $rndkey0,$inout1
jmp .Lccm64_dec_outer jmp .Lccm64_dec_outer
.align 16 .align 16
.Lccm64_dec_break: .Lccm64_dec_break:
#xorps $in0,$inout1 # cmac^=out
___ ___
&aesni_generate1("enc",$key,$rounds,$inout1); &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___; $code.=<<___;
movups $inout1,($cmac) movups $inout1,($cmac)
___ ___

View File

@ -356,10 +356,10 @@ int CRYPTO_ccm128_encrypt_ccm64(CCM128_CONTEXT *ctx,
inp += n; inp += n;
out += n; out += n;
len -= n; len -= n;
if (len) ctr64_add(ctx->nonce.c,n/16);
} }
if (len) { if (len) {
if (n) ctr64_add(ctx->nonce.c,n/16);
for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i]; for (i=0; i<len; ++i) ctx->cmac.c[i] ^= inp[i];
(*block)(ctx->cmac.c,ctx->cmac.c,key); (*block)(ctx->cmac.c,ctx->cmac.c,key);
(*block)(ctx->nonce.c,scratch.c,key); (*block)(ctx->nonce.c,scratch.c,key);
@ -409,10 +409,10 @@ int CRYPTO_ccm128_decrypt_ccm64(CCM128_CONTEXT *ctx,
inp += n; inp += n;
out += n; out += n;
len -= n; len -= n;
if (len) ctr64_add(ctx->nonce.c,n/16);
} }
if (len) { if (len) {
if (n) ctr64_add(ctx->nonce.c,n/16);
(*block)(ctx->nonce.c,scratch.c,key); (*block)(ctx->nonce.c,scratch.c,key);
for (i=0; i<len; ++i) for (i=0; i<len; ++i)
ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]); ctx->cmac.c[i] ^= (out[i] = scratch.c[i]^inp[i]);