GCM "jumbo" update:
- gcm128.c: support for Intel PCLMULQDQ, readability improvements; - asm/ghash-x86.pl: splitted vanilla, MMX, PCLMULQDQ subroutines; - asm/ghash-x86_64.pl: add PCLMULQDQ implementations.
parent ea7239cf15
commit c1f092d14e
@ -23,7 +23,7 @@
|
||||
# PIII 63 /77 16 24
|
||||
# P4 96 /122 30 84(***)
|
||||
# Opteron 50 /71 21 30
|
||||
# Core2 54 /68 13 18
|
||||
# Core2 54 /68 12.5 18
|
||||
#
|
||||
# (*) gcc 3.4.x was observed to generate a few percent slower code,
|
||||
# which is one of reasons why 2.95.3 results were chosen,
|
||||
@ -33,124 +33,84 @@
|
||||
# position-independent;
|
||||
# (***) see comment in non-MMX routine for further details;
|
||||
#
|
||||
# To summarize, it's 2-3 times faster than gcc-generated code. To
|
||||
# To summarize, it's >2-3 times faster than gcc-generated code. To
|
||||
# anchor it to something else SHA1 assembler processes one byte in
|
||||
# 11-13 cycles on contemporary x86 cores.
|
||||
|
||||
# May 2010
|
||||
#
|
||||
# Add PCLMULQDQ version performing at 2.13 cycles per processed byte.
|
||||
# The question is how close is it to theoretical limit? The pclmulqdq
|
||||
# instruction latency appears to be 14 cycles and there can't be more
|
||||
# than 2 of them executing at any given time. This means that single
|
||||
# Karatsuba multiplication would take 28 cycles *plus* few cycles for
|
||||
# pre- and post-processing. Then multiplication has to be followed by
|
||||
# modulo-reduction. Given that aggregated reduction method [see
|
||||
# "Carry-less Multiplication and Its Usage for Computing the GCM Mode"
|
||||
# white paper by Intel] allows you to perform reduction only once in
|
||||
# a while we can assume that asymptotic performance can be estimated
|
||||
# as (28+Tmod/Naggr)/16, where Tmod is time to perform reduction
|
||||
# and Naggr is the aggregation factor.
|
||||
#
|
||||
# Before we proceed to this implementation let's have closer look at
|
||||
# the best-performing code suggested by Intel in their white paper.
|
||||
# By tracing inter-register dependencies Tmod is estimated as ~19
|
||||
# cycles and Naggr is 4, resulting in 2.05 cycles per processed byte.
|
||||
# As implied, this is quite optimistic estimate, because it does not
|
||||
# account for Karatsuba pre- and post-processing, which for a single
|
||||
# multiplication is ~5 cycles. Unfortunately Intel does not provide
|
||||
# performance data for GHASH alone, only for fused GCM mode. But
|
||||
# we can estimate it by subtracting CTR performance result provided
|
||||
# in "AES Instruction Set" white paper: 3.54-1.38=2.16 cycles per
|
||||
# processed byte or 5% off the estimate. It should be noted though
|
||||
# that 3.54 is GCM result for 16KB block size, while 1.38 is CTR for
|
||||
# 1KB block size, meaning that real number is likely to be a bit
|
||||
# further from estimate.
|
||||
#
|
||||
# Moving on to the implementation in question. Tmod is estimated as
|
||||
# ~13 cycles and Naggr is 2, giving asymptotic performance of ...
|
||||
# 2.16. How is it possible that measured performance is better than
|
||||
# optimistic theoretical estimate? There is one thing Intel failed
|
||||
# to recognize. By fusing GHASH with CTR former's performance is
|
||||
# really limited to above (Tmul + Tmod/Naggr) equation. But if GHASH
|
||||
# procedure is detached, the modulo-reduction can be interleaved with
|
||||
# Naggr-1 multiplications and under ideal conditions even disappear
|
||||
# from the equation. So that optimistic theoretical estimate for this
|
||||
# implementation is ... 28/16=1.75, and not 2.16. Well, it's probably
|
||||
# way too optimistic, at least for such small Naggr. I'd argue that
|
||||
# (28+Tproc/Naggr), where Tproc is time required for Karatsuba pre-
|
||||
# and post-processing, is more realistic estimate. In this case it
|
||||
# gives ... 1.91 cycles per processed byte. Or in other words,
|
||||
# depending on how well we can interleave reduction and one of the
|
||||
# two multiplications the performance should be between 1.91 and 2.16.
|
||||
# As already mentioned, this implementation processes one byte [out
|
||||
# of 1KB buffer] in 2.13 cycles, while x86_64 counterpart - in 2.07.
|
||||
# x86_64 performance is better, because larger register bank allows
|
||||
# to interleave reduction and multiplication better.
|
||||
#
|
||||
# Does it make sense to increase Naggr? To start with it's virtually
|
||||
# impossible in 32-bit mode, because of limited register bank
|
||||
# capacity. Otherwise improvement has to be weighed against slower
|
||||
# setup, as well as code size and complexity increase. As even
|
||||
# optimistic estimate doesn't promise 30% performance improvement,
|
||||
# there are currently no plans to increase Naggr.
|
||||
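#
# (Recap of the estimates above, added for convenience; the numbers are
#  the ones quoted in this comment, nothing new:
#  Intel's code:                  (28 + 19/4)/16 ~= 2.05 cycles/byte;
#  this code, asymptotically:     (28 + 13/2)/16 ~= 2.16 cycles/byte;
#  reduction fully interleaved:    28/16          = 1.75 cycles/byte;
#  counting Karatsuba pre/post:   (28 +  5/2)/16 ~= 1.91 cycles/byte.)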
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
||||
&asm_init($ARGV[0],"gcm-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
|
||||
&asm_init($ARGV[0],"ghash-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
|
||||
|
||||
&static_label("rem_4bit") if (!$x86only);
|
||||
$sse2=0;
|
||||
for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
||||
|
||||
$Zhh = "ebp";
|
||||
$Zhl = "edx";
|
||||
$Zlh = "ecx";
|
||||
$Zll = "ebx";
|
||||
($Zhh,$Zhl,$Zlh,$Zll) = ("ebp","edx","ecx","ebx");
|
||||
$inp = "edi";
|
||||
$Htbl = "esi";
|
||||
|
||||
|
||||
$unroll = 0; # Affects x86 loop. Folded loop performs ~7% worse
|
||||
# than unrolled, which has to be weighed against
|
||||
# 1.7x code size reduction. Well, *overall* 1.7x,
|
||||
# x86-specific code itself shrinks by 2.5x...
|
||||
|
||||
sub mmx_loop() {
|
||||
# MMX version performs 2.8 times better on P4 (see comment in non-MMX
|
||||
# routine for further details), 40% better on Opteron, 50% better
|
||||
# on PIII and Core2... In other words effort is considered to be well
|
||||
# spent...
|
||||
my $inp = shift;
|
||||
my $rem_4bit = shift;
|
||||
my $cnt = $Zhh;
|
||||
my $nhi = $Zhl;
|
||||
my $nlo = $Zlh;
|
||||
my $rem = $Zll;
|
||||
|
||||
my $Zlo = "mm0";
|
||||
my $Zhi = "mm1";
|
||||
my $tmp = "mm2";
|
||||
|
||||
&xor ($nlo,$nlo); # avoid partial register stalls on PIII
|
||||
&mov ($nhi,$Zll);
|
||||
&mov (&LB($nlo),&LB($nhi));
|
||||
&mov ($cnt,14);
|
||||
&shl (&LB($nlo),4);
|
||||
&and ($nhi,0xf0);
|
||||
&movq ($Zlo,&QWP(8,$Htbl,$nlo));
|
||||
&movq ($Zhi,&QWP(0,$Htbl,$nlo));
|
||||
&movd ($rem,$Zlo);
|
||||
&jmp (&label("mmx_loop"));
|
||||
|
||||
&set_label("mmx_loop",16);
|
||||
&psrlq ($Zlo,4);
|
||||
&and ($rem,0xf);
|
||||
&pxor ($Zlo,&QWP(8,$Htbl,$nhi));
|
||||
&movq ($tmp,$Zhi);
|
||||
&psrlq ($Zhi,4);
|
||||
&mov (&LB($nlo),&BP(0,$inp,$cnt));
|
||||
&dec ($cnt);
|
||||
&psllq ($tmp,60);
|
||||
&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
|
||||
&movd ($rem,$Zlo);
|
||||
&pxor ($Zhi,&QWP(0,$Htbl,$nhi));
|
||||
&mov ($nhi,$nlo);
|
||||
&pxor ($Zlo,$tmp);
|
||||
&js (&label("mmx_break"));
|
||||
|
||||
&shl (&LB($nlo),4);
|
||||
&and ($rem,0xf);
|
||||
&psrlq ($Zlo,4);
|
||||
&and ($nhi,0xf0);
|
||||
&movq ($tmp,$Zhi);
|
||||
&psrlq ($Zhi,4);
|
||||
&pxor ($Zlo,&QWP(8,$Htbl,$nlo));
|
||||
&psllq ($tmp,60);
|
||||
&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
|
||||
&movd ($rem,$Zlo);
|
||||
&pxor ($Zhi,&QWP(0,$Htbl,$nlo));
|
||||
&pxor ($Zlo,$tmp);
|
||||
&jmp (&label("mmx_loop"));
|
||||
|
||||
&set_label("mmx_break",16);
|
||||
&shl (&LB($nlo),4);
|
||||
&and ($rem,0xf);
|
||||
&psrlq ($Zlo,4);
|
||||
&and ($nhi,0xf0);
|
||||
&movq ($tmp,$Zhi);
|
||||
&psrlq ($Zhi,4);
|
||||
&pxor ($Zlo,&QWP(8,$Htbl,$nlo));
|
||||
&psllq ($tmp,60);
|
||||
&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
|
||||
&movd ($rem,$Zlo);
|
||||
&pxor ($Zhi,&QWP(0,$Htbl,$nlo));
|
||||
&pxor ($Zlo,$tmp);
|
||||
|
||||
&psrlq ($Zlo,4);
|
||||
&and ($rem,0xf);
|
||||
&pxor ($Zlo,&QWP(8,$Htbl,$nhi));
|
||||
&movq ($tmp,$Zhi);
|
||||
&psrlq ($Zhi,4);
|
||||
&psllq ($tmp,60);
|
||||
&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
|
||||
&movd ($rem,$Zlo);
|
||||
&pxor ($Zhi,&QWP(0,$Htbl,$nhi));
|
||||
&mov ($nhi,$nlo);
|
||||
&pxor ($Zlo,$tmp);
|
||||
|
||||
&psrlq ($Zlo,32); # lower part of Zlo is already there
|
||||
&movd ($Zhl,$Zhi);
|
||||
&psrlq ($Zhi,32);
|
||||
&movd ($Zlh,$Zlo);
|
||||
&movd ($Zhh,$Zhi);
|
||||
|
||||
&bswap ($Zll);
|
||||
&bswap ($Zhl);
|
||||
&bswap ($Zlh);
|
||||
&bswap ($Zhh);
|
||||
}
|
||||
# 2.5x x86-specific code size reduction.
|
||||
|
||||
sub x86_loop {
|
||||
my $off = shift;
|
||||
@ -245,33 +205,30 @@ if ($unroll) {
|
||||
&function_end_B("_x86_gmult_4bit_inner");
|
||||
}
|
||||
|
||||
&function_begin("gcm_gmult_4bit");
|
||||
if (!$x86only) {
|
||||
&call (&label("pic_point"));
|
||||
&set_label("pic_point");
|
||||
&blindpop("eax");
|
||||
&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
|
||||
&bt (&DWP(0,"ebp"),23); # check for MMX bit
|
||||
&jnc (&label("x86"));
|
||||
sub deposit_rem_4bit {
|
||||
my $bias = shift;
|
||||
|
||||
&lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
|
||||
&mov (&DWP($bias+0, "esp"),0x0000<<16);
|
||||
&mov (&DWP($bias+4, "esp"),0x1C20<<16);
|
||||
&mov (&DWP($bias+8, "esp"),0x3840<<16);
|
||||
&mov (&DWP($bias+12,"esp"),0x2460<<16);
|
||||
&mov (&DWP($bias+16,"esp"),0x7080<<16);
|
||||
&mov (&DWP($bias+20,"esp"),0x6CA0<<16);
|
||||
&mov (&DWP($bias+24,"esp"),0x48C0<<16);
|
||||
&mov (&DWP($bias+28,"esp"),0x54E0<<16);
|
||||
&mov (&DWP($bias+32,"esp"),0xE100<<16);
|
||||
&mov (&DWP($bias+36,"esp"),0xFD20<<16);
|
||||
&mov (&DWP($bias+40,"esp"),0xD940<<16);
|
||||
&mov (&DWP($bias+44,"esp"),0xC560<<16);
|
||||
&mov (&DWP($bias+48,"esp"),0x9180<<16);
|
||||
&mov (&DWP($bias+52,"esp"),0x8DA0<<16);
|
||||
&mov (&DWP($bias+56,"esp"),0xA9C0<<16);
|
||||
&mov (&DWP($bias+60,"esp"),0xB5E0<<16);
|
||||
}
|
||||
|
||||
$suffix = $x86only ? "" : "_x86";
|
||||
|
||||
&mov ($inp,&wparam(0)); # load Xi
|
||||
&mov ($Htbl,&wparam(1)); # load Htable
|
||||
|
||||
&movz ($Zll,&BP(15,$inp));
|
||||
|
||||
&mmx_loop($inp,"eax");
|
||||
|
||||
&emms ();
|
||||
&mov (&DWP(12,$inp),$Zll);
|
||||
&mov (&DWP(4,$inp),$Zhl);
|
||||
&mov (&DWP(8,$inp),$Zlh);
|
||||
&mov (&DWP(0,$inp),$Zhh);
|
||||
|
||||
&function_end_A();
|
||||
&set_label("x86",16);
|
||||
}
|
||||
&function_begin("gcm_gmult_4bit".$suffix);
|
||||
&stack_push(16+4+1); # +1 for stack alignment
|
||||
&mov ($inp,&wparam(0)); # load Xi
|
||||
&mov ($Htbl,&wparam(1)); # load Htable
|
||||
@ -302,62 +259,9 @@ if ($unroll) {
|
||||
&mov (&DWP(4,$inp),$Zhl);
|
||||
&mov (&DWP(0,$inp),$Zhh);
|
||||
&stack_pop(16+4+1);
|
||||
&function_end("gcm_gmult_4bit");
|
||||
&function_end("gcm_gmult_4bit".$suffix);
|
||||
|
||||
# Streamed version performs 20% better on P4, 7% on Opteron,
|
||||
# 10% on Core2 and PIII...
|
||||
&function_begin("gcm_ghash_4bit");
|
||||
if (!$x86only) {
|
||||
&call (&label("pic_point"));
|
||||
&set_label("pic_point");
|
||||
&blindpop("eax");
|
||||
&picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point"));
|
||||
&bt (&DWP(0,"ebp"),23); # check for MMX bit
|
||||
&jnc (&label("x86"));
|
||||
|
||||
&lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
|
||||
|
||||
&mov ($Zhh,&wparam(0)); # load Xi
|
||||
&mov ($Htbl,&wparam(1)); # load Htable
|
||||
&mov ($inp,&wparam(2)); # load in
|
||||
&mov ($Zlh,&wparam(3)); # load len
|
||||
&add ($Zlh,$inp);
|
||||
&mov (&wparam(3),$Zlh); # len to point at the end of input
|
||||
&stack_push(4+1); # +1 for stack alignment
|
||||
&mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
|
||||
&mov ($Zhl,&DWP(4,$Zhh));
|
||||
&mov ($Zlh,&DWP(8,$Zhh));
|
||||
&mov ($Zhh,&DWP(0,$Zhh));
|
||||
|
||||
&set_label("mmx_outer_loop",16);
|
||||
&xor ($Zll,&DWP(12,$inp));
|
||||
&xor ($Zhl,&DWP(4,$inp));
|
||||
&xor ($Zlh,&DWP(8,$inp));
|
||||
&xor ($Zhh,&DWP(0,$inp));
|
||||
&mov (&DWP(12,"esp"),$Zll);
|
||||
&mov (&DWP(4,"esp"),$Zhl);
|
||||
&mov (&DWP(8,"esp"),$Zlh);
|
||||
&mov (&DWP(0,"esp"),$Zhh);
|
||||
|
||||
&shr ($Zll,24);
|
||||
|
||||
&mmx_loop("esp","eax");
|
||||
|
||||
&lea ($inp,&DWP(16,$inp));
|
||||
&cmp ($inp,&wparam(3));
|
||||
&jb (&label("mmx_outer_loop"));
|
||||
|
||||
&mov ($inp,&wparam(0)); # load Xi
|
||||
&emms ();
|
||||
&mov (&DWP(12,$inp),$Zll);
|
||||
&mov (&DWP(4,$inp),$Zhl);
|
||||
&mov (&DWP(8,$inp),$Zlh);
|
||||
&mov (&DWP(0,$inp),$Zhh);
|
||||
|
||||
&stack_pop(4+1);
|
||||
&function_end_A();
|
||||
&set_label("x86",16);
|
||||
}
|
||||
&function_begin("gcm_ghash_4bit".$suffix);
|
||||
&stack_push(16+4+1); # +1 for 64-bit alignment
|
||||
&mov ($Zll,&wparam(0)); # load Xi
|
||||
&mov ($Htbl,&wparam(1)); # load Htable
|
||||
@ -403,35 +307,652 @@ if ($unroll) {
|
||||
&mov (&DWP(4,$inp),$Zhl);
|
||||
&mov (&DWP(0,$inp),$Zhh);
|
||||
&stack_pop(16+4+1);
|
||||
&function_end("gcm_ghash_4bit");
|
||||
&function_end("gcm_ghash_4bit".$suffix);
|
||||
|
||||
if (!$x86only) {{{
|
||||
|
||||
sub deposit_rem_4bit {
|
||||
my $bias = shift;
|
||||
&static_label("rem_4bit");
|
||||
|
||||
&mov (&DWP($bias+0, "esp"),0x0000<<16);
|
||||
&mov (&DWP($bias+4, "esp"),0x1C20<<16);
|
||||
&mov (&DWP($bias+8, "esp"),0x3840<<16);
|
||||
&mov (&DWP($bias+12,"esp"),0x2460<<16);
|
||||
&mov (&DWP($bias+16,"esp"),0x7080<<16);
|
||||
&mov (&DWP($bias+20,"esp"),0x6CA0<<16);
|
||||
&mov (&DWP($bias+24,"esp"),0x48C0<<16);
|
||||
&mov (&DWP($bias+28,"esp"),0x54E0<<16);
|
||||
&mov (&DWP($bias+32,"esp"),0xE100<<16);
|
||||
&mov (&DWP($bias+36,"esp"),0xFD20<<16);
|
||||
&mov (&DWP($bias+40,"esp"),0xD940<<16);
|
||||
&mov (&DWP($bias+44,"esp"),0xC560<<16);
|
||||
&mov (&DWP($bias+48,"esp"),0x9180<<16);
|
||||
&mov (&DWP($bias+52,"esp"),0x8DA0<<16);
|
||||
&mov (&DWP($bias+56,"esp"),0xA9C0<<16);
|
||||
&mov (&DWP($bias+60,"esp"),0xB5E0<<16);
|
||||
sub mmx_loop() {
|
||||
# MMX version performs 2.8 times better on P4 (see comment in non-MMX
|
||||
# routine for further details), 40% better on Opteron and Core2, 50%
|
||||
# better on PIII... In other words effort is considered to be well
|
||||
# spent...
|
||||
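# Added note: this is the usual 4-bit-table GHASH walk: the 16-byte input
# is consumed a nibble at a time from its last byte towards the first, Z is
# shifted right by 4 bits per step, the bits shifted out are folded back in
# through the rem_4bit table, and the pre-computed multiple of H selected
# by the nibble is XORed in from Htbl.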
my $inp = shift;
|
||||
my $rem_4bit = shift;
|
||||
my $cnt = $Zhh;
|
||||
my $nhi = $Zhl;
|
||||
my $nlo = $Zlh;
|
||||
my $rem = $Zll;
|
||||
|
||||
my ($Zlo,$Zhi) = ("mm0","mm1");
|
||||
my $tmp = "mm2";
|
||||
|
||||
&xor ($nlo,$nlo); # avoid partial register stalls on PIII
|
||||
&mov ($nhi,$Zll);
|
||||
&mov (&LB($nlo),&LB($nhi));
|
||||
&mov ($cnt,14);
|
||||
&shl (&LB($nlo),4);
|
||||
&and ($nhi,0xf0);
|
||||
&movq ($Zlo,&QWP(8,$Htbl,$nlo));
|
||||
&movq ($Zhi,&QWP(0,$Htbl,$nlo));
|
||||
&movd ($rem,$Zlo);
|
||||
&jmp (&label("mmx_loop"));
|
||||
|
||||
&set_label("mmx_loop",16);
|
||||
&psrlq ($Zlo,4);
|
||||
&and ($rem,0xf);
|
||||
&movq ($tmp,$Zhi);
|
||||
&psrlq ($Zhi,4);
|
||||
&pxor ($Zlo,&QWP(8,$Htbl,$nhi));
|
||||
&mov (&LB($nlo),&BP(0,$inp,$cnt));
|
||||
&psllq ($tmp,60);
|
||||
&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
|
||||
&dec ($cnt);
|
||||
&movd ($rem,$Zlo);
|
||||
&pxor ($Zhi,&QWP(0,$Htbl,$nhi));
|
||||
&mov ($nhi,$nlo);
|
||||
&pxor ($Zlo,$tmp);
|
||||
&js (&label("mmx_break"));
|
||||
|
||||
&shl (&LB($nlo),4);
|
||||
&and ($rem,0xf);
|
||||
&psrlq ($Zlo,4);
|
||||
&and ($nhi,0xf0);
|
||||
&movq ($tmp,$Zhi);
|
||||
&psrlq ($Zhi,4);
|
||||
&pxor ($Zlo,&QWP(8,$Htbl,$nlo));
|
||||
&psllq ($tmp,60);
|
||||
&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
|
||||
&movd ($rem,$Zlo);
|
||||
&pxor ($Zhi,&QWP(0,$Htbl,$nlo));
|
||||
&pxor ($Zlo,$tmp);
|
||||
&jmp (&label("mmx_loop"));
|
||||
|
||||
&set_label("mmx_break",16);
|
||||
&shl (&LB($nlo),4);
|
||||
&and ($rem,0xf);
|
||||
&psrlq ($Zlo,4);
|
||||
&and ($nhi,0xf0);
|
||||
&movq ($tmp,$Zhi);
|
||||
&psrlq ($Zhi,4);
|
||||
&pxor ($Zlo,&QWP(8,$Htbl,$nlo));
|
||||
&psllq ($tmp,60);
|
||||
&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
|
||||
&movd ($rem,$Zlo);
|
||||
&pxor ($Zhi,&QWP(0,$Htbl,$nlo));
|
||||
&pxor ($Zlo,$tmp);
|
||||
|
||||
&psrlq ($Zlo,4);
|
||||
&and ($rem,0xf);
|
||||
&movq ($tmp,$Zhi);
|
||||
&psrlq ($Zhi,4);
|
||||
&pxor ($Zlo,&QWP(8,$Htbl,$nhi));
|
||||
&psllq ($tmp,60);
|
||||
&pxor ($Zhi,&QWP(0,$rem_4bit,$rem,8));
|
||||
&movd ($rem,$Zlo);
|
||||
&pxor ($Zhi,&QWP(0,$Htbl,$nhi));
|
||||
&pxor ($Zlo,$tmp);
|
||||
|
||||
&psrlq ($Zlo,32); # lower part of Zlo is already there
|
||||
&movd ($Zhl,$Zhi);
|
||||
&psrlq ($Zhi,32);
|
||||
&movd ($Zlh,$Zlo);
|
||||
&movd ($Zhh,$Zhi);
|
||||
|
||||
&bswap ($Zll);
|
||||
&bswap ($Zhl);
|
||||
&bswap ($Zlh);
|
||||
&bswap ($Zhh);
|
||||
}
|
||||
|
||||
if (!$x86only) {
|
||||
&function_begin("gcm_gmult_4bit_mmx");
|
||||
&mov ($inp,&wparam(0)); # load Xi
|
||||
&mov ($Htbl,&wparam(1)); # load Htable
|
||||
|
||||
&call (&label("pic_point"));
|
||||
&set_label("pic_point");
|
||||
&blindpop("eax");
|
||||
&lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
|
||||
|
||||
&movz ($Zll,&BP(15,$inp));
|
||||
|
||||
&mmx_loop($inp,"eax");
|
||||
|
||||
&emms ();
|
||||
&mov (&DWP(12,$inp),$Zll);
|
||||
&mov (&DWP(4,$inp),$Zhl);
|
||||
&mov (&DWP(8,$inp),$Zlh);
|
||||
&mov (&DWP(0,$inp),$Zhh);
|
||||
&function_end("gcm_gmult_4bit_mmx");
|
||||
|
||||
# Streamed version performs 20% better on P4, 7% on Opteron,
|
||||
# 10% on Core2 and PIII...
|
||||
&function_begin("gcm_ghash_4bit_mmx");
|
||||
&mov ($Zhh,&wparam(0)); # load Xi
|
||||
&mov ($Htbl,&wparam(1)); # load Htable
|
||||
&mov ($inp,&wparam(2)); # load in
|
||||
&mov ($Zlh,&wparam(3)); # load len
|
||||
|
||||
&call (&label("pic_point"));
|
||||
&set_label("pic_point");
|
||||
&blindpop("eax");
|
||||
&lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
|
||||
|
||||
&add ($Zlh,$inp);
|
||||
&mov (&wparam(3),$Zlh); # len to point at the end of input
|
||||
&stack_push(4+1); # +1 for stack alignment
|
||||
|
||||
&mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
|
||||
&mov ($Zhl,&DWP(4,$Zhh));
|
||||
&mov ($Zlh,&DWP(8,$Zhh));
|
||||
&mov ($Zhh,&DWP(0,$Zhh));
|
||||
&jmp (&label("mmx_outer_loop"));
|
||||
|
||||
&set_label("mmx_outer_loop",16);
|
||||
&xor ($Zll,&DWP(12,$inp));
|
||||
&xor ($Zhl,&DWP(4,$inp));
|
||||
&xor ($Zlh,&DWP(8,$inp));
|
||||
&xor ($Zhh,&DWP(0,$inp));
|
||||
&mov (&DWP(12,"esp"),$Zll);
|
||||
&mov (&DWP(4,"esp"),$Zhl);
|
||||
&mov (&DWP(8,"esp"),$Zlh);
|
||||
&mov (&DWP(0,"esp"),$Zhh);
|
||||
|
||||
&shr ($Zll,24);
|
||||
|
||||
&mmx_loop("esp","eax");
|
||||
|
||||
&lea ($inp,&DWP(16,$inp));
|
||||
&cmp ($inp,&wparam(3));
|
||||
&jb (&label("mmx_outer_loop"));
|
||||
|
||||
&mov ($inp,&wparam(0)); # load Xi
|
||||
&emms ();
|
||||
&mov (&DWP(12,$inp),$Zll);
|
||||
&mov (&DWP(4,$inp),$Zhl);
|
||||
&mov (&DWP(8,$inp),$Zlh);
|
||||
&mov (&DWP(0,$inp),$Zhh);
|
||||
|
||||
&stack_pop(4+1);
|
||||
&function_end("gcm_ghash_4bit_mmx");
|
||||
|
||||
if ($sse2) {{
|
||||
######################################################################
|
||||
# PCLMULQDQ version.
|
||||
|
||||
$Xip="eax";
|
||||
$Htbl="edx";
|
||||
$const="ecx";
|
||||
$inp="esi";
|
||||
$len="ebx";
|
||||
|
||||
($Xi,$Xhi)=("xmm0","xmm1"); $Hkey="xmm2";
|
||||
($T1,$T2,$T3)=("xmm3","xmm4","xmm5");
|
||||
($Xn,$Xhn)=("xmm6","xmm7");
|
||||
|
||||
&static_label("bswap");
|
||||
|
||||
sub clmul64x64_T2 { # minimal "register" pressure
|
||||
my ($Xhi,$Xi,$Hkey)=@_;
|
||||
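	# Added note: with $Xi = x1:x0 and $Hkey = h1:h0, Karatsuba gives
	#   Xi*Hkey = x1*h1<<128 ^ ((x1^x0)*(h1^h0) ^ x1*h1 ^ x0*h0)<<64 ^ x0*h0,
	# so three pclmulqdq (low, high and middle products) do the work of four.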
|
||||
&movdqa ($Xhi,$Xi); #
|
||||
&pshufd ($T1,$Xi,0b01001110);
|
||||
&pshufd ($T2,$Hkey,0b01001110);
|
||||
&pxor ($T1,$Xi); #
|
||||
&pxor ($T2,$Hkey);
|
||||
|
||||
&pclmulqdq ($Xi,$Hkey,0x00); #######
|
||||
&pclmulqdq ($Xhi,$Hkey,0x11); #######
|
||||
&pclmulqdq ($T1,$T2,0x00); #######
|
||||
&pxor ($T1,$Xi); #
|
||||
&pxor ($T1,$Xhi); #
|
||||
|
||||
&movdqa ($T2,$T1); #
|
||||
&psrldq ($T1,8);
|
||||
&pslldq ($T2,8); #
|
||||
&pxor ($Xhi,$T1);
|
||||
&pxor ($Xi,$T2); #
|
||||
}
|
||||
|
||||
sub clmul64x64_T3 {
|
||||
# Even though this subroutine offers visually better ILP, it
|
||||
# was empirically found to be a tad slower than above version.
|
||||
# At least in gcm_ghash_clmul context. But it's just as well,
|
||||
# because loop modulo-scheduling is possible only thanks to
|
||||
# minimized "register" pressure...
|
||||
my ($Xhi,$Xi,$Hkey)=@_;
|
||||
|
||||
&movdqa ($T1,$Xi); #
|
||||
&movdqa ($Xhi,$Xi);
|
||||
&pclmulqdq ($Xi,$Hkey,0x00); #######
|
||||
&pclmulqdq ($Xhi,$Hkey,0x11); #######
|
||||
&pshufd ($T2,$T1,0b01001110); #
|
||||
&pshufd ($T3,$Hkey,0b01001110);
|
||||
&pxor ($T2,$T1); #
|
||||
&pxor ($T3,$Hkey);
|
||||
&pclmulqdq ($T2,$T3,0x00); #######
|
||||
&pxor ($T2,$Xi); #
|
||||
&pxor ($T2,$Xhi); #
|
||||
|
||||
&movdqa ($T3,$T2); #
|
||||
&psrldq ($T2,8);
|
||||
&pslldq ($T3,8); #
|
||||
&pxor ($Xhi,$T2);
|
||||
&pxor ($Xi,$T3); #
|
||||
}
|
||||
|
||||
if (1) { # Algorithm 9 with <<1 twist.
|
||||
# Reduction is shorter and uses only two
|
||||
# temporary registers, which makes it better
|
||||
# candidate for interleaving with 64x64
|
||||
# multiplication. Pre-modulo-scheduled loop
|
||||
# was found to be ~20% faster than Algorithm 5
|
||||
# below. Algorithm 9 was then chosen and
|
||||
# optimized further...
|
||||
|
||||
sub reduction_alg9 { # 17/13 times faster than Intel version
|
||||
my ($Xhi,$Xi) = @_;
|
||||
|
||||
# 1st phase
|
||||
&movdqa ($T1,$Xi) #
|
||||
&psllq ($Xi,1);
|
||||
&pxor ($Xi,$T1); #
|
||||
&psllq ($Xi,5); #
|
||||
&pxor ($Xi,$T1); #
|
||||
&psllq ($Xi,57); #
|
||||
&movdqa ($T2,$Xi); #
|
||||
&pslldq ($Xi,8);
|
||||
&psrldq ($T2,8); #
|
||||
&pxor ($Xi,$T1);
|
||||
&pxor ($Xhi,$T2); #
|
||||
|
||||
# 2nd phase
|
||||
&movdqa ($T2,$Xi);
|
||||
&psrlq ($Xi,5);
|
||||
&pxor ($Xi,$T2); #
|
||||
&psrlq ($Xi,1); #
|
||||
&pxor ($Xi,$T2); #
|
||||
&pxor ($T2,$Xhi);
|
||||
&psrlq ($Xi,1); #
|
||||
&pxor ($Xi,$T2); #
|
||||
}
|
||||
|
||||
&function_begin_B("gcm_init_clmul");
|
||||
&mov ($Htbl,&wparam(0));
|
||||
&mov ($Xip,&wparam(1));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Hkey,&QWP(0,$Xip));
|
||||
&pshufd ($Hkey,$Hkey,0b01001110);# dword swap
|
||||
|
||||
# <<1 twist
|
||||
&pshufd ($T2,$Hkey,0b11111111); # broadcast uppermost dword
|
||||
&movdqa ($T1,$Hkey);
|
||||
&psllq ($Hkey,1);
|
||||
&pxor ($T3,$T3); #
|
||||
&psrlq ($T1,63);
|
||||
&pcmpgtd ($T3,$T2); # broadcast carry bit
|
||||
&pslldq ($T1,8);
|
||||
&por ($Hkey,$T1); # H<<=1
|
||||
|
||||
# magic reduction
|
||||
&pand ($T3,&QWP(16,$const)); # 0x1c2_polynomial
|
||||
&pxor ($Hkey,$T3); # if(carry) H^=0x1c2_polynomial
|
||||
|
||||
# calculate H^2
|
||||
&movdqa ($Xi,$Hkey);
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
|
||||
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
|
||||
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
|
||||
|
||||
&ret ();
|
||||
&function_end_B("gcm_init_clmul");
|
||||
|
||||
&function_begin_B("gcm_gmult_clmul");
|
||||
&mov ($Xip,&wparam(0));
|
||||
&mov ($Htbl,&wparam(1));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Xi,&QWP(0,$Xip));
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl));
|
||||
&pshufb ($Xi,$T3);
|
||||
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
|
||||
&pshufb ($Xi,$T3);
|
||||
&movdqu (&QWP(0,$Xip),$Xi);
|
||||
|
||||
&ret ();
|
||||
&function_end_B("gcm_gmult_clmul");
|
||||
|
||||
&function_begin("gcm_ghash_clmul");
|
||||
&mov ($Xip,&wparam(0));
|
||||
&mov ($Htbl,&wparam(1));
|
||||
&mov ($inp,&wparam(2));
|
||||
&mov ($len,&wparam(3));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Xi,&QWP(0,$Xip));
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl));
|
||||
&pshufb ($Xi,$T3);
|
||||
|
||||
&sub ($len,0x10);
|
||||
&jz (&label("odd_tail"));
|
||||
|
||||
#######
|
||||
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
|
||||
# [(H*Ii+1) + (H*Xi+1)] mod P =
|
||||
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
|
||||
#
|
||||
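	# (added note: since Xi+1 = [H*(Ii+Xi)] mod P, the H*Xi+1 term above
	#  equals H^2*(Ii+Xi) mod P, which is why a single reduction per two
	#  blocks suffices)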
&movdqu ($T1,&QWP(0,$inp)); # Ii
|
||||
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
|
||||
&pshufb ($T1,$T3);
|
||||
&pshufb ($Xn,$T3);
|
||||
&pxor ($Xi,$T1); # Ii+Xi
|
||||
|
||||
&clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
|
||||
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||
|
||||
&lea ($inp,&DWP(32,$inp)); # i+=2
|
||||
&sub ($len,0x20);
|
||||
&jbe (&label("even_tail"));
|
||||
|
||||
&set_label("mod_loop");
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
|
||||
&movdqu ($T1,&QWP(0,$inp)); # Ii
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
|
||||
|
||||
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
&pxor ($Xhi,$Xhn);
|
||||
|
||||
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
|
||||
&pshufb ($T1,$T3);
|
||||
&pshufb ($Xn,$T3);
|
||||
|
||||
&movdqa ($T3,$Xn); #&clmul64x64_TX ($Xhn,$Xn,$Hkey); H*Ii+1
|
||||
&movdqa ($Xhn,$Xn);
|
||||
&pxor ($Xhi,$T1); # "Ii+Xi", consume early
|
||||
|
||||
&movdqa ($T1,$Xi) #&reduction_alg9($Xhi,$Xi); 1st phase
|
||||
&psllq ($Xi,1);
|
||||
&pxor ($Xi,$T1); #
|
||||
&psllq ($Xi,5); #
|
||||
&pxor ($Xi,$T1); #
|
||||
&pclmulqdq ($Xn,$Hkey,0x00); #######
|
||||
&psllq ($Xi,57); #
|
||||
&movdqa ($T2,$Xi); #
|
||||
&pslldq ($Xi,8);
|
||||
&psrldq ($T2,8); #
|
||||
&pxor ($Xi,$T1);
|
||||
&pshufd ($T1,$T3,0b01001110);
|
||||
&pxor ($Xhi,$T2); #
|
||||
&pxor ($T1,$T3);
|
||||
&pshufd ($T3,$Hkey,0b01001110);
|
||||
&pxor ($T3,$Hkey); #
|
||||
|
||||
&pclmulqdq ($Xhn,$Hkey,0x11); #######
|
||||
&movdqa ($T2,$Xi); # 2nd phase
|
||||
&psrlq ($Xi,5);
|
||||
&pxor ($Xi,$T2); #
|
||||
&psrlq ($Xi,1); #
|
||||
&pxor ($Xi,$T2); #
|
||||
&pxor ($T2,$Xhi);
|
||||
&psrlq ($Xi,1); #
|
||||
&pxor ($Xi,$T2); #
|
||||
|
||||
&pclmulqdq ($T1,$T3,0x00); #######
|
||||
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||
&pxor ($T1,$Xn); #
|
||||
&pxor ($T1,$Xhn); #
|
||||
|
||||
&movdqa ($T3,$T1); #
|
||||
&psrldq ($T1,8);
|
||||
&pslldq ($T3,8); #
|
||||
&pxor ($Xhn,$T1);
|
||||
&pxor ($Xn,$T3); #
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
|
||||
&lea ($inp,&DWP(32,$inp));
|
||||
&sub ($len,0x20);
|
||||
&ja (&label("mod_loop"));
|
||||
|
||||
&set_label("even_tail");
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
|
||||
|
||||
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
&pxor ($Xhi,$Xhn);
|
||||
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
|
||||
&test ($len,$len);
|
||||
&jnz (&label("done"));
|
||||
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
|
||||
&set_label("odd_tail");
|
||||
&movdqu ($T1,&QWP(0,$inp)); # Ii
|
||||
&pshufb ($T1,$T3);
|
||||
&pxor ($Xi,$T1); # Ii+Xi
|
||||
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
|
||||
&set_label("done");
|
||||
&pshufb ($Xi,$T3);
|
||||
&movdqu (&QWP(0,$Xip),$Xi);
|
||||
&function_end("gcm_ghash_clmul");
|
||||
|
||||
} else { # Algorithm 5. Kept for reference purposes.
|
||||
|
||||
sub reduction_alg5 { # 19/16 times faster than Intel version
|
||||
my ($Xhi,$Xi)=@_;
|
||||
|
||||
# <<1
|
||||
&movdqa ($T1,$Xi); #
|
||||
&movdqa ($T2,$Xhi);
|
||||
&pslld ($Xi,1);
|
||||
&pslld ($Xhi,1); #
|
||||
&psrld ($T1,31);
|
||||
&psrld ($T2,31); #
|
||||
&movdqa ($T3,$T1);
|
||||
&pslldq ($T1,4);
|
||||
&psrldq ($T3,12); #
|
||||
&pslldq ($T2,4);
|
||||
&por ($Xhi,$T3); #
|
||||
&por ($Xi,$T1);
|
||||
&por ($Xhi,$T2); #
|
||||
|
||||
# 1st phase
|
||||
&movdqa ($T1,$Xi);
|
||||
&movdqa ($T2,$Xi);
|
||||
&movdqa ($T3,$Xi); #
|
||||
&pslld ($T1,31);
|
||||
&pslld ($T2,30);
|
||||
&pslld ($Xi,25); #
|
||||
&pxor ($T1,$T2);
|
||||
&pxor ($T1,$Xi); #
|
||||
&movdqa ($T2,$T1); #
|
||||
&pslldq ($T1,12);
|
||||
&psrldq ($T2,4); #
|
||||
&pxor ($T3,$T1);
|
||||
|
||||
# 2nd phase
|
||||
&pxor ($Xhi,$T3); #
|
||||
&movdqa ($Xi,$T3);
|
||||
&movdqa ($T1,$T3);
|
||||
&psrld ($Xi,1); #
|
||||
&psrld ($T1,2);
|
||||
&psrld ($T3,7); #
|
||||
&pxor ($Xi,$T1);
|
||||
&pxor ($Xhi,$T2);
|
||||
&pxor ($Xi,$T3); #
|
||||
&pxor ($Xi,$Xhi); #
|
||||
}
|
||||
|
||||
&function_begin_B("gcm_init_clmul");
|
||||
&mov ($Htbl,&wparam(0));
|
||||
&mov ($Xip,&wparam(1));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Hkey,&QWP(0,$Xip));
|
||||
&pshufd ($Hkey,$Hkey,0b01001110);# dword swap
|
||||
|
||||
# calculate H^2
|
||||
&movdqa ($Xi,$Hkey);
|
||||
&clmul64x64_T3 ($Xhi,$Xi,$Hkey);
|
||||
&reduction_alg5 ($Xhi,$Xi);
|
||||
|
||||
&movdqu (&QWP(0,$Htbl),$Hkey); # save H
|
||||
&movdqu (&QWP(16,$Htbl),$Xi); # save H^2
|
||||
|
||||
&ret ();
|
||||
&function_end_B("gcm_init_clmul");
|
||||
|
||||
&function_begin_B("gcm_gmult_clmul");
|
||||
&mov ($Xip,&wparam(0));
|
||||
&mov ($Htbl,&wparam(1));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Xi,&QWP(0,$Xip));
|
||||
&movdqa ($Xn,&QWP(0,$const));
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl));
|
||||
&pshufb ($Xi,$Xn);
|
||||
|
||||
&clmul64x64_T3 ($Xhi,$Xi,$Hkey);
|
||||
&reduction_alg5 ($Xhi,$Xi);
|
||||
|
||||
&pshufb ($Xi,$Xn);
|
||||
&movdqu (&QWP(0,$Xip),$Xi);
|
||||
|
||||
&ret ();
|
||||
&function_end_B("gcm_gmult_clmul");
|
||||
|
||||
&function_begin("gcm_ghash_clmul");
|
||||
&mov ($Xip,&wparam(0));
|
||||
&mov ($Htbl,&wparam(1));
|
||||
&mov ($inp,&wparam(2));
|
||||
&mov ($len,&wparam(3));
|
||||
|
||||
&call (&label("pic"));
|
||||
&set_label("pic");
|
||||
&blindpop ($const);
|
||||
&lea ($const,&DWP(&label("bswap")."-".&label("pic"),$const));
|
||||
|
||||
&movdqu ($Xi,&QWP(0,$Xip));
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl));
|
||||
&pshufb ($Xi,$T3);
|
||||
|
||||
&sub ($len,0x10);
|
||||
&jz (&label("odd_tail"));
|
||||
|
||||
#######
|
||||
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
|
||||
# [(H*Ii+1) + (H*Xi+1)] mod P =
|
||||
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
|
||||
#
|
||||
&movdqu ($T1,&QWP(0,$inp)); # Ii
|
||||
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
|
||||
&pshufb ($T1,$T3);
|
||||
&pshufb ($Xn,$T3);
|
||||
&pxor ($Xi,$T1); # Ii+Xi
|
||||
|
||||
&clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
|
||||
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||
|
||||
&sub ($len,0x20);
|
||||
&lea ($inp,&DWP(32,$inp)); # i+=2
|
||||
&jbe (&label("even_tail"));
|
||||
|
||||
&set_label("mod_loop");
|
||||
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
|
||||
|
||||
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
&pxor ($Xhi,$Xhn);
|
||||
|
||||
&reduction_alg5 ($Xhi,$Xi);
|
||||
|
||||
#######
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&movdqu ($T1,&QWP(0,$inp)); # Ii
|
||||
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
|
||||
&pshufb ($T1,$T3);
|
||||
&pshufb ($Xn,$T3);
|
||||
&pxor ($Xi,$T1); # Ii+Xi
|
||||
|
||||
&clmul64x64_T3 ($Xhn,$Xn,$Hkey); # H*Ii+1
|
||||
&movdqu ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||
|
||||
&sub ($len,0x20);
|
||||
&lea ($inp,&DWP(32,$inp));
|
||||
&ja (&label("mod_loop"));
|
||||
|
||||
&set_label("even_tail");
|
||||
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H^2*(Ii+Xi)
|
||||
|
||||
&pxor ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
&pxor ($Xhi,$Xhn);
|
||||
|
||||
&reduction_alg5 ($Xhi,$Xi);
|
||||
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&test ($len,$len);
|
||||
&jnz (&label("done"));
|
||||
|
||||
&movdqu ($Hkey,&QWP(0,$Htbl)); # load H
|
||||
&set_label("odd_tail");
|
||||
&movdqu ($T1,&QWP(0,$inp)); # Ii
|
||||
&pshufb ($T1,$T3);
|
||||
&pxor ($Xi,$T1); # Ii+Xi
|
||||
|
||||
&clmul64x64_T3 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
|
||||
&reduction_alg5 ($Xhi,$Xi);
|
||||
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&set_label("done");
|
||||
&pshufb ($Xi,$T3);
|
||||
&movdqu (&QWP(0,$Xip),$Xi);
|
||||
&function_end("gcm_ghash_clmul");
|
||||
|
||||
}
|
||||
|
||||
&set_label("bswap",64);
|
||||
&data_byte(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
|
||||
&data_byte(1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2); # 0x1c2_polynomial
|
||||
}} # $sse2
|
||||
|
||||
&set_label("rem_4bit",64);
|
||||
&data_word(0,0x0000<<16,0,0x1C20<<16,0,0x3840<<16,0,0x2460<<16);
|
||||
&data_word(0,0x7080<<16,0,0x6CA0<<16,0,0x48C0<<16,0,0x54E0<<16);
|
||||
&data_word(0,0xE100<<16,0,0xFD20<<16,0,0xD940<<16,0,0xC560<<16);
|
||||
&data_word(0,0x9180<<16,0,0x8DA0<<16,0,0xA9C0<<16,0,0xB5E0<<16);
|
||||
}
|
||||
}}} # !$x86only
|
||||
|
||||
&asciz("GHASH for x86, CRYPTOGAMS by <appro\@openssl.org>");
|
||||
&asm_finish();
|
||||
|
@ -20,6 +20,12 @@
|
||||
# Opteron 18.5 10.2 +80%
|
||||
# Core2 17.5 11.0 +59%
|
||||
|
||||
# May 2010
|
||||
#
|
||||
# Add PCLMULQDQ version performing at 2.07 cycles per processed byte.
|
||||
# See ghash-x86.pl for background information and details about coding
|
||||
# techniques.
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
||||
@ -51,7 +57,7 @@ $rem="%rdx";
|
||||
sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
|
||||
$r =~ s/%[er]([sd]i)/%\1l/;
|
||||
$r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
|
||||
|
||||
|
||||
{ my $N;
|
||||
sub loop() {
|
||||
my $inp = shift;
|
||||
@ -156,8 +162,7 @@ $code.=<<___;
|
||||
ret
|
||||
.size gcm_gmult_4bit,.-gcm_gmult_4bit
|
||||
___
|
||||
|
||||
|
||||
|
||||
# per-function register layout
|
||||
$inp="%rdx";
|
||||
$len="%rcx";
|
||||
@ -203,9 +208,295 @@ $code.=<<___;
|
||||
.Lghash_epilogue:
|
||||
ret
|
||||
.size gcm_ghash_4bit,.-gcm_ghash_4bit
|
||||
___
|
||||
|
||||
######################################################################
|
||||
# PCLMULQDQ version.
|
||||
|
||||
@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
|
||||
("%rdi","%rsi","%rdx","%rcx"); # Unix order
|
||||
|
||||
($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
|
||||
($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
|
||||
|
||||
sub clmul64x64_T2 { # minimal register pressure
|
||||
my ($Xhi,$Xi,$Hkey,$modulo)=@_;
|
||||
|
||||
$code.=<<___ if (!defined($modulo));
|
||||
movdqa $Xi,$Xhi #
|
||||
pshufd \$0b01001110,$Xi,$T1
|
||||
pshufd \$0b01001110,$Hkey,$T2
|
||||
pxor $Xi,$T1 #
|
||||
pxor $Hkey,$T2
|
||||
___
|
||||
$code.=<<___;
|
||||
pclmulqdq \$0x00,$Hkey,$Xi #######
|
||||
pclmulqdq \$0x11,$Hkey,$Xhi #######
|
||||
pclmulqdq \$0x00,$T2,$T1 #######
|
||||
pxor $Xi,$T1 #
|
||||
pxor $Xhi,$T1 #
|
||||
|
||||
movdqa $T1,$T2 #
|
||||
psrldq \$8,$T1
|
||||
pslldq \$8,$T2 #
|
||||
pxor $T1,$Xhi
|
||||
pxor $T2,$Xi #
|
||||
___
|
||||
}
|
||||
|
||||
sub reduction_alg9 { # 17/13 times faster than Intel version
|
||||
my ($Xhi,$Xi) = @_;
|
||||
|
||||
$code.=<<___;
|
||||
# 1st phase
|
||||
movdqa $Xi,$T1 #
|
||||
psllq \$1,$Xi
|
||||
pxor $T1,$Xi #
|
||||
psllq \$5,$Xi #
|
||||
pxor $T1,$Xi #
|
||||
psllq \$57,$Xi #
|
||||
movdqa $Xi,$T2 #
|
||||
pslldq \$8,$Xi
|
||||
psrldq \$8,$T2 #
|
||||
pxor $T1,$Xi
|
||||
pxor $T2,$Xhi #
|
||||
|
||||
# 2nd phase
|
||||
movdqa $Xi,$T2
|
||||
psrlq \$5,$Xi
|
||||
pxor $T2,$Xi #
|
||||
psrlq \$1,$Xi #
|
||||
pxor $T2,$Xi #
|
||||
pxor $Xhi,$T2
|
||||
psrlq \$1,$Xi #
|
||||
pxor $T2,$Xi #
|
||||
___
|
||||
}
|
||||
|
||||
{ my ($Htbl,$Xip)=@_4args;
|
||||
|
||||
$code.=<<___;
|
||||
.globl gcm_init_clmul
|
||||
.type gcm_init_clmul,\@abi-omnipotent
|
||||
.align 16
|
||||
gcm_init_clmul:
|
||||
movdqu ($Xip),$Hkey
|
||||
pshufd \$0b01001110,$Hkey,$Hkey # dword swap
|
||||
|
||||
# <<1 twist
|
||||
pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
|
||||
movdqa $Hkey,$T1
|
||||
psllq \$1,$Hkey
|
||||
pxor $T3,$T3 #
|
||||
psrlq \$63,$T1
|
||||
pcmpgtd $T2,$T3 # broadcast carry bit
|
||||
pslldq \$8,$T1
|
||||
por $T1,$Hkey # H<<=1
|
||||
|
||||
# magic reduction
|
||||
pand .L0x1c2_polynomial(%rip),$T3
|
||||
pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
|
||||
|
||||
# calculate H^2
|
||||
movdqa $Hkey,$Xi
|
||||
___
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
$code.=<<___;
|
||||
movdqu $Hkey,($Htbl) # save H
|
||||
movdqu $Xi,16($Htbl) # save H^2
|
||||
ret
|
||||
.size gcm_init_clmul,.-gcm_init_clmul
|
||||
___
|
||||
}
|
||||
|
||||
{ my ($Xip,$Htbl)=@_4args;
|
||||
|
||||
$code.=<<___;
|
||||
.globl gcm_gmult_clmul
|
||||
.type gcm_gmult_clmul,\@abi-omnipotent
|
||||
.align 16
|
||||
gcm_gmult_clmul:
|
||||
movdqu ($Xip),$Xi
|
||||
movdqa .Lbswap_mask(%rip),$T3
|
||||
movdqu ($Htbl),$Hkey
|
||||
pshufb $T3,$Xi
|
||||
___
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey);
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
$code.=<<___;
|
||||
pshufb $T3,$Xi
|
||||
movdqu $Xi,($Xip)
|
||||
ret
|
||||
.size gcm_gmult_clmul,.-gcm_gmult_clmul
|
||||
___
|
||||
}
|
||||
|
||||
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
|
||||
my $Xn="%xmm6";
|
||||
my $Xhn="%xmm7";
|
||||
my $Hkey2="%xmm8";
|
||||
my $T1n="%xmm9";
|
||||
my $T2n="%xmm10";
|
||||
|
||||
$code.=<<___;
|
||||
.globl gcm_ghash_clmul
|
||||
.type gcm_ghash_clmul,\@abi-omnipotent
|
||||
.align 16
|
||||
gcm_ghash_clmul:
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
.LSEH_begin_gcm_ghash_clmul:
|
||||
# I can't trust assembler to use specific encoding:-(
|
||||
.byte 0x48,0x83,0xec,0x58 #sub \$0x58,%rsp
|
||||
.byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
|
||||
.byte 0x0f,0x29,0x7c,0x24,0x10 #movdqa %xmm7,0x10(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x44,0x24,0x20 #movaps %xmm8,0x20(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x4c,0x24,0x30 #movaps %xmm9,0x30(%rsp)
|
||||
.byte 0x44,0x0f,0x29,0x54,0x24,0x40 #movaps %xmm10,0x40(%rsp)
|
||||
___
|
||||
$code.=<<___;
|
||||
movdqa .Lbswap_mask(%rip),$T3
|
||||
|
||||
movdqu ($Xip),$Xi
|
||||
movdqu ($Htbl),$Hkey
|
||||
pshufb $T3,$Xi
|
||||
|
||||
sub \$0x10,$len
|
||||
jz .Lodd_tail
|
||||
|
||||
movdqu 16($Htbl),$Hkey2
|
||||
#######
|
||||
# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
|
||||
# [(H*Ii+1) + (H*Xi+1)] mod P =
|
||||
# [(H*Ii+1) + H^2*(Ii+Xi)] mod P
|
||||
#
|
||||
movdqu ($inp),$T1 # Ii
|
||||
movdqu 16($inp),$Xn # Ii+1
|
||||
pshufb $T3,$T1
|
||||
pshufb $T3,$Xn
|
||||
pxor $T1,$Xi # Ii+Xi
|
||||
___
|
||||
&clmul64x64_T2 ($Xhn,$Xn,$Hkey); # H*Ii+1
|
||||
$code.=<<___;
|
||||
movdqa $Xi,$Xhi #
|
||||
pshufd \$0b01001110,$Xi,$T1
|
||||
pshufd \$0b01001110,$Hkey2,$T2
|
||||
pxor $Xi,$T1 #
|
||||
pxor $Hkey2,$T2
|
||||
|
||||
lea 32($inp),$inp # i+=2
|
||||
sub \$0x20,$len
|
||||
jbe .Leven_tail
|
||||
|
||||
.Lmod_loop:
|
||||
___
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
|
||||
$code.=<<___;
|
||||
movdqu ($inp),$T1 # Ii
|
||||
pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
pxor $Xhn,$Xhi
|
||||
|
||||
movdqu 16($inp),$Xn # Ii+1
|
||||
pshufb $T3,$T1
|
||||
pshufb $T3,$Xn
|
||||
|
||||
movdqa $Xn,$Xhn #
|
||||
pshufd \$0b01001110,$Xn,$T1n
|
||||
pshufd \$0b01001110,$Hkey,$T2n
|
||||
pxor $Xn,$T1n #
|
||||
pxor $Hkey,$T2n
|
||||
pxor $T1,$Xhi # "Ii+Xi", consume early
|
||||
|
||||
movdqa $Xi,$T1 # 1st phase
|
||||
psllq \$1,$Xi
|
||||
pxor $T1,$Xi #
|
||||
psllq \$5,$Xi #
|
||||
pxor $T1,$Xi #
|
||||
pclmulqdq \$0x00,$Hkey,$Xn #######
|
||||
psllq \$57,$Xi #
|
||||
movdqa $Xi,$T2 #
|
||||
pslldq \$8,$Xi
|
||||
psrldq \$8,$T2 #
|
||||
pxor $T1,$Xi
|
||||
pxor $T2,$Xhi #
|
||||
|
||||
pclmulqdq \$0x11,$Hkey,$Xhn #######
|
||||
movdqa $Xi,$T2 # 2nd phase
|
||||
psrlq \$5,$Xi
|
||||
pxor $T2,$Xi #
|
||||
psrlq \$1,$Xi #
|
||||
pxor $T2,$Xi #
|
||||
pxor $Xhi,$T2
|
||||
psrlq \$1,$Xi #
|
||||
pxor $T2,$Xi #
|
||||
|
||||
pclmulqdq \$0x00,$T2n,$T1n #######
|
||||
movdqa $Xi,$Xhi #
|
||||
pshufd \$0b01001110,$Xi,$T1
|
||||
pshufd \$0b01001110,$Hkey2,$T2
|
||||
pxor $Xi,$T1 #
|
||||
pxor $Hkey2,$T2
|
||||
|
||||
pxor $Xn,$T1n #
|
||||
pxor $Xhn,$T1n #
|
||||
movdqa $T1n,$T2n #
|
||||
psrldq \$8,$T1n
|
||||
pslldq \$8,$T2n #
|
||||
pxor $T1n,$Xhn
|
||||
pxor $T2n,$Xn #
|
||||
|
||||
lea 32($inp),$inp
|
||||
sub \$0x20,$len
|
||||
ja .Lmod_loop
|
||||
|
||||
.Leven_tail:
|
||||
___
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey2,1); # H^2*(Ii+Xi)
|
||||
$code.=<<___;
|
||||
pxor $Xn,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
pxor $Xhn,$Xhi
|
||||
___
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
$code.=<<___;
|
||||
test $len,$len
|
||||
jnz .Ldone
|
||||
|
||||
.Lodd_tail:
|
||||
movdqu ($inp),$T1 # Ii
|
||||
pshufb $T3,$T1
|
||||
pxor $T1,$Xi # Ii+Xi
|
||||
___
|
||||
&clmul64x64_T2 ($Xhi,$Xi,$Hkey); # H*(Ii+Xi)
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
$code.=<<___;
|
||||
.Ldone:
|
||||
pshufb $T3,$Xi
|
||||
movdqu $Xi,($Xip)
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps (%rsp),%xmm6
|
||||
movaps 0x10(%rsp),%xmm7
|
||||
movaps 0x20(%rsp),%xmm8
|
||||
movaps 0x30(%rsp),%xmm9
|
||||
movaps 0x40(%rsp),%xmm10
|
||||
add \$0x58,%rsp
|
||||
___
|
||||
$code.=<<___;
|
||||
ret
|
||||
.LSEH_end_gcm_ghash_clmul:
|
||||
.size gcm_ghash_clmul,.-gcm_ghash_clmul
|
||||
___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
.align 64
|
||||
.type rem_4bit,\@object
|
||||
.Lbswap_mask:
|
||||
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
|
||||
.L0x1c2_polynomial:
|
||||
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
|
||||
.align 64
|
||||
.type .Lrem_4bit,\@object
|
||||
.Lrem_4bit:
|
||||
.long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
|
||||
.long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
|
||||
@ -214,7 +505,7 @@ $code.=<<___;
|
||||
.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.align 64
|
||||
___
|
||||
|
||||
|
||||
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
|
||||
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
|
||||
if ($win64) {
|
||||
@ -316,6 +607,10 @@ se_handler:
|
||||
.rva .LSEH_end_gcm_ghash_4bit
|
||||
.rva .LSEH_info_gcm_ghash_4bit
|
||||
|
||||
.rva .LSEH_begin_gcm_ghash_clmul
|
||||
.rva .LSEH_end_gcm_ghash_clmul
|
||||
.rva .LSEH_info_gcm_ghash_clmul
|
||||
|
||||
.section .xdata
|
||||
.align 8
|
||||
.LSEH_info_gcm_gmult_4bit:
|
||||
@ -326,9 +621,46 @@ se_handler:
|
||||
.byte 9,0,0,0
|
||||
.rva se_handler
|
||||
.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
|
||||
.LSEH_info_gcm_ghash_clmul:
|
||||
.byte 0x01,0x1f,0x0b,0x00
|
||||
.byte 0x1f,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
|
||||
.byte 0x19,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
|
||||
.byte 0x13,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
|
||||
.byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
|
||||
.byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
|
||||
.byte 0x04,0xa2,0x00,0x00 #sub rsp,0x58
|
||||
___
|
||||
}
|
||||
|
||||
sub rex {
|
||||
local *opcode=shift;
|
||||
my ($dst,$src)=@_;
|
||||
|
||||
if ($dst>=8 || $src>=8) {
|
||||
$rex=0x40;
|
||||
$rex|=0x04 if($dst>=8);
|
||||
$rex|=0x01 if($src>=8);
|
||||
push @opcode,$rex;
|
||||
}
|
||||
}
|
||||
|
||||
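# Added note: the sub below hand-assembles pclmulqdq as raw bytes
# (66 [REX] 0F 3A 44 /r ib), presumably so the module still builds with
# assemblers that do not yet know the mnemonic.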
sub pclmulqdq {
|
||||
my $arg=shift;
|
||||
my @opcode=(0x66);
|
||||
|
||||
if ($arg=~/\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
|
||||
rex(\@opcode,$3,$2);
|
||||
push @opcode,0x0f,0x3a,0x44;
|
||||
push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
|
||||
my $c=$1;
|
||||
push @opcode,$c=~/^0/?oct($c):$c;
|
||||
return ".byte\t".join(',',@opcode);
|
||||
}
|
||||
return "pclmulqdq\t".$arg;
|
||||
}
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||
$code =~ s/\bpclmulqdq\s+(\$.*%xmm[0-9]+).*$/pclmulqdq($1)/gem;
|
||||
|
||||
print $code;
|
||||
|
||||
|
@ -67,7 +67,20 @@ typedef struct { u64 hi,lo; } u128;
|
||||
#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
|
||||
#endif
|
||||
|
||||
#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
|
||||
#define REDUCE1BIT(V) do { \
|
||||
if (sizeof(size_t)==8) { \
|
||||
u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
|
||||
V.lo = (V.hi<<63)|(V.lo>>1); \
|
||||
V.hi = (V.hi>>1 )^T; \
|
||||
} \
|
||||
else { \
|
||||
u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
|
||||
V.lo = (V.hi<<63)|(V.lo>>1); \
|
||||
V.hi = (V.hi>>1 )^((u64)T<<32); \
|
||||
} \
|
||||
} while(0)
|
||||
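/*
 * Added note: REDUCE1BIT(V) performs one bit's worth of GCM reduction,
 * i.e. it shifts the 128-bit value V right by one and, if the shifted-out
 * bit was set, XORs in the 0xE1... reduction constant (multiplication by
 * x in GF(2^128) under GCM's bit ordering).
 */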
|
||||
#ifdef TABLE_BITS
|
||||
#undef TABLE_BITS
|
||||
#endif
|
||||
@ -75,15 +88,14 @@ typedef struct { u64 hi,lo; } u128;
|
||||
* Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
|
||||
* never be set to 8. 8 is effectively reserved for testing purposes.
|
||||
* Under ideal conditions "8-bit" version should be twice as fast as
|
||||
* "4-bit" one. But world is far from ideal. For gcc-generated x86 code,
|
||||
* "8-bit" was observed to run only ~50% faster. On x86_64 observed
|
||||
* improvement was ~75%, much closer to optimal, but the fact of
|
||||
* deviation means that references to pre-computed tables end up on
|
||||
* critical path and as tables are pretty big, 4KB per key+1KB shared,
|
||||
* execution time is sensitive to cache timing. It's not actually
|
||||
* proven, but 4-bit procedure is believed to provide adequate
|
||||
* all-round performance...
|
||||
*/
|
||||
* "4-bit" one. For gcc-generated x86[_64] code, "8-bit" was observed to
|
||||
* run ~75% faster, closer to 100% for commercial compilers... But the
|
||||
* catch is that "8-bit" procedure consumes 16 times more memory, 4KB
|
||||
* per individual key + 1KB shared, and as access to these tables ends up
|
||||
* on critical path, real-life execution time would be sensitive to
|
||||
* cache timing. It's not actually proven, but "4-bit" procedure is
|
||||
* believed to provide adequate all-round performance...
|
||||
*/
|
||||
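/*
 * (Added for scale: with TABLE_BITS==8 the per-key table has 2^8 u128
 *  entries, i.e. the 4KB mentioned above; with TABLE_BITS==4 it has 2^4
 *  entries, i.e. 256 bytes per key.)
 */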
#define TABLE_BITS 4
|
||||
|
||||
#if TABLE_BITS==8
|
||||
@ -99,16 +111,7 @@ static void gcm_init_8bit(u128 Htable[256], u64 H[2])
|
||||
V.lo = H[1];
|
||||
|
||||
for (Htable[128]=V, i=64; i>0; i>>=1) {
|
||||
if (sizeof(size_t)==8) {
|
||||
u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
|
||||
V.lo = (V.hi<<63)|(V.lo>>1);
|
||||
V.hi = (V.hi>>1 )^T;
|
||||
}
|
||||
else {
|
||||
u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
|
||||
V.lo = (V.hi<<63)|(V.lo>>1);
|
||||
V.hi = (V.hi>>1 )^((u64)T<<32);
|
||||
}
|
||||
REDUCE1BIT(V);
|
||||
Htable[i] = V;
|
||||
}
|
||||
|
||||
@ -238,18 +241,6 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
|
||||
#if defined(OPENSSL_SMALL_FOOTPRINT)
|
||||
int i;
|
||||
#endif
|
||||
#define REDUCE(V) do { \
|
||||
if (sizeof(size_t)==8) { \
|
||||
u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
|
||||
V.lo = (V.hi<<63)|(V.lo>>1); \
|
||||
V.hi = (V.hi>>1 )^T; \
|
||||
} \
|
||||
else { \
|
||||
u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
|
||||
V.lo = (V.hi<<63)|(V.lo>>1); \
|
||||
V.hi = (V.hi>>1 )^((u64)T<<32); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
Htable[0].hi = 0;
|
||||
Htable[0].lo = 0;
|
||||
@ -258,7 +249,7 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
|
||||
|
||||
#if defined(OPENSSL_SMALL_FOOTPRINT)
|
||||
for (Htable[8]=V, i=4; i>0; i>>=1) {
|
||||
REDUCE(V);
|
||||
REDUCE1BIT(V);
|
||||
Htable[i] = V;
|
||||
}
|
||||
|
||||
@ -272,11 +263,11 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
|
||||
}
|
||||
#else
|
||||
Htable[8] = V;
|
||||
REDUCE(V);
|
||||
REDUCE1BIT(V);
|
||||
Htable[4] = V;
|
||||
REDUCE(V);
|
||||
REDUCE1BIT(V);
|
||||
Htable[2] = V;
|
||||
REDUCE(V);
|
||||
REDUCE1BIT(V);
|
||||
Htable[1] = V;
|
||||
Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
|
||||
V=Htable[4];
|
||||
@ -314,7 +305,6 @@ static void gcm_init_4bit(u128 Htable[16], u64 H[2])
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#undef REDUCE
|
||||
}
|
||||
|
||||
#ifndef GHASH_ASM
|
||||
@ -471,7 +461,7 @@ void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
|
||||
|
||||
#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
|
||||
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
|
||||
#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
|
||||
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
|
||||
/* GHASH_CHUNK is a "stride parameter" intended to mitigate the cache
* trashing effect. In other words, the idea is to hash data while it's
* still in L1 cache after the encryption pass... */
|
||||
@ -514,17 +504,7 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
|
||||
Z.hi ^= V.hi&M;
|
||||
Z.lo ^= V.lo&M;
|
||||
|
||||
if (sizeof(size_t)==8) {
|
||||
u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
|
||||
V.lo = (V.hi<<63)|(V.lo>>1);
|
||||
V.hi = (V.hi>>1 )^T;
|
||||
}
|
||||
else {
|
||||
u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
|
||||
V.lo = (V.hi<<63)|(V.lo>>1);
|
||||
V.hi = (V.hi>>1 )^((u64)T<<32);
|
||||
}
|
||||
|
||||
REDUCE1BIT(V);
|
||||
}
|
||||
}
|
||||
|
||||
@ -559,12 +539,40 @@ struct gcm128_context {
|
||||
u128 Htable[256];
|
||||
#else
|
||||
u128 Htable[16];
|
||||
void (*gmult)(u64 Xi[2],const u128 Htable[16]);
|
||||
void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
|
||||
#endif
|
||||
unsigned int res, pad;
|
||||
block128_f block;
|
||||
void *key;
|
||||
};
|
||||
|
||||
#if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
|
||||
(defined(__i386) || defined(__i386__) || \
|
||||
defined(__x86_64) || defined(__x86_64__) || \
|
||||
defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
|
||||
# define GHASH_ASM_IAX
|
||||
extern unsigned int OPENSSL_ia32cap_P[2];
|
||||
|
||||
void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
|
||||
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
|
||||
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
|
||||
|
||||
# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
|
||||
# define GHASH_ASM_X86
|
||||
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
|
||||
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
|
||||
|
||||
void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
|
||||
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
|
||||
# endif
|
||||
|
||||
# undef GCM_MUL
|
||||
# define GCM_MUL(ctx,Xi) (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
|
||||
# undef GHASH
|
||||
# define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
|
||||
#endif
|
||||
|
||||
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
|
||||
{
|
||||
const union { long one; char little; } is_endian = {1};
|
||||
@ -593,7 +601,29 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
|
||||
#if TABLE_BITS==8
|
||||
gcm_init_8bit(ctx->Htable,ctx->H.u);
|
||||
#elif TABLE_BITS==4
|
||||
# if defined(GHASH_ASM_IAX)
|
||||
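/* word 1 of OPENSSL_ia32cap_P mirrors CPUID(1).ECX; bit 1 is the PCLMULQDQ flag */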
if (OPENSSL_ia32cap_P[1]&(1<<1)) {
|
||||
gcm_init_clmul(ctx->Htable,ctx->H.u);
|
||||
ctx->gmult = gcm_gmult_clmul;
|
||||
ctx->ghash = gcm_ghash_clmul;
|
||||
return;
|
||||
}
|
||||
gcm_init_4bit(ctx->Htable,ctx->H.u);
|
||||
# if defined(GHASH_ASM_X86)
|
||||
if (OPENSSL_ia32cap_P[0]&(1<<23)) {
|
||||
ctx->gmult = gcm_gmult_4bit_mmx;
|
||||
ctx->ghash = gcm_ghash_4bit_mmx;
|
||||
} else {
|
||||
ctx->gmult = gcm_gmult_4bit_x86;
|
||||
ctx->ghash = gcm_ghash_4bit_x86;
|
||||
}
|
||||
# else
|
||||
ctx->gmult = gcm_gmult_4bit;
|
||||
ctx->ghash = gcm_ghash_4bit;
|
||||
# endif
|
||||
# else
|
||||
gcm_init_4bit(ctx->Htable,ctx->H.u);
|
||||
# endif
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -671,7 +701,7 @@ void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
|
||||
|
||||
#ifdef GHASH
|
||||
if ((i = (len&(size_t)-16))) {
|
||||
GHASH(aad,i,ctx);
|
||||
GHASH(ctx,aad,i);
|
||||
aad += i;
|
||||
len -= i;
|
||||
}
|
||||
@ -740,7 +770,7 @@ void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
|
||||
in += 16;
|
||||
j -= 16;
|
||||
}
|
||||
GHASH(out-GHASH_CHUNK,GHASH_CHUNK,ctx);
|
||||
GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
|
||||
len -= GHASH_CHUNK;
|
||||
}
|
||||
if ((i = (len&(size_t)-16))) {
|
||||
@ -760,7 +790,7 @@ void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
|
||||
in += 16;
|
||||
len -= 16;
|
||||
}
|
||||
GHASH(out-j,j,ctx);
|
||||
GHASH(ctx,out-j,j);
|
||||
}
|
||||
#else
|
||||
while (len>=16) {
|
||||
@ -854,7 +884,7 @@ void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
|
||||
while (len>=GHASH_CHUNK) {
|
||||
size_t j=GHASH_CHUNK;
|
||||
|
||||
GHASH(in,GHASH_CHUNK,ctx);
|
||||
GHASH(ctx,in,GHASH_CHUNK);
|
||||
while (j) {
|
||||
(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
|
||||
++ctr;
|
||||
@ -872,7 +902,7 @@ void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
|
||||
len -= GHASH_CHUNK;
|
||||
}
|
||||
if ((i = (len&(size_t)-16))) {
|
||||
GHASH(in,i,ctx);
|
||||
GHASH(ctx,in,i);
|
||||
while (len>=16) {
|
||||
(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
|
||||
++ctr;
|
||||
@ -1243,6 +1273,7 @@ int main()
|
||||
{
|
||||
size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
|
||||
union { u64 u; u8 c[1024]; } buf;
|
||||
int i;
|
||||
|
||||
AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
|
||||
CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
|
||||
@ -1267,11 +1298,11 @@ int main()
|
||||
ctr_t/(double)sizeof(buf),
|
||||
(gcm_t-ctr_t)/(double)sizeof(buf));
|
||||
#ifdef GHASH
|
||||
GHASH(buf.c,sizeof(buf),&ctx);
|
||||
GHASH(&ctx,buf.c,sizeof(buf));
|
||||
start = OPENSSL_rdtsc();
|
||||
GHASH(buf.c,sizeof(buf),&ctx);
|
||||
for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
|
||||
gcm_t = OPENSSL_rdtsc() - start;
|
||||
printf("%.2f\n",gcm_t/(double)sizeof(buf));
|
||||
printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||