This leaves behind files with names ending with '.iso-8859-1'. These should be safe to remove. If something went wrong when re-encoding, there will be some files with names ending with '.utf8' left behind. Reviewed-by: Rich Salz <rsalz@openssl.org>
496 lines
19 KiB
496 lines
19 KiB
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
# whirlpool_block_mmx implementation.
# $SCALE is installed by assigning a reference to a literal to the *SCALE
# glob, which makes $SCALE an alias for a read-only value; the LL(), scale()
# and row() helpers in this file all branch on it.
*SCALE=\(2); # 2 or 8, that is the question:-) Value of 8 results
# in 16KB large table, which is tough on L1 cache, but eliminates
# unaligned references to it. Value of 2 results in 4KB table, but
# 7/8 of references to it are unaligned. AMD cores seem to be
# allergic to the latter, while Intel ones - to former [see the
# table]. I stick to value of 2 for two reasons: 1. smaller table
# minimizes cache thrashing and thus mitigates the hazard of side-
# channel leakage similar to AES cache-timing one; 2. performance
# gap among different µ-archs is smaller.
# Performance table lists rounded amounts of CPU cycles spent by
# whirlpool_block_mmx routine on single 64 byte input block, i.e.
# smaller is better and asymptotic throughput can be estimated by
# multiplying 64 by CPU clock frequency and dividing by relevant
# value from the given table:
# $SCALE=2/8 icc8 gcc3
# Intel P4 3200/4600 4600(*) 6400
# Intel PIII 2900/3000 4900 5400
# AMD K[78] 2500/1800 9900 8200(**)
# (*) I've sketched even non-MMX assembler, but for the record
# I've failed to beat the Intel compiler on P4, without using
# MMX that is...
# (**) ... on AMD on the other hand non-MMX assembler was observed
# to perform significantly better, but I figured this MMX
# implementation is even faster anyway, so why bother? As for
# pre-MMX AMD core[s], the improvement coefficient is more
# than likely to vary anyway and I don't know how. But the
# least I know is that gcc-generated code compiled with
# particular options [see C code for details] and optimized
# for Pentium was observed to perform
# *better* on Pentium 100 than unrolled non-MMX assembler
# loop... So we just say that I don't know if maintaining
# non-MMX implementation would actually pay off, but till
# opposite is proved "unlikely" is assumed.
# Capture the directory part of this script's own path ($0), including the
# trailing '/' or '\', into $dir.
# NOTE(review): $dir is not referenced anywhere else in the visible excerpt.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Load x86asm.pl. The &-called routines used in this file that are not defined
# here (&data_byte, &lea, &DWP, &QWP, &mov, ...) presumably come from it --
# TODO confirm.
require "x86asm.pl";
# L(@bytes): hand the argument list, unchanged, to &data_byte and pass back
# whatever it returns.
sub L()
{
    return &data_byte(@_);
}
# LL(@bytes): pass the given values to &data_byte in a $SCALE-dependent way.
#   $SCALE==2 - call &data_byte with the full argument list twice in a row;
#   $SCALE==8 - open a loop with $i running 0..7;
#   otherwise - die.
# NOTE(review): the body of the $SCALE==8 loop and the closing braces of that
# loop, of the elsif branch and of the sub itself are not present in this
# excerpt.
sub LL()
{ if ($SCALE==2) { &data_byte(@_); &data_byte(@_); }
elsif ($SCALE==8) { for ($i=0;$i<8;$i++) {
else { die "unvalid SCALE value"; }
# scale($dst, $src): emit an lea that loads $dst with an address expression
# built from $src alone, selected by $SCALE:
#   $SCALE==2 - &DWP(0,$src,$src): $src passed in both register positions;
#   $SCALE==8 - &DWP(0,"",$src,8): empty first register, $src with factor 8.
# (Argument order of &DWP is assumed to be displacement, base, index, scale;
# the helper is defined outside this file -- confirm against x86asm.pl.)
# Dies with "unvalid SCALE value" for any other $SCALE.
#
# Elements are accessed as $_[0]/$_[1] rather than the one-element slices
# @_[0]/@_[1]; in these argument lists both forms pass the same values.
sub scale()
{ if ($SCALE==2) { &lea($_[0],&DWP(0,$_[1],$_[1])); }
  elsif ($SCALE==8) { &lea($_[0],&DWP(0,"",$_[1],8)); }
  else { die "unvalid SCALE value"; }
}
# row($n): return the displacement that callers place in front of $tbl when
# addressing the lookup table for position $n:
#   $SCALE==2 - (8-$n)&7
#   $SCALE==8 - 8*$n
# Dies with "unvalid SCALE value" for any other $SCALE.
sub row()
{ my $n = shift;
  if ($SCALE==2) { return ((8-$n)&7); }
  elsif ($SCALE==8) { return (8*$n); }
  else { die "unvalid SCALE value"; }
}
# --- Prologue: save registers, fetch arguments, build the stack frame ---
# Frame layout as used by the code in this file (ebx = esp+128):
#   esp+0   .. esp+63    64-byte area the comments call K
#   esp+64  .. esp+127   64-byte area the comments call S
#   ebx+0   hash value pointer
#   ebx+4   input pointer
#   ebx+8   number of chunks
#   ebx+12  round counter
#   ebx+16  stack pointer as it was before the frame was allocated
&push ("ebp");
&push ("ebx");
&push ("esi");
&push ("edi");
&mov ("esi",&wparam(0)); # hash value
&mov ("edi",&wparam(1)); # input data stream
&mov ("ebp",&wparam(2)); # number of chunks in input
&mov ("eax","esp"); # copy stack pointer
# 128 bytes for the two 64-byte areas plus 20 bytes for the five dwords
# stored at ebx+0 .. ebx+16.
&sub ("esp",128+20); # allocate frame
&and ("esp",-64); # align for cache-line
# ebx addresses the parameter block that sits right after the two areas.
&lea ("ebx",&DWP(128,"esp"));
&mov (&DWP(0,"ebx"),"esi"); # save parameter block
&mov (&DWP(4,"ebx"),"edi");
&mov (&DWP(8,"ebx"),"ebp");
&mov (&DWP(16,"ebx"),"eax"); # saved stack pointer
# NOTE(review): the definition of the pic_point label, whatever loads the
# return address into $tbl, and the definition of $tbl itself are not visible
# in this excerpt.
&call (&label("pic_point"));
# Adds the assembly-time difference (table - pic_point) to $tbl; presumably
# this leaves the run-time address of label "table" in $tbl.
&lea ($tbl,&DWP(&label("table")."-".&label("pic_point"),$tbl));
&xor ("ecx","ecx");
&xor ("edx","edx");
# Load the 64 bytes at esi (hash value) into the eight @mm registers, store a
# copy at esp+0, xor in the 64 bytes at edi (input), and store that result at
# esp+64.
# NOTE(review): @mm is defined outside this excerpt (presumably the MMX
# register names).
for($i=0;$i<8;$i++) { &movq(@mm[$i],&QWP($i*8,"esi")); } # L=H
for($i=0;$i<8;$i++) { &movq(&QWP($i*8,"esp"),@mm[$i]); } # K=L
for($i=0;$i<8;$i++) { &pxor(@mm[$i],&QWP($i*8,"edi")); } # L^=inp
for($i=0;$i<8;$i++) { &movq(&QWP(64+$i*8,"esp"),@mm[$i]); } # S=L
# esi is free again here (its value was saved at ebx+0); reuse it as zero.
&xor ("esi","esi");
&mov (&DWP(12,"ebx"),"esi"); # zero round counter
# --- First lookup pass: driven by the 64 bytes stored at esp+0 (K) ---
# @mm[0] starts out as the 8-byte entry at table offset 2048*$SCALE + esi*8;
# esi is the round counter at this point (zero on first entry).
# NOTE(review): the "round" label targeted by the later jmp is not visible in
# this excerpt; presumably it sits here.
&movq (@mm[0],&QWP(2048*$SCALE,$tbl,"esi",8)); # rc[r]
# eax/ebx hold the two dwords of the current 8-byte lane; ecx/edx hold the
# two bytes that will index the table next.
&mov ("eax",&DWP(0,"esp"));
&mov ("ebx",&DWP(4,"esp"));
&movz ("ecx",&LB("eax"));
&movz ("edx",&HB("eax"));
# One iteration per 8-byte lane. Within an iteration, the pair (ecx,edx) is
# turned into table indices by scale() into (esi,edi) and immediately
# refilled with the next two bytes, so each movz prepares the lookup that
# follows the current one.
# NOTE(review): the closing brace of this loop (and anything else that may
# precede it) is not visible in this excerpt.
for($i=0;$i<8;$i++) {
# First iteration initialises @mm[1..7] with movq; later ones accumulate with
# pxor. @mm[0] always uses pxor because it was preloaded above.
my $func = ($i==0)? \&movq : \&pxor;
&shr ("eax",16);
&scale ("esi","ecx");
&movz ("ecx",&LB("eax"));
&scale ("edi","edx");
&movz ("edx",&HB("eax"));
&pxor (@mm[0],&QWP(&row(0),$tbl,"esi",8));
&$func (@mm[1],&QWP(&row(1),$tbl,"edi",8));
# eax is exhausted after the two movz above; fetch the low dword of the next
# lane. For $i==7 this reads esp+64, the start of the S area.
&mov ("eax",&DWP(($i+1)*8,"esp"));
&scale ("esi","ecx");
&movz ("ecx",&LB("ebx"));
&scale ("edi","edx");
&movz ("edx",&HB("ebx"));
&$func (@mm[2],&QWP(&row(2),$tbl,"esi",8));
&$func (@mm[3],&QWP(&row(3),$tbl,"edi",8));
&shr ("ebx",16);
&scale ("esi","ecx");
&movz ("ecx",&LB("ebx"));
&scale ("edi","edx");
&movz ("edx",&HB("ebx"));
&$func (@mm[4],&QWP(&row(4),$tbl,"esi",8));
&$func (@mm[5],&QWP(&row(5),$tbl,"edi",8));
# Likewise fetch the high dword of the next lane (esp+68 when $i==7).
&mov ("ebx",&DWP(($i+1)*8+4,"esp"));
&scale ("esi","ecx");
&movz ("ecx",&LB("eax"));
&scale ("edi","edx");
&movz ("edx",&HB("eax"));
&$func (@mm[6],&QWP(&row(6),$tbl,"esi",8));
&$func (@mm[7],&QWP(&row(7),$tbl,"edi",8));
# --- Second lookup pass: driven by the 64 bytes stored at esp+64 (S) ---
# Store the eight @mm registers produced so far back to esp+0; they stay live
# and are accumulated into below.
for($i=0;$i<8;$i++) { &movq(&QWP($i*8,"esp"),@mm[$i]); } # K=L
# Same byte-by-byte table lookup pattern as the loop before it, except that
# every register is updated with pxor and the lanes are read from esp+64
# onwards. eax/ebx/ecx/edx already hold the first S lane and its first two
# bytes: the final iteration of the preceding loop loaded them from
# esp+64/esp+68.
# NOTE(review): the closing brace of this loop (and anything else that may
# precede it) is not visible in this excerpt.
for($i=0;$i<8;$i++) {
&shr ("eax",16);
&scale ("esi","ecx");
&movz ("ecx",&LB("eax"));
&scale ("edi","edx");
&movz ("edx",&HB("eax"));
&pxor (@mm[0],&QWP(&row(0),$tbl,"esi",8));
&pxor (@mm[1],&QWP(&row(1),$tbl,"edi",8));
# No lane follows the eighth one, so the prefetch is skipped when $i==7.
&mov ("eax",&DWP(64+($i+1)*8,"esp")) if ($i<7);
&scale ("esi","ecx");
&movz ("ecx",&LB("ebx"));
&scale ("edi","edx");
&movz ("edx",&HB("ebx"));
&pxor (@mm[2],&QWP(&row(2),$tbl,"esi",8));
&pxor (@mm[3],&QWP(&row(3),$tbl,"edi",8));
&shr ("ebx",16);
&scale ("esi","ecx");
&movz ("ecx",&LB("ebx"));
&scale ("edi","edx");
&movz ("edx",&HB("ebx"));
&pxor (@mm[4],&QWP(&row(4),$tbl,"esi",8));
&pxor (@mm[5],&QWP(&row(5),$tbl,"edi",8));
&mov ("ebx",&DWP(64+($i+1)*8+4,"esp")) if ($i<7);
&scale ("esi","ecx");
&movz ("ecx",&LB("eax"));
&scale ("edi","edx");
&movz ("edx",&HB("eax"));
&pxor (@mm[6],&QWP(&row(6),$tbl,"esi",8));
&pxor (@mm[7],&QWP(&row(7),$tbl,"edi",8));
# --- Round bookkeeping ---
# ebx was used as a data register by the lookup code; point it back at the
# parameter block (esp+128).
&lea ("ebx",&DWP(128,"esp"));
&mov ("esi",&DWP(12,"ebx")); # pull round counter
&add ("esi",1);
# Leave the round loop once the incremented counter reaches 10.
&cmp ("esi",10);
&je (&label("roundsdone"));
&mov (&DWP(12,"ebx"),"esi"); # update round counter
for($i=0;$i<8;$i++) { &movq(&QWP(64+$i*8,"esp"),@mm[$i]); } # S=L
# NOTE(review): the definitions of the "round" and "roundsdone" labels are not
# visible in this excerpt.
&jmp (&label("round"));
# --- After the last round: fold the result into the hash value and advance ---
# NOTE(review): the "roundsdone" label presumably precedes this code; its
# definition is not visible in this excerpt.
&mov ("esi",&DWP(0,"ebx")); # reload argument block
&mov ("edi",&DWP(4,"ebx"));
&mov ("eax",&DWP(8,"ebx"));
# Xor the @mm registers with the 64 input bytes and with the 64 bytes of the
# current hash value, then write the result back over the hash value.
for($i=0;$i<8;$i++) { &pxor(@mm[$i],&QWP($i*8,"edi")); } # L^=inp
for($i=0;$i<8;$i++) { &pxor(@mm[$i],&QWP($i*8,"esi")); } # L^=H
for($i=0;$i<8;$i++) { &movq(&QWP($i*8,"esi"),@mm[$i]); } # H=L
&lea ("edi",&DWP(64,"edi")); # inp+=64
&sub ("eax",1); # num--
# Finished when the chunk count reaches zero; otherwise save the advanced
# input pointer and remaining count and go round again.
&jz (&label("alldone"));
&mov (&DWP(4,"ebx"),"edi"); # update argument block
&mov (&DWP(8,"ebx"),"eax");
# NOTE(review): the definitions of the "outerloop" and "alldone" labels are not
# visible in this excerpt.
&jmp (&label("outerloop"));
# --- Epilogue ---
# Emit emms, then undo the prologue: reload the stack pointer saved at ebx+16
# and pop the four registers in the reverse of the order they were pushed.
&emms ();
&mov ("esp",&DWP(16,"ebx")); # restore saved stack pointer
&pop ("edi");
&pop ("esi");
&pop ("ebx");
&pop ("ebp");
&ret ();
# Pass eight constant byte values to L(), which forwards them to &data_byte.
# NOTE(review): the label and any table data that belong around this line are
# not visible in this excerpt.
&L(0x18,0x23,0xc6,0xe8,0x87,0xb8,0x01,0x4f); # rc[ROUNDS]