This update implements the following improvements.

1. The original submission required a minor modification to RC4_set_key,
   which we don't want to tolerate; therefore we fix the assembler instead.
2. Eliminate remaining byte-order dependence [look for RC4_BIG_ENDIAN].
3. Eliminate logical error [key->x was referenced before key was verified].
4. HP-UX assembler choked on the MODSCHED_RC4 macro with "syntax error,"
   so the macro had to be split in two.
5. Deploy parallel compare in function prologue.
6. Eliminate redundant instructions and nops.
7. Eliminate assembler warnings.
This commit is contained in:
Andy Polyakov 2005-07-18 17:11:13 +00:00
parent 02703c74a4
commit 4ac210c16a

View File

@ -194,21 +194,13 @@ $threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
sub I { sub I {
local *code = shift; local *code = shift;
local $format = shift; local $format = shift;
local $a0 = shift; $code .= sprintf ("\t\t".$format."\n", @_);
local $a1 = shift;
local $a2 = shift;
local $a3 = shift;
$code .= sprintf ("\t\t".$format."\n", $a0, $a1, $a2, $a3);
} }
sub P { sub P {
local *code = shift; local *code = shift;
local $format = shift; local $format = shift;
local $a0 = shift; $code .= sprintf ($format."\n", @_);
local $a1 = shift;
local $a2 = shift;
local $a3 = shift;
$code .= sprintf ($format."\n", $a0, $a1, $a2, $a3);
} }
sub STOP { sub STOP {
@ -239,6 +231,10 @@ sub emit_body {
___ ___
if (($p & 0xf) == 0) { if (($p & 0xf) == 0) {
$c.="#ifdef RC4_BIG_ENDIAN\n";
&I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;",
$iw1 % $NOutWord, $iw1 % $NOutWord);
$c.="#endif\n";
&I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord); &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
return; return;
} }
@ -311,6 +307,7 @@ ___
&I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI); &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
&I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI); &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
&I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label); &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
&I(\$bypass, ";;");
} }
} }
@ -394,10 +391,11 @@ $code=<<___;
/* Define a macro for the bit number of the n-th byte: */ /* Define a macro for the bit number of the n-th byte: */
#ifdef L_ENDIAN #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
# define BYTE_POS(n) (8 * (n)) # define RC4_BIG_ENDIAN
#else
# define BYTE_POS(n) (56 - (8 * (n))) # define BYTE_POS(n) (56 - (8 * (n)))
#else
# define BYTE_POS(n) (8 * (n))
#endif #endif
/* /*
@ -406,8 +404,9 @@ $code=<<___;
will never be taken since regardless of the number of bytes because will never be taken since regardless of the number of bytes because
the epilogue count is 4. the epilogue count is 4.
*/ */
/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
#define MODSCHED_RC4(label) \\ assembler failed on original macro with syntax error. <appro> */
#define MODSCHED_RC4_PROLOGUE \\
{ \\ { \\
ld1 Data[0] = [InPtr], 1; \\ ld1 Data[0] = [InPtr], 1; \\
add IFinal = 1, I[1]; \\ add IFinal = 1, I[1]; \\
@ -421,8 +420,9 @@ $code=<<___;
{ \\ { \\
add J = J, SI[0]; \\ add J = J, SI[0]; \\
zxt1 I[0] = IFinal; \\ zxt1 I[0] = IFinal; \\
br.cexit.spnt.few label; /* never taken */ \\ br.cexit.spnt.few .+16; /* never taken */ \\
} ;; \\ } ;;
#define MODSCHED_RC4_LOOP(label) \\
label: \\ label: \\
{ .mmi; \\ { .mmi; \\
(pComputeI) ld1 Data[0] = [InPtr], 1; \\ (pComputeI) ld1 Data[0] = [InPtr], 1; \\
@ -476,63 +476,42 @@ RC4:
OutWord[2] OutWord[2]
.rotp pPhase[4] .rotp pPhase[4]
#ifdef _LP64
add InPrefetch = 0, InputBuffer
nop 0x0
}
#else
ADDP InputBuffer = 0, InputBuffer
ADDP StateTable = 0, StateTable
}
;;
{
ADDP InPrefetch = 0, InputBuffer ADDP InPrefetch = 0, InputBuffer
ADDP OutputBuffer = 0, OutputBuffer ADDP KTable = 0, StateTable
nop 0x0 }
{
.mmi
ADDP InPtr = 0, InputBuffer
ADDP OutPtr = 0, OutputBuffer
mov RetVal = r0
} }
#endif
;; ;;
{ {
.mmi .mmi
lfetch.nt1 [InPrefetch], 0x80 lfetch.nt1 [InPrefetch], 0x80
LKEY I[1] = [StateTable], SZ ADDP OutPrefetch = 0, OutputBuffer
mov OutPrefetch = OutputBuffer
} ;;
{
.mii
nop 0x0
nop 0x0
mov RetVal = r0
} }
{ // Return 0 if the input length is nonsensical { // Return 0 if the input length is nonsensical
.mib .mib
nop 0x0 ADDP StateTable = 0, StateTable
cmp.ge L_NOK, L_OK = r0, DataLen cmp.ge.unc L_NOK, L_OK = r0, DataLen
(L_NOK) br.ret.sptk.few rp (L_NOK) br.ret.sptk.few rp
} }
;; ;;
{ {
.mib .mib
cmp.eq.or L_NOK, L_OK = r0, InPtr
cmp.eq.or L_NOK, L_OK = r0, OutPtr
nop 0x0
}
{
.mib
cmp.eq.or L_NOK, L_OK = r0, StateTable
nop 0x0 nop 0x0
cmp.eq L_NOK, L_OK = r0, InputBuffer
(L_NOK) br.ret.sptk.few rp (L_NOK) br.ret.sptk.few rp
} }
;; ;;
{ LKEY I[1] = [KTable], SZ
.mib
nop 0x0
cmp.eq L_NOK, L_OK = r0, OutputBuffer
(L_NOK) br.ret.sptk.few rp
}
;;
{
.mib
nop 0x0
cmp.eq L_NOK, L_OK = r0, StateTable
(L_NOK) br.ret.sptk.few rp
}
/* Prefetch the state-table. It contains 256 elements of size SZ */ /* Prefetch the state-table. It contains 256 elements of size SZ */
#if SZ == 1 #if SZ == 1
@ -568,8 +547,12 @@ RC4:
lfetch.fault.nt1 [tmp0], -256 // 3 lfetch.fault.nt1 [tmp0], -256 // 3
lfetch.fault.nt1 [tmp1], -256;; lfetch.fault.nt1 [tmp1], -256;;
#endif #endif
{
.mii
lfetch.fault.nt1 [tmp0] // 1 lfetch.fault.nt1 [tmp0] // 1
add I[1]=1,I[1];;
zxt1 I[1]=I[1]
}
{ {
.mmi .mmi
lfetch.nt1 [InPrefetch], 0x80 lfetch.nt1 [InPrefetch], 0x80
@ -580,19 +563,13 @@ RC4:
{ {
.mmi .mmi
lfetch.excl.nt1 [OutPrefetch], 0x80 lfetch.excl.nt1 [OutPrefetch], 0x80
LKEY J = [StateTable], SZ LKEY J = [KTable], SZ
ADDP EndPtr = DataLen, InputBuffer ADDP EndPtr = DataLen, InPtr
} ;; } ;;
{ {
.mmi .mmi
mov InPtr = InputBuffer
mov OutPtr = OutputBuffer
ADDP EndPtr = -1, EndPtr // Make it point to ADDP EndPtr = -1, EndPtr // Make it point to
// last data byte. // last data byte.
} ;;
{
.mii
mov KTable = StateTable
mov One = 1 mov One = 1
.save ar.lc, LCSave .save ar.lc, LCSave
mov LCSave = ar.lc mov LCSave = ar.lc
@ -614,6 +591,7 @@ RC4:
} ;; } ;;
{ {
.mmb .mmb
.pred.rel "mutex",pUnaligned,pAligned
(pUnaligned) add Remainder = -1, Remainder (pUnaligned) add Remainder = -1, Remainder
(pAligned) sub Remainder = EndPtr, InPtr (pAligned) sub Remainder = EndPtr, InPtr
(pAligned) br.cond.dptk.many .rc4Aligned (pAligned) br.cond.dptk.many .rc4Aligned
@ -628,7 +606,8 @@ RC4:
/* Do the initial few bytes via the compact, modulo-scheduled loop /* Do the initial few bytes via the compact, modulo-scheduled loop
until the output pointer is 8-byte-aligned. */ until the output pointer is 8-byte-aligned. */
MODSCHED_RC4(.RC4AlignLoop) MODSCHED_RC4_PROLOGUE
MODSCHED_RC4_LOOP(.RC4AlignLoop)
{ {
.mib .mib
@ -671,13 +650,7 @@ RC4:
} ;; } ;;
{ {
.mmi .mmi
getf.sig LoopCount = f6 // M2 5 cyc getf.sig LoopCount = f6;; // M2 5 cyc
nop 0x0
nop 0x0
} ;;
{
.mmi
nop 0x0
nop 0x0 nop 0x0
shr.u LoopCount = LoopCount, 4 shr.u LoopCount = LoopCount, 4
} ;; } ;;
@ -747,32 +720,26 @@ $code.=<<___;
/* Do the remaining bytes via the compact, modulo-scheduled loop */ /* Do the remaining bytes via the compact, modulo-scheduled loop */
MODSCHED_RC4(.RC4RestLoop) MODSCHED_RC4_PROLOGUE
MODSCHED_RC4_LOOP(.RC4RestLoop)
{
.mmi
nop 0x0
nop 0x0
zxt1 IFinal = IFinal
} ;;
.rc4Complete: .rc4Complete:
{ {
.mmi .mmi
ADDP KTable = -2*SZ, KTable ;; add KTable = -SZ, KTable
SKEY [KTable] = IFinal, SZ add IFinal = -1, IFinal
mov ar.lc = LCSave mov ar.lc = LCSave
} ;; } ;;
{ {
.mii .mii
nop 0x0 SKEY [KTable] = J,-SZ
nop 0x0 zxt1 IFinal = IFinal
add RetVal = 1, r0 mov pr = PRSave, 0x1FFFF
} } ;;
{ {
.mib .mib
SKEY [KTable] = J SKEY [KTable] = IFinal
mov pr = PRSave, 0x1FFFF add RetVal = 1, r0
br.ret.sptk.few rp br.ret.sptk.few rp
} ;; } ;;
___ ___