This update implements the following improvements.
1. The original submission required a minor modification to RC4_set_key, which we don't want to tolerate; we therefore fix the assembler instead. 2. Eliminate the remaining byte-order dependence [look for RC4_BIG_ENDIAN]. 3. Eliminate a logical error [key->x was referenced before the key was verified]. 4. The HP-UX assembler choked on the MODSCHED_RC4 macro with "syntax error"; the macro had to be split in two. 5. Deploy parallel compare in the function prologue. 6. Eliminate redundant instructions and nops. 7. Eliminate assembler warnings.
This commit is contained in:
parent
02703c74a4
commit
4ac210c16a
@ -194,21 +194,13 @@ $threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
|
|||||||
sub I {
|
sub I {
|
||||||
local *code = shift;
|
local *code = shift;
|
||||||
local $format = shift;
|
local $format = shift;
|
||||||
local $a0 = shift;
|
$code .= sprintf ("\t\t".$format."\n", @_);
|
||||||
local $a1 = shift;
|
|
||||||
local $a2 = shift;
|
|
||||||
local $a3 = shift;
|
|
||||||
$code .= sprintf ("\t\t".$format."\n", $a0, $a1, $a2, $a3);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sub P {
|
sub P {
|
||||||
local *code = shift;
|
local *code = shift;
|
||||||
local $format = shift;
|
local $format = shift;
|
||||||
local $a0 = shift;
|
$code .= sprintf ($format."\n", @_);
|
||||||
local $a1 = shift;
|
|
||||||
local $a2 = shift;
|
|
||||||
local $a3 = shift;
|
|
||||||
$code .= sprintf ($format."\n", $a0, $a1, $a2, $a3);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
sub STOP {
|
sub STOP {
|
||||||
@ -239,6 +231,10 @@ sub emit_body {
|
|||||||
___
|
___
|
||||||
|
|
||||||
if (($p & 0xf) == 0) {
|
if (($p & 0xf) == 0) {
|
||||||
|
$c.="#ifdef RC4_BIG_ENDIAN\n";
|
||||||
|
&I(\$c,"shr.u OutWord[%u] = OutWord[%u], 32;;",
|
||||||
|
$iw1 % $NOutWord, $iw1 % $NOutWord);
|
||||||
|
$c.="#endif\n";
|
||||||
&I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
|
&I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -311,6 +307,7 @@ ___
|
|||||||
&I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
|
&I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
|
||||||
&I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
|
&I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
|
||||||
&I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
|
&I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
|
||||||
|
&I(\$bypass, ";;");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -394,10 +391,11 @@ $code=<<___;
|
|||||||
|
|
||||||
/* Define a macro for the bit number of the n-th byte: */
|
/* Define a macro for the bit number of the n-th byte: */
|
||||||
|
|
||||||
#ifdef L_ENDIAN
|
#if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
|
||||||
# define BYTE_POS(n) (8 * (n))
|
# define RC4_BIG_ENDIAN
|
||||||
#else
|
|
||||||
# define BYTE_POS(n) (56 - (8 * (n)))
|
# define BYTE_POS(n) (56 - (8 * (n)))
|
||||||
|
#else
|
||||||
|
# define BYTE_POS(n) (8 * (n))
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -406,8 +404,9 @@ $code=<<___;
|
|||||||
will never be taken since regardless of the number of bytes because
|
will never be taken since regardless of the number of bytes because
|
||||||
the epilogue count is 4.
|
the epilogue count is 4.
|
||||||
*/
|
*/
|
||||||
|
/* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
|
||||||
#define MODSCHED_RC4(label) \\
|
assembler failed on original macro with syntax error. <appro> */
|
||||||
|
#define MODSCHED_RC4_PROLOGUE \\
|
||||||
{ \\
|
{ \\
|
||||||
ld1 Data[0] = [InPtr], 1; \\
|
ld1 Data[0] = [InPtr], 1; \\
|
||||||
add IFinal = 1, I[1]; \\
|
add IFinal = 1, I[1]; \\
|
||||||
@ -421,8 +420,9 @@ $code=<<___;
|
|||||||
{ \\
|
{ \\
|
||||||
add J = J, SI[0]; \\
|
add J = J, SI[0]; \\
|
||||||
zxt1 I[0] = IFinal; \\
|
zxt1 I[0] = IFinal; \\
|
||||||
br.cexit.spnt.few label; /* never taken */ \\
|
br.cexit.spnt.few .+16; /* never taken */ \\
|
||||||
} ;; \\
|
} ;;
|
||||||
|
#define MODSCHED_RC4_LOOP(label) \\
|
||||||
label: \\
|
label: \\
|
||||||
{ .mmi; \\
|
{ .mmi; \\
|
||||||
(pComputeI) ld1 Data[0] = [InPtr], 1; \\
|
(pComputeI) ld1 Data[0] = [InPtr], 1; \\
|
||||||
@ -476,63 +476,42 @@ RC4:
|
|||||||
OutWord[2]
|
OutWord[2]
|
||||||
.rotp pPhase[4]
|
.rotp pPhase[4]
|
||||||
|
|
||||||
#ifdef _LP64
|
|
||||||
add InPrefetch = 0, InputBuffer
|
|
||||||
nop 0x0
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
ADDP InputBuffer = 0, InputBuffer
|
|
||||||
ADDP StateTable = 0, StateTable
|
|
||||||
}
|
|
||||||
;;
|
|
||||||
{
|
|
||||||
ADDP InPrefetch = 0, InputBuffer
|
ADDP InPrefetch = 0, InputBuffer
|
||||||
ADDP OutputBuffer = 0, OutputBuffer
|
ADDP KTable = 0, StateTable
|
||||||
nop 0x0
|
}
|
||||||
|
{
|
||||||
|
.mmi
|
||||||
|
ADDP InPtr = 0, InputBuffer
|
||||||
|
ADDP OutPtr = 0, OutputBuffer
|
||||||
|
mov RetVal = r0
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
;;
|
;;
|
||||||
{
|
{
|
||||||
.mmi
|
.mmi
|
||||||
lfetch.nt1 [InPrefetch], 0x80
|
lfetch.nt1 [InPrefetch], 0x80
|
||||||
LKEY I[1] = [StateTable], SZ
|
ADDP OutPrefetch = 0, OutputBuffer
|
||||||
mov OutPrefetch = OutputBuffer
|
|
||||||
} ;;
|
|
||||||
{
|
|
||||||
.mii
|
|
||||||
nop 0x0
|
|
||||||
nop 0x0
|
|
||||||
mov RetVal = r0
|
|
||||||
}
|
}
|
||||||
{ // Return 0 if the input length is nonsensical
|
{ // Return 0 if the input length is nonsensical
|
||||||
.mib
|
.mib
|
||||||
nop 0x0
|
ADDP StateTable = 0, StateTable
|
||||||
cmp.ge L_NOK, L_OK = r0, DataLen
|
cmp.ge.unc L_NOK, L_OK = r0, DataLen
|
||||||
(L_NOK) br.ret.sptk.few rp
|
(L_NOK) br.ret.sptk.few rp
|
||||||
}
|
}
|
||||||
;;
|
;;
|
||||||
{
|
{
|
||||||
.mib
|
.mib
|
||||||
|
cmp.eq.or L_NOK, L_OK = r0, InPtr
|
||||||
|
cmp.eq.or L_NOK, L_OK = r0, OutPtr
|
||||||
|
nop 0x0
|
||||||
|
}
|
||||||
|
{
|
||||||
|
.mib
|
||||||
|
cmp.eq.or L_NOK, L_OK = r0, StateTable
|
||||||
nop 0x0
|
nop 0x0
|
||||||
cmp.eq L_NOK, L_OK = r0, InputBuffer
|
|
||||||
(L_NOK) br.ret.sptk.few rp
|
(L_NOK) br.ret.sptk.few rp
|
||||||
}
|
}
|
||||||
;;
|
;;
|
||||||
{
|
LKEY I[1] = [KTable], SZ
|
||||||
.mib
|
|
||||||
nop 0x0
|
|
||||||
cmp.eq L_NOK, L_OK = r0, OutputBuffer
|
|
||||||
(L_NOK) br.ret.sptk.few rp
|
|
||||||
}
|
|
||||||
;;
|
|
||||||
{
|
|
||||||
.mib
|
|
||||||
nop 0x0
|
|
||||||
cmp.eq L_NOK, L_OK = r0, StateTable
|
|
||||||
(L_NOK) br.ret.sptk.few rp
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Prefetch the state-table. It contains 256 elements of size SZ */
|
/* Prefetch the state-table. It contains 256 elements of size SZ */
|
||||||
|
|
||||||
#if SZ == 1
|
#if SZ == 1
|
||||||
@ -568,8 +547,12 @@ RC4:
|
|||||||
lfetch.fault.nt1 [tmp0], -256 // 3
|
lfetch.fault.nt1 [tmp0], -256 // 3
|
||||||
lfetch.fault.nt1 [tmp1], -256;;
|
lfetch.fault.nt1 [tmp1], -256;;
|
||||||
#endif
|
#endif
|
||||||
|
{
|
||||||
|
.mii
|
||||||
lfetch.fault.nt1 [tmp0] // 1
|
lfetch.fault.nt1 [tmp0] // 1
|
||||||
|
add I[1]=1,I[1];;
|
||||||
|
zxt1 I[1]=I[1]
|
||||||
|
}
|
||||||
{
|
{
|
||||||
.mmi
|
.mmi
|
||||||
lfetch.nt1 [InPrefetch], 0x80
|
lfetch.nt1 [InPrefetch], 0x80
|
||||||
@ -580,19 +563,13 @@ RC4:
|
|||||||
{
|
{
|
||||||
.mmi
|
.mmi
|
||||||
lfetch.excl.nt1 [OutPrefetch], 0x80
|
lfetch.excl.nt1 [OutPrefetch], 0x80
|
||||||
LKEY J = [StateTable], SZ
|
LKEY J = [KTable], SZ
|
||||||
ADDP EndPtr = DataLen, InputBuffer
|
ADDP EndPtr = DataLen, InPtr
|
||||||
} ;;
|
} ;;
|
||||||
{
|
{
|
||||||
.mmi
|
.mmi
|
||||||
mov InPtr = InputBuffer
|
|
||||||
mov OutPtr = OutputBuffer
|
|
||||||
ADDP EndPtr = -1, EndPtr // Make it point to
|
ADDP EndPtr = -1, EndPtr // Make it point to
|
||||||
// last data byte.
|
// last data byte.
|
||||||
} ;;
|
|
||||||
{
|
|
||||||
.mii
|
|
||||||
mov KTable = StateTable
|
|
||||||
mov One = 1
|
mov One = 1
|
||||||
.save ar.lc, LCSave
|
.save ar.lc, LCSave
|
||||||
mov LCSave = ar.lc
|
mov LCSave = ar.lc
|
||||||
@ -614,6 +591,7 @@ RC4:
|
|||||||
} ;;
|
} ;;
|
||||||
{
|
{
|
||||||
.mmb
|
.mmb
|
||||||
|
.pred.rel "mutex",pUnaligned,pAligned
|
||||||
(pUnaligned) add Remainder = -1, Remainder
|
(pUnaligned) add Remainder = -1, Remainder
|
||||||
(pAligned) sub Remainder = EndPtr, InPtr
|
(pAligned) sub Remainder = EndPtr, InPtr
|
||||||
(pAligned) br.cond.dptk.many .rc4Aligned
|
(pAligned) br.cond.dptk.many .rc4Aligned
|
||||||
@ -628,7 +606,8 @@ RC4:
|
|||||||
/* Do the initial few bytes via the compact, modulo-scheduled loop
|
/* Do the initial few bytes via the compact, modulo-scheduled loop
|
||||||
until the output pointer is 8-byte-aligned. */
|
until the output pointer is 8-byte-aligned. */
|
||||||
|
|
||||||
MODSCHED_RC4(.RC4AlignLoop)
|
MODSCHED_RC4_PROLOGUE
|
||||||
|
MODSCHED_RC4_LOOP(.RC4AlignLoop)
|
||||||
|
|
||||||
{
|
{
|
||||||
.mib
|
.mib
|
||||||
@ -671,13 +650,7 @@ RC4:
|
|||||||
} ;;
|
} ;;
|
||||||
{
|
{
|
||||||
.mmi
|
.mmi
|
||||||
getf.sig LoopCount = f6 // M2 5 cyc
|
getf.sig LoopCount = f6;; // M2 5 cyc
|
||||||
nop 0x0
|
|
||||||
nop 0x0
|
|
||||||
} ;;
|
|
||||||
{
|
|
||||||
.mmi
|
|
||||||
nop 0x0
|
|
||||||
nop 0x0
|
nop 0x0
|
||||||
shr.u LoopCount = LoopCount, 4
|
shr.u LoopCount = LoopCount, 4
|
||||||
} ;;
|
} ;;
|
||||||
@ -747,32 +720,26 @@ $code.=<<___;
|
|||||||
|
|
||||||
/* Do the remaining bytes via the compact, modulo-scheduled loop */
|
/* Do the remaining bytes via the compact, modulo-scheduled loop */
|
||||||
|
|
||||||
MODSCHED_RC4(.RC4RestLoop)
|
MODSCHED_RC4_PROLOGUE
|
||||||
|
MODSCHED_RC4_LOOP(.RC4RestLoop)
|
||||||
{
|
|
||||||
.mmi
|
|
||||||
nop 0x0
|
|
||||||
nop 0x0
|
|
||||||
zxt1 IFinal = IFinal
|
|
||||||
} ;;
|
|
||||||
|
|
||||||
.rc4Complete:
|
.rc4Complete:
|
||||||
{
|
{
|
||||||
.mmi
|
.mmi
|
||||||
ADDP KTable = -2*SZ, KTable ;;
|
add KTable = -SZ, KTable
|
||||||
SKEY [KTable] = IFinal, SZ
|
add IFinal = -1, IFinal
|
||||||
mov ar.lc = LCSave
|
mov ar.lc = LCSave
|
||||||
} ;;
|
} ;;
|
||||||
{
|
{
|
||||||
.mii
|
.mii
|
||||||
nop 0x0
|
SKEY [KTable] = J,-SZ
|
||||||
nop 0x0
|
zxt1 IFinal = IFinal
|
||||||
add RetVal = 1, r0
|
mov pr = PRSave, 0x1FFFF
|
||||||
}
|
} ;;
|
||||||
{
|
{
|
||||||
.mib
|
.mib
|
||||||
SKEY [KTable] = J
|
SKEY [KTable] = IFinal
|
||||||
mov pr = PRSave, 0x1FFFF
|
add RetVal = 1, r0
|
||||||
br.ret.sptk.few rp
|
br.ret.sptk.few rp
|
||||||
} ;;
|
} ;;
|
||||||
___
|
___
|
||||||
|
Loading…
x
Reference in New Issue
Block a user