diff --git a/crypto/des/asm/des_enc.m4 b/crypto/des/asm/des_enc.m4 new file mode 100644 index 000000000..8db30f33c --- /dev/null +++ b/crypto/des/asm/des_enc.m4 @@ -0,0 +1,1815 @@ +! des_enc.m4 +! des_enc.S (generated from des_enc.m4) +! +! UltraSPARC assembler version of the LibDES/SSLeay/OpenSSL des_enc.c file. +! +! Version 1.0. 32-bit version. +! +! June 8, 2000. +! +! Assembler version: Copyright Svend Olaf Mikkelsen. +! +! Original C code: Copyright Eric A. Young. +! +! This code can be freely used by LibDES/SSLeay/OpenSSL users. +! +! The LibDES/SSLeay/OpenSSL copyright notices must be respected. +! +! This version can be redistributed. +! +! To expand the m4 macros: m4 -B 8192 des_enc.m4 > des_enc.S +! +! Global registers 1 to 5 are used. This is the same as done by the +! cc compiler. The UltraSPARC load/store little endian feature is used. +! +! Instruction grouping often refers to one CPU cycle. +! +! Assemble through gcc: gcc -c -mcpu=ultrasparc -o des_enc.o des_enc.S +! +! Assemble through cc: cc -c -xarch=v8plusa -o des_enc.o des_enc.S + + +.ident "des_enc.m4 1.0" + +#define LOOPS 7 + +#define global0 %g0 +#define global1 %g1 +#define global2 %g2 +#define global3 %g3 +#define global4 %g4 +#define global5 %g5 + +#define local0 %l0 +#define local1 %l1 +#define local2 %l2 +#define local3 %l3 +#define local4 %l4 +#define local5 %l5 +#define local7 %l6 +#define local6 %l7 + +#define in0 %i0 +#define in1 %i1 +#define in2 %i2 +#define in3 %i3 +#define in4 %i4 +#define in5 %i5 +#define in6 %i6 +#define in7 %i7 + +#define out0 %o0 +#define out1 %o1 +#define out2 %o2 +#define out3 %o3 +#define out4 %o4 +#define out5 %o5 +#define out6 %o6 +#define out7 %o7 + + +changequote({,}) + + +! Macro definitions: + + +! {ip_macro} +! +! The logic used in initial and final permutations is the same as in +! the C code. The permutations are done with a clever shift, xor, and +! technique. +! +! The macro also loads address sbox 1 to 5 to global 1 to 5, address +! sbox 6 to local6, and addres sbox 8 to out3. +! +! Rotates the halfs 3 left to bring the sbox bits in convenient positions. +! +! Loads key first round from address in parameter 5 to out0, out1. +! +! After the the original LibDES initial permutation, the resulting left +! is in the variable initially used for right and vice versa. The macro +! implements the possibility to keep the halfs in the original registers. +! +! parameter 1 left +! parameter 2 right +! parameter 3 result left (modify in first round) +! parameter 4 result right (use in first round) +! parameter 5 key address +! parameter 6 1/2 for include encryption/decryption +! parameter 7 1 for move in1 to in3 +! parameter 8 1 for move in3 to in4, 2 for move in4 to in3 +! parameter 9 1 for load ks3 and ks2 to in4 and in3 + +define(ip_macro, { + +! {ip_macro} +! $1 $2 $4 $3 $5 $6 $7 $8 $9 + + ld [out2+256], local1 + srl $2, 4, local4 + + xor local4, $1, local4 + ifelse($7,1,{mov in1, in3},{nop}) + + ld [out2+260], local2 + and local4, local1, local4 + ifelse($8,1,{mov in3, in4},{}) + ifelse($8,2,{mov in4, in3},{}) + + ld [out2+280], out4 ! loop counter + sll local4, 4, local1 + xor $1, local4, $1 + + ld [out2+264], local3 + srl $1, 16, local4 + xor $2, local1, $2 + + ifelse($9,1,{ld KS3, in4},{}) + xor local4, $2, local4 + sethi %hi(des_SPtrans), global1 ! sbox addr + + ifelse($9,1,{ld KS2, in3},{}) + and local4, local2, local4 + or global1, %lo(des_SPtrans), global1 ! sbox addr + + sll local4, 16, local1 + xor $2, local4, $2 + + srl $2, 2, local4 + xor $1, local1, $1 + + sethi %hi(16711680), local5 + xor local4, $1, local4 + + and local4, local3, local4 + or local5, 255, local5 + + sll local4, 2, local2 + xor $1, local4, $1 + + srl $1, 8, local4 + xor $2, local2, $2 + + xor local4, $2, local4 + add global1, 768, global4 + + and local4, local5, local4 + add global1, 1024, global5 + + ld [out2+272], local7 + sll local4, 8, local1 + xor $2, local4, $2 + + srl $2, 1, local4 + xor $1, local1, $1 + + ld [$5], out0 ! key 7531 + xor local4, $1, local4 + add global1, 256, global2 + + ld [$5+4], out1 ! key 8642 + and local4, local7, local4 + add global1, 512, global3 + + sll local4, 1, local1 + xor $1, local4, $1 + + sll $1, 3, local3 + xor $2, local1, $2 + + sll $2, 3, local2 + add global1, 1280, local6 ! address sbox 8 + + srl $1, 29, local4 + add global1, 1792, out3 ! address sbox 8 + + srl $2, 29, local1 + or local4, local3, $4 + + or local2, local1, $3 + + ifelse($6, 1, { + + ld [out2+284], local5 ! 0x0000FC00 used in the rounds + or local2, local1, $3 + xor $4, out0, local1 + + call .des_enc.1 + and local1, 252, local1 + + },{}) + + ifelse($6, 2, { + + ld [out2+284], local5 ! 0x0000FC00 used in the rounds + or local2, local1, $3 + xor $4, out0, local1 + + call .des_dec.1 + and local1, 252, local1 + + },{}) +}) + + +! {rounds_macro} +! +! The logic used in the DES rounds is the same as in the C code, +! except that calculations for sbox 1 and sbox 5 begin before +! the previous round is finished. +! +! In each round one half (work) is modified based on key and the +! other half (use). +! +! In this version we do two rounds in a loop repeated 7 times +! and two rounds seperately. +! +! One half has the bits for the sboxes in the following positions: +! +! 777777xx555555xx333333xx111111xx +! +! 88xx666666xx444444xx222222xx8888 +! +! The bits for each sbox are xor-ed with the key bits for that box. +! The above xx bits are cleared, and the result used for lookup in +! the sbox table. Each sbox entry contains the 4 output bits permuted +! into 32 bits according to the P permutation. +! +! In the description of DES, left and right are switched after +! each round, except after last round. In this code the original +! left and right are kept in the same register in all rounds, meaning +! that after the 16 rounds the result for right is in the register +! originally used for left. +! +! parameter 1 first work (left in first round) +! parameter 2 first use (right in first round) +! parameter 3 enc/dec 1/-1 +! parameter 4 loop label +! parameter 5 key address register +! parameter 6 optional address for key next encryption/decryption +! parameter 7 not empty for include retl +! +! also compares in2 to 8 + +define(rounds_macro, { + +! {rounds_macro} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + xor $2, out0, local1 + + ld [out2+284], local5 ! 0x0000FC00 + ba,pt %icc, $4 + and local1, 252, local1 + + .align 32 + +$4: + ! local6 is address sbox 6 + ! out3 is address sbox 8 + ! out4 is loop counter + + ld [global1+local1], local1 + xor $2, out1, out1 ! 8642 + xor $2, out0, out0 ! 7531 + fxor %f0, %f0, %f0 ! fxor used for alignment + + srl out1, 4, local0 ! rotate 4 right + and out0, local5, local3 ! 3 + fxor %f0, %f0, %f0 + + ld [$5+$3*8], local7 ! key 7531 next round + srl local3, 8, local3 ! 3 + and local0, 252, local2 ! 2 + fxor %f0, %f0, %f0 + + ld [global3+local3],local3 ! 3 + sll out1, 28, out1 ! rotate + xor $1, local1, $1 ! 1 finished, local1 now sbox 7 + + ld [global2+local2], local2 ! 2 + srl out0, 24, local1 ! 7 + or out1, local0, out1 ! rotate + + ldub [out2+local1], local1 ! 7 (and 0xFC) + srl out1, 24, local0 ! 8 + and out1, local5, local4 ! 4 + + ldub [out2+local0], local0 ! 8 (and 0xFC) + srl local4, 8, local4 ! 4 + xor $1, local2, $1 ! 2 finished local2 now sbox 6 + + ld [global4+local4],local4 ! 4 + srl out1, 16, local2 ! 6 + xor $1, local3, $1 ! 3 finished local3 now sbox 5 + + ld [out3+local0],local0 ! 8 + and local2, 252, local2 ! 6 + add global1, 1536, local5 ! address sbox 7 + + ld [local6+local2], local2 ! 6 + srl out0, 16, local3 ! 5 + xor $1, local4, $1 ! 4 finished + + ld [local5+local1],local1 ! 7 + and local3, 252, local3 ! 5 + xor $1, local0, $1 ! 8 finished + + ld [global5+local3],local3 ! 5 + xor $1, local2, $1 ! 6 finished + subcc out4, 1, out4 + + ld [$5+$3*8+4], out0 ! key 8642 next round + xor $1, local7, local2 ! sbox 5 next round + xor $1, local1, $1 ! 7 finished + + srl local2, 16, local2 ! sbox 5 next round + xor $1, local3, $1 ! 5 finished + + ld [$5+$3*16+4], out1 ! key 8642 next round again + and local2, 252, local2 ! sbox5 next round +! next round + xor $1, local7, local7 ! 7531 + + ld [global5+local2], local2 ! 5 + srl local7, 24, local3 ! 7 + xor $1, out0, out0 ! 8642 + + ldub [out2+local3], local3 ! 7 (and 0xFC) + srl out0, 4, local0 ! rotate 4 right + and local7, 252, local1 ! 1 + + sll out0, 28, out0 ! rotate + xor $2, local2, $2 ! 5 finished local2 used + + srl local0, 8, local4 ! 4 + and local0, 252, local2 ! 2 + ld [local5+local3], local3 ! 7 + + srl local0, 16, local5 ! 6 + or out0, local0, out0 ! rotate + ld [global2+local2], local2 ! 2 + + srl out0, 24, local0 + ld [$5+$3*16], out0 ! key 7531 next round + and local4, 252, local4 ! 4 + + and local5, 252, local5 ! 6 + ld [global4+local4], local4 ! 4 + xor $2, local3, $2 ! 7 finished local3 used + + and local0, 252, local0 ! 8 + ld [local6+local5], local5 ! 6 + xor $2, local2, $2 ! 2 finished local2 now sbox 3 + + srl local7, 8, local2 ! 3 start + ld [out3+local0], local0 ! 8 + xor $2, local4, $2 ! 4 finished + + and local2, 252, local2 ! 3 + ld [global1+local1], local1 ! 1 + xor $2, local5, $2 ! 6 finished local5 used + + ld [global3+local2], local2 ! 3 + xor $2, local0, $2 ! 8 finished + add $5, $3*16, $5 ! enc add 8, dec add -8 to key pointer + + ld [out2+284], local5 ! 0x0000FC00 + xor $2, out0, local4 ! sbox 1 next round + xor $2, local1, $2 ! 1 finished + + xor $2, local2, $2 ! 3 finished + bne,pt %icc, $4 + and local4, 252, local1 ! sbox 1 next round + +! two rounds more: + + ld [global1+local1], local1 + xor $2, out1, out1 + xor $2, out0, out0 + + srl out1, 4, local0 ! rotate + and out0, local5, local3 + + ld [$5+$3*8], local7 ! key 7531 + srl local3, 8, local3 + and local0, 252, local2 + + ld [global3+local3],local3 + sll out1, 28, out1 ! rotate + xor $1, local1, $1 ! 1 finished, local1 now sbox 7 + + ld [global2+local2], local2 + srl out0, 24, local1 + or out1, local0, out1 ! rotate + + ldub [out2+local1], local1 + srl out1, 24, local0 + and out1, local5, local4 + + ldub [out2+local0], local0 + srl local4, 8, local4 + xor $1, local2, $1 ! 2 finished local2 now sbox 6 + + ld [global4+local4],local4 + srl out1, 16, local2 + xor $1, local3, $1 ! 3 finished local3 now sbox 5 + + ld [out3+local0],local0 + and local2, 252, local2 + add global1, 1536, local5 ! address sbox 7 + + ld [local6+local2], local2 + srl out0, 16, local3 + xor $1, local4, $1 ! 4 finished + + ld [local5+local1],local1 + and local3, 252, local3 + xor $1, local0, $1 + + ld [global5+local3],local3 + xor $1, local2, $1 ! 6 finished + cmp in2, 8 + + ifelse($6,{}, {}, {ld [out2+280], out4}) ! loop counter + xor $1, local7, local2 ! sbox 5 next round + xor $1, local1, $1 ! 7 finished + + ld [$5+$3*8+4], out0 + srl local2, 16, local2 ! sbox 5 next round + xor $1, local3, $1 ! 5 finished + + and local2, 252, local2 +! next round (two rounds more) + xor $1, local7, local7 ! 7531 + + ld [global5+local2], local2 + srl local7, 24, local3 + xor $1, out0, out0 ! 8642 + + ldub [out2+local3], local3 + srl out0, 4, local0 ! rotate + and local7, 252, local1 + + sll out0, 28, out0 ! rotate + xor $2, local2, $2 ! 5 finished local2 used + + srl local0, 8, local4 + and local0, 252, local2 + ld [local5+local3], local3 + + srl local0, 16, local5 + or out0, local0, out0 ! rotate + ld [global2+local2], local2 + + srl out0, 24, local0 + ifelse($6,{}, {}, {ld [$6], out0}) ! key next encryption/decryption + and local4, 252, local4 + + and local5, 252, local5 + ld [global4+local4], local4 + xor $2, local3, $2 ! 7 finished local3 used + + and local0, 252, local0 + ld [local6+local5], local5 + xor $2, local2, $2 ! 2 finished local2 now sbox 3 + + srl local7, 8, local2 ! 3 start + ld [out3+local0], local0 + xor $2, local4, $2 + + and local2, 252, local2 + ld [global1+local1], local1 + xor $2, local5, $2 ! 6 finished local5 used + + ld [global3+local2], local2 + srl $1, 3, local3 + xor $2, local0, $2 + + ifelse($6,{}, {}, {ld [$6+4], out1}) ! key next encryption/decryption + sll $1, 29, local4 + xor $2, local1, $2 + + ifelse($7,{}, {}, {retl}) + xor $2, local2, $2 +}) + + +! {fp_macro} +! +! parameter 1 right (original left) +! parameter 2 left (original right) +! parameter 3 1 for optional store to [in0] +! parameter 4 1 for load input/output address to local5/7 +! +! The final permutation logic switches the halfes, meaning that +! left and right ends up the the registers originally used. + +define(fp_macro, { + +! {fp_macro} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + ! initially undo the rotate 3 left done after initial permutation + ! original left is received shifted 3 right and 29 left in local3/4 + + sll $2, 29, local1 + or local3, local4, $1 + + srl $2, 3, $2 + sethi %hi(0x55555555), local2 + + or $2, local1, $2 + or local2, %lo(0x55555555), local2 + + srl $2, 1, local3 + sethi %hi(0x00ff00ff), local1 + xor local3, $1, local3 + or local1, %lo(0x00ff00ff), local1 + and local3, local2, local3 + sethi %hi(0x33333333), local4 + sll local3, 1, local2 + + xor $1, local3, $1 + + srl $1, 8, local3 + xor $2, local2, $2 + xor local3, $2, local3 + or local4, %lo(0x33333333), local4 + and local3, local1, local3 + sethi %hi(0x0000ffff), local1 + sll local3, 8, local2 + + xor $2, local3, $2 + + srl $2, 2, local3 + xor $1, local2, $1 + xor local3, $1, local3 + or local1, %lo(0x0000ffff), local1 + and local3, local4, local3 + sethi %hi(0x0f0f0f0f), local4 + sll local3, 2, local2 + + ifelse($4,1, {ld INPUT, local5}) + xor $1, local3, $1 + + ifelse($4,1, {ld OUTPUT, local7}) + srl $1, 16, local3 + xor $2, local2, $2 + xor local3, $2, local3 + or local4, %lo(0x0f0f0f0f), local4 + and local3, local1, local3 + sll local3, 16, local2 + + xor $2, local3, local1 + + srl local1, 4, local3 + xor $1, local2, $1 + xor local3, $1, local3 + and local3, local4, local3 + sll local3, 4, local2 + + xor $1, local3, $1 + + ! optional store: + + ifelse($3,1, {st $1, [in0]}) + + xor local1, local2, $2 + + ifelse($3,1, {st $2, [in0+4]}) + +}) + + +! {fp_ip_macro} +! +! Does initial permutation for next block mixed with +! final permutation for current block. +! +! parameter 1 original left +! parameter 2 original right +! parameter 3 left ip +! parameter 4 right ip +! parameter 5 1: load ks1/ks2 to in3/in4, add 120 to in4 +! 2: mov in4 to in3 +! +! also adds -8 to length in2 and loads loop counter to out4 + +define(fp_ip_macro, { + +! {fp_ip_macro} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + define({temp1},{out4}) + define({temp2},{local3}) + + define({ip1},{local1}) + define({ip2},{local2}) + define({ip4},{local4}) + define({ip5},{local5}) + + ! $1 in local3, local4 + + ld [out2+256], ip1 + sll out5, 29, temp1 + or local3, local4, $1 + + srl out5, 3, $2 + ifelse($5,2,{mov in4, in3}) + + ld [out2+272], ip5 + srl $4, 4, local0 + or $2, temp1, $2 + + srl $2, 1, temp1 + xor temp1, $1, temp1 + + and temp1, ip5, temp1 + xor local0, $3, local0 + + sll temp1, 1, temp2 + xor $1, temp1, $1 + + and local0, ip1, local0 + add in2, -8, in2 + + sll local0, 4, local7 + xor $3, local0, $3 + + ld [out2+268], ip4 + srl $1, 8, temp1 + xor $2, temp2, $2 + ld [out2+260], ip2 + srl $3, 16, local0 + xor $4, local7, $4 + xor temp1, $2, temp1 + xor local0, $4, local0 + and temp1, ip4, temp1 + and local0, ip2, local0 + sll temp1, 8, temp2 + xor $2, temp1, $2 + sll local0, 16, local7 + xor $4, local0, $4 + + srl $2, 2, temp1 + xor $1, temp2, $1 + + ld [out2+264], temp2 ! ip3 + srl $4, 2, local0 + xor $3, local7, $3 + xor temp1, $1, temp1 + xor local0, $3, local0 + and temp1, temp2, temp1 + and local0, temp2, local0 + sll temp1, 2, temp2 + xor $1, temp1, $1 + sll local0, 2, local7 + xor $3, local0, $3 + + srl $1, 16, temp1 + xor $2, temp2, $2 + srl $3, 8, local0 + xor $4, local7, $4 + xor temp1, $2, temp1 + xor local0, $4, local0 + and temp1, ip2, temp1 + and local0, ip4, local0 + sll temp1, 16, temp2 + xor $2, temp1, local4 + sll local0, 8, local7 + xor $4, local0, $4 + + srl $4, 1, local0 + xor $3, local7, $3 + + srl local4, 4, temp1 + xor local0, $3, local0 + + xor $1, temp2, $1 + and local0, ip5, local0 + + sll local0, 1, local7 + xor temp1, $1, temp1 + + xor $3, local0, $3 + xor $4, local7, $4 + + sll $3, 3, local5 + and temp1, ip1, temp1 + + sll temp1, 4, temp2 + xor $1, temp1, $1 + + ifelse($5,1,{ld KS2, in4}) + sll $4, 3, local2 + xor local4, temp2, $2 + + ! reload since used as temporar: + + ld [out2+280], out4 ! loop counter + + srl $3, 29, local0 + ifelse($5,1,{add in4, 120, in4}) + + ifelse($5,1,{ld KS1, in3}) + srl $4, 29, local7 + + or local0, local5, $4 + or local2, local7, $3 + +}) + + + +! {load_little_endian} +! +! parameter 1 address +! parameter 2 destination left +! parameter 3 destination right +! parameter 4 temporar +! parameter 5 label + +define(load_little_endian, { + +! {load_little_endian} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + ! first in memory to rightmost in register + + andcc $1, 3, global0 + bne,pn %icc, $5 + nop + + lda [$1] 0x88, $2 + add $1, 4, $4 + + ba,pt %icc, $5a + lda [$4] 0x88, $3 + +$5: + ldub [$1+3], $2 + + ldub [$1+2], $4 + sll $2, 8, $2 + or $2, $4, $2 + + ldub [$1+1], $4 + sll $2, 8, $2 + or $2, $4, $2 + + ldub [$1+0], $4 + sll $2, 8, $2 + or $2, $4, $2 + + + ldub [$1+3+4], $3 + + ldub [$1+2+4], $4 + sll $3, 8, $3 + or $3, $4, $3 + + ldub [$1+1+4], $4 + sll $3, 8, $3 + or $3, $4, $3 + + ldub [$1+0+4], $4 + sll $3, 8, $3 + or $3, $4, $3 +$5a: + +}) + + +! {load_little_endian_inc} +! +! parameter 1 address +! parameter 2 destination left +! parameter 3 destination right +! parameter 4 temporar +! parameter 4 label +! +! adds 8 to address + +define(load_little_endian_inc, { + +! {load_little_endian_inc} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + ! first in memory to rightmost in register + + andcc $1, 3, global0 + bne,pn %icc, $5 + nop + + lda [$1] 0x88, $2 + add $1, 4, $1 + + lda [$1] 0x88, $3 + ba,pt %icc, $5a + add $1, 4, $1 + +$5: + ldub [$1+3], $2 + + ldub [$1+2], $4 + sll $2, 8, $2 + or $2, $4, $2 + + ldub [$1+1], $4 + sll $2, 8, $2 + or $2, $4, $2 + + ldub [$1+0], $4 + sll $2, 8, $2 + or $2, $4, $2 + + ldub [$1+3+4], $3 + add $1, 8, $1 + + ldub [$1+2+4-8], $4 + sll $3, 8, $3 + or $3, $4, $3 + + ldub [$1+1+4-8], $4 + sll $3, 8, $3 + or $3, $4, $3 + + ldub [$1+0+4-8], $4 + sll $3, 8, $3 + or $3, $4, $3 +$5a: + +}) + + +! {load_n_bytes} +! +! Loads 1 to 7 bytes little endian +! Remaining bytes are zeroed. +! +! parameter 1 address +! parameter 2 length +! parameter 3 destination register left +! parameter 4 destination register right +! parameter 5 temp +! parameter 6 temp2 +! parameter 7 label +! parameter 8 return label + +define(load_n_bytes, { + +! {load_n_bytes} +! $1 $2 $5 $6 $7 $8 $7 $8 $9 + + sll $2, 2, $6 + + sethi %hi($7.jmp.table), $5 + or $5, %lo($7.jmp.table), $5 + + add $5, $6, $5 + mov 0, $4 + + ld [$5], $5 + + jmp $5 + mov 0, $3 + +$7.7: + ldub [$1+6], $5 + sll $5, 16, $5 + or $3, $5, $3 +$7.6: + ldub [$1+5], $5 + sll $5, 8, $5 + or $3, $5, $3 +$7.5: + ldub [$1+4], $5 + or $3, $5, $3 +$7.4: + ldub [$1+3], $5 + sll $5, 24, $5 + or $4, $5, $4 +$7.3: + ldub [$1+2], $5 + sll $5, 16, $5 + or $4, $5, $4 +$7.2: + ldub [$1+1], $5 + sll $5, 8, $5 + or $4, $5, $4 +$7.1: + ldub [$1+0], $5 + ba,pt %icc, $8 + or $4, $5, $4 + + .align 4 + +$7.jmp.table: + .word 0 + .word $7.1 + .word $7.2 + .word $7.3 + .word $7.4 + .word $7.5 + .word $7.6 + .word $7.7 +}) + + +! {store_little_endian} +! +! parameter 1 address +! parameter 2 source left +! parameter 3 source right +! parameter 4 temporar + +define(store_little_endian, { + +! {store_little_endian} +! $1 $2 $3 $4 $5 $6 $7 $8 $9 + + ! rightmost in register to first in memory + + andcc $1, 3, global0 + bne,pn %icc, $5 + nop + + sta $2, [$1] 0x88 + add $1, 4, $4 + + ba,pt %icc, $5a + sta $3, [$4] 0x88 + +$5: + and $2, 255, $4 + stub $4, [$1+0] + + srl $2, 8, $4 + and $4, 255, $4 + stub $4, [$1+1] + + srl $2, 16, $4 + and $4, 255, $4 + stub $4, [$1+2] + + srl $2, 24, $4 + stub $4, [$1+3] + + + and $3, 255, $4 + stub $4, [$1+0+4] + + srl $3, 8, $4 + and $4, 255, $4 + stub $4, [$1+1+4] + + srl $3, 16, $4 + and $4, 255, $4 + stub $4, [$1+2+4] + + srl $3, 24, $4 + stub $4, [$1+3+4] + +$5a: + +}) + + +! {store_n_bytes} +! +! Stores 1 to 7 bytes little endian +! +! parameter 1 address +! parameter 2 length +! parameter 3 source register left +! parameter 4 source register right +! parameter 5 temp +! parameter 6 temp2 +! parameter 7 label +! parameter 8 return label + +define(store_n_bytes, { + +! {store_n_bytes} +! $1 $2 $5 $6 $7 $8 $7 $8 $9 + + sll $2, 2, $6 + + sethi %hi($7.jmp.table), $5 + or $5, %lo($7.jmp.table), $5 + + add $5, $6, $5 + + ld [$5], $5 + jmp $5 + nop + +$7.7: + srl $3, 16, $5 + and $5, 0xff, $5 + stub $5, [$1+6] +$7.6: + srl $3, 8, $5 + and $5, 0xff, $5 + stub $5, [$1+5] +$7.5: + and $3, 0xff, $5 + stub $5, [$1+4] +$7.4: + srl $4, 24, $5 + stub $5, [$1+3] +$7.3: + srl $4, 16, $5 + and $5, 0xff, $5 + stub $5, [$1+2] +$7.2: + srl $4, 8, $5 + and $5, 0xff, $5 + stub $5, [$1+1] +$7.1: + and $4, 0xff, $5 + + + ba,pt %icc, $8 + stub $5, [$1] + + .align 4 + +$7.jmp.table: + + .word 0 + .word $7.1 + .word $7.2 + .word $7.3 + .word $7.4 + .word $7.5 + .word $7.6 + .word $7.7 +}) + + +define(testvalue,{1}) + +define(register_init, { + +! For test purposes: + + sethi %hi(testvalue), local0 + or local0, %lo(testvalue), local0 + + ifelse($1,{},{}, {mov local0, $1}) + ifelse($2,{},{}, {mov local0, $2}) + ifelse($3,{},{}, {mov local0, $3}) + ifelse($4,{},{}, {mov local0, $4}) + ifelse($5,{},{}, {mov local0, $5}) + ifelse($6,{},{}, {mov local0, $6}) + ifelse($7,{},{}, {mov local0, $7}) + ifelse($8,{},{}, {mov local0, $8}) + + mov local0, local1 + mov local0, local2 + mov local0, local3 + mov local0, local4 + mov local0, local5 + mov local0, local7 + mov local0, local6 + mov local0, out0 + mov local0, out1 + mov local0, out2 + mov local0, out3 + mov local0, out4 + mov local0, out5 + mov local0, global1 + mov local0, global2 + mov local0, global3 + mov local0, global4 + mov local0, global5 + +}) + + .global .des_and +.section ".rodata" + .align 8 + .type .des_and,#object + .size .des_and,284 + +.des_and: + +! This table is used for AND 0xFC when it is known that register +! bits 8-31 are zero. Makes it possible to do three arithmetic +! operations in one cycle. + + .byte 0, 0, 0, 0, 4, 4, 4, 4 + .byte 8, 8, 8, 8, 12, 12, 12, 12 + .byte 16, 16, 16, 16, 20, 20, 20, 20 + .byte 24, 24, 24, 24, 28, 28, 28, 28 + .byte 32, 32, 32, 32, 36, 36, 36, 36 + .byte 40, 40, 40, 40, 44, 44, 44, 44 + .byte 48, 48, 48, 48, 52, 52, 52, 52 + .byte 56, 56, 56, 56, 60, 60, 60, 60 + .byte 64, 64, 64, 64, 68, 68, 68, 68 + .byte 72, 72, 72, 72, 76, 76, 76, 76 + .byte 80, 80, 80, 80, 84, 84, 84, 84 + .byte 88, 88, 88, 88, 92, 92, 92, 92 + .byte 96, 96, 96, 96, 100, 100, 100, 100 + .byte 104, 104, 104, 104, 108, 108, 108, 108 + .byte 112, 112, 112, 112, 116, 116, 116, 116 + .byte 120, 120, 120, 120, 124, 124, 124, 124 + .byte 128, 128, 128, 128, 132, 132, 132, 132 + .byte 136, 136, 136, 136, 140, 140, 140, 140 + .byte 144, 144, 144, 144, 148, 148, 148, 148 + .byte 152, 152, 152, 152, 156, 156, 156, 156 + .byte 160, 160, 160, 160, 164, 164, 164, 164 + .byte 168, 168, 168, 168, 172, 172, 172, 172 + .byte 176, 176, 176, 176, 180, 180, 180, 180 + .byte 184, 184, 184, 184, 188, 188, 188, 188 + .byte 192, 192, 192, 192, 196, 196, 196, 196 + .byte 200, 200, 200, 200, 204, 204, 204, 204 + .byte 208, 208, 208, 208, 212, 212, 212, 212 + .byte 216, 216, 216, 216, 220, 220, 220, 220 + .byte 224, 224, 224, 224, 228, 228, 228, 228 + .byte 232, 232, 232, 232, 236, 236, 236, 236 + .byte 240, 240, 240, 240, 244, 244, 244, 244 + .byte 248, 248, 248, 248, 252, 252, 252, 252 + + ! 5 numbers for initil/final permutation + + .word 0x0f0f0f0f ! offset 256 + .word 0x0000ffff ! 260 + .word 0x33333333 ! 264 + .word 0x00ff00ff ! 268 + .word 0x55555555 ! 272 + + .word 0 ! 276 + .word LOOPS ! 280 + .word 0x0000FC00 ! 284 + + +.section ".text" + + .align 32 + +.des_enc: + + ! key address in3 + ! loads key next encryption/decryption first round from [in4] + + rounds_macro(in5, out5, 1, .des_enc.1, in3, in4, retl) + + + .align 32 + +.des_dec: + + ! implemented with out5 as first parameter to avoid + ! register exchange in ede modes + + ! key address in4 + ! loads key next encryption/decryption first round from [in3] + + rounds_macro(out5, in5, -1, .des_dec.1, in4, in3, retl) + + + +! void des_encrypt(data, ks, enc) +! ******************************* + + .align 32 + .global des_encrypt + .type des_encrypt,#function + +des_encrypt: + + save %sp, -96, %sp + + ld [in0], in5 ! left + sethi %hi(.des_and), out2 ! address constants + cmp in2, 0 ! enc + + ld [in0+4], out5 ! right + be,pn %icc, .encrypt.dec ! enc/dec + or out2, %lo(.des_and), out2 ! address constants + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for move in1 to in3 + ! parameter 8 1 for move in3 to in4, 2 for move in4 to in3 + + ip_macro(in5, out5, in5, out5, in3, 0, 1, 1) + + rounds_macro(in5, out5, 1, .des_encrypt.1, in3, in4) ! in4 not used + + fp_macro(in5, out5, 1) ! 1 for store to [in0] + + return in7+8 + nop + +.encrypt.dec: + + add in1, 120, in3 ! use last subkey for first round + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for move in1 to in3 + ! parameter 8 1 for move in3 to in4, 2 for move in4 to in3 + + ip_macro(in5, out5, out5, in5, in4, 2, 0, 1) ! include dec, ks in4 + + fp_macro(out5, in5, 1) ! 1 for store to [in0] + + return in7+8 + nop + +.des_encrypt.end: + .size des_encrypt,.des_encrypt.end-des_encrypt + + +! void des_encrypt2(data, ks, enc) +!********************************* + + ! encrypts/decrypts without initial/final permutation + + .align 32 + .global des_encrypt2 + .type des_encrypt2,#function + +des_encrypt2: + + save %sp, -112, %sp + + ! Set sbox address 1 to 6 and rotate halfs 3 left + ! Errors caught by destest? Yes. Still? *NO* + + sethi %hi(des_SPtrans), global1 ! address sbox 1 + sethi %hi(.des_and), out2 ! address constants + + or global1, %lo(des_SPtrans), global1 ! sbox 1 + or out2, %lo(.des_and), out2 ! adress constants + + add global1, 256, global2 ! sbox 2 + add global1, 512, global3 ! sbox 3 + + ld [in0], out5 ! right + add global1, 768, global4 ! sbox 4 + add global1, 1024, global5 ! sbox 5 + + ld [in0+4], in5 ! left + add global1, 1280, local6 ! sbox 6 + add global1, 1792, out3 ! sbox 8 + + ! rotate + + sll in5, 3, local5 + mov in1, in3 ! key address to in3 + + sll out5, 3, local7 + srl in5, 29, in5 + + srl out5, 29, out5 + add in5, local5, in5 + + add out5, local7, out5 + cmp in2, 0 + + ! we use our own stackframe + + be,pn %icc, .encrypt2.dec ! decryption + st in0, [%sp+68] + + ld [in3], out0 ! key 7531 first round + mov LOOPS, out4 ! loop counter + + ld [in3+4], out1 ! key 8642 first round + sethi %hi(0x0000FC00), local5 + + call .des_enc + mov in3, in4 + + ! rotate + sll in5, 29, in0 + srl in5, 3, in5 + sll out5, 29, in1 + add in5, in0, in5 + srl out5, 3, out5 + ld [%sp+68], in0 + add out5, in1, out5 + st in5, [in0] + st out5, [in0+4] + + return in7+8 + nop + + +.encrypt2.dec: + + add in3, 120, in4 + + ld [in4], out0 ! key 7531 first round + mov LOOPS, out4 ! loop counter + + ld [in4+4], out1 ! key 8642 first round + sethi %hi(0x0000FC00), local5 + + mov in5, local1 ! left expected in out5 + mov out5, in5 + + call .des_dec + mov local1, out5 + +.encrypt2.finish: + + ! rotate + sll in5, 29, in0 + srl in5, 3, in5 + sll out5, 29, in1 + add in5, in0, in5 + srl out5, 3, out5 + ld [%sp+68], in0 + add out5, in1, out5 + st out5, [in0] + st in5, [in0+4] + + return in7+8 + nop + +.des_encrypt2.end: + .size des_encrypt2, .des_encrypt2.end-des_encrypt2 + + +! void des_encrypt3(data, ks1, ks2, ks3) +! ************************************** + + .align 32 + .global des_encrypt3 + .type des_encrypt3,#function + +des_encrypt3: + + save %sp, -96, %sp + + ld [in0], in5 ! left + add in2, 120, in4 ! ks2 + sethi %hi(.des_and), out2 ! address constants + + ld [in0+4], out5 ! right + mov in3, in2 ! save ks3 + or out2, %lo(.des_and), out2 ! address constants + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for mov in1 to in3 + ! parameter 8 1 for mov in3 to in4 + ! parameter 9 1 for load ks3 and ks2 to in4 and in3 + + ip_macro(in5, out5, in5, out5, in3, 1, 1, 0, 0) + + call .des_dec + mov in2, in3 ! preload ks3 + + call .des_enc + nop + + fp_macro(in5, out5, 1) + + return in7+8 + nop + +.des_encrypt3.end: + .size des_encrypt3,.des_encrypt3.end-des_encrypt3 + + +! void des_decrypt3(data, ks1, ks2, ks3) +! ************************************** + + .align 32 + .global des_decrypt3 + .type des_decrypt3,#function + +des_decrypt3: + + save %sp, -96, %sp + + ld [in0], in5 ! left + add in3, 120, in4 ! ks3 + sethi %hi(.des_and), out2 + + ld [in0+4], out5 ! right + mov in2, in3 ! ks2 + or out2, %lo(.des_and), out2 + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for mov in1 to in3 + ! parameter 8 1 for mov in3 to in4 + ! parameter 9 1 for load ks3 and ks2 to in4 and in3 + + ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 0) + + call .des_enc + add in1, 120, in4 ! preload ks1 + + call .des_dec + nop + + fp_macro(out5, in5, 1) + + return in7+8 + nop + +.des_decrypt3.end: + .size des_decrypt3,.des_decrypt3.end-des_decrypt3 + + + +! void des_ncbc_encrypt(input, output, length, schedule, ivec, enc) +! ***************************************************************** + + + .align 32 + .global des_ncbc_encrypt + .type des_ncbc_encrypt,#function + +des_ncbc_encrypt: + + save %sp, -96, %sp + + define({INPUT}, { [%sp+68] }) + define({OUTPUT}, { [%sp+72] }) + define({IVEC}, { [%sp+84] }) + + cmp in5, 0 ! enc + + sethi %hi(.des_and), out2 ! address constants + be,pn %icc, .ncbc.dec + st in4, IVEC + + ! addr left right temp label + load_little_endian(in4, in5, out5, local3, .LLE1) ! iv + + addcc in2, -8, in2 ! bytes missing when first block done + + mov in3, in4 ! schedule + bl,pn %icc, .ncbc.enc.seven.or.less + or out2, %lo(.des_and), out2 + +.ncbc.enc.next.block: + + load_little_endian(in0, out4, global4, local3, .LLE2) ! block + +.ncbc.enc.next.block_1: + + xor in5, out4, in5 ! iv xor + xor out5, global4, out5 ! iv xor + + ! parameter 8 1 for move in3 to in4, 2 for move in4 to in3 + ip_macro(in5, out5, in5, out5, in3, 0, 0, 2) + +.ncbc.enc.next.block_2: + +!// call .des_enc ! compares in2 to 8 +! rounds inlined for alignment purposes + + add global1, 768, global4 ! address sbox 4 since register used below + + rounds_macro(in5, out5, 1, .ncbc.enc.1, in3, in4) ! include encryption ks in3 + + bl,pn %icc, .ncbc.enc.next.block_fp + add in0, 8, in0 ! input address + + ! If 8 or more bytes are to be encrypted after this block, + ! we combine final permutation for this block with initial + ! permutation for next block. Load next block: + + load_little_endian(in0, global3, global4, local5, .LLE12) + + ! parameter 1 original left + ! parameter 2 original right + ! parameter 3 left ip + ! parameter 4 right ip + ! parameter 5 1: load ks1/ks2 to in3/in4, add 120 to in4 + ! 2: mov in4 to in3 + ! + ! also adds -8 to length in2 and loads loop counter to out4 + + fp_ip_macro(out0, out1, global3, global4, 2) + + store_little_endian(in1, out0, out1, local3, .SLE10) ! block + + ld [in3], out0 ! key 7531 first round next block + mov in5, local1 + xor global3, out5, in5 ! iv xor next block + + ld [in3+4], out1 ! key 8642 + add global1, 512, global3 ! address sbox 3 since register used + xor global4, local1, out5 ! iv xor next block + + ba,pt %icc, .ncbc.enc.next.block_2 + add in1, 8, in1 ! output adress + +.ncbc.enc.next.block_fp: + + fp_macro(in5, out5) + + store_little_endian(in1, in5, out5, local3, .SLE1) ! block + + addcc in2, -8, in2 ! bytes missing when next block done + + bpos,pt %icc, .ncbc.enc.next.block ! also jumps if 0 + add in1, 8, in1 + +.ncbc.enc.seven.or.less: + + cmp in2, -8 + + ble,pt %icc, .ncbc.enc.finish + nop + + add in2, 8, local1 ! bytes to load + + ! addr, length, dest left, dest right, temp, temp2, label, ret label + load_n_bytes(in0, local1, global4, out4, local2, local3, .LNB1, .ncbc.enc.next.block_1) + + ! Loads 1 to 7 bytes little endian to global4, out4 + + +.ncbc.enc.finish: + + ld IVEC, local4 + store_little_endian(local4, in5, out5, local5, .SLE2) ! ivec + + return in7+8 + nop + + +.ncbc.dec: + + st in0, INPUT + cmp in2, 0 ! length + add in3, 120, in3 + + ld IVEC, local7 ! ivec + ble,pn %icc, .ncbc.dec.finish + mov in3, in4 ! schedule + + st in1, OUTPUT + or out2, %lo(.des_and), out2 ! address constants low part + mov in0, local5 ! input + + load_little_endian(local7, in0, in1, local3, .LLE3) ! ivec + +.ncbc.dec.next.block: + + load_little_endian(local5, in5, out5, local3, .LLE4) ! block + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for mov in1 to in3 + ! parameter 8 1 for mov in3 to in4 + + ip_macro(in5, out5, out5, in5, in4, 2, 0, 1) ! include decryprion ks in4 + + fp_macro(out5, in5, 0, 1) ! 1 for input and output address to local5/7 + + ! in2 is bytes left to be stored + ! in2 is compared to 8 in the rounds + + xor out5, in0, out4 ! iv xor + bl,pn %icc, .ncbc.dec.seven.or.less + xor in5, in1, global4 ! iv xor + + ! Load ivec next block now, since input and output address might be the same. + + load_little_endian_inc(local5, in0, in1, local3, .LLE5) ! iv + + store_little_endian(local7, out4, global4, local3, .SLE3) + + st local5, INPUT + add local7, 8, local7 + addcc in2, -8, in2 + + bg,pt %icc, .ncbc.dec.next.block + st local7, OUTPUT + + +.ncbc.dec.store.iv: + + ld IVEC, local4 ! ivec + store_little_endian(local4, in0, in1, local5, .SLE4) + +.ncbc.dec.finish: + + return in7+8 + nop + +.ncbc.dec.seven.or.less: + + load_little_endian_inc(local5, in0, in1, local3, .LLE13) ! ivec + + store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB1, .ncbc.dec.store.iv) + + +.des_ncbc_encrypt.end: + .size des_ncbc_encrypt, .des_ncbc_encrypt.end-des_ncbc_encrypt + + +! void des_ede3_cbc_encrypt(input, output, lenght, ks1, ks2, ks3, ivec, enc) +! ************************************************************************** + + + .align 32 + .global des_ede3_cbc_encrypt + .type des_ede3_cbc_encrypt,#function + +des_ede3_cbc_encrypt: + + save %sp, -96, %sp + + define({LENGTH},{ [%sp+76] }) + define({KS1}, { [%sp+80] }) + define({KS2}, { [%sp+84] }) + define({KS3}, { [%sp+88] }) + + ld [%fp+96], local3 ! enc + sethi %hi(.des_and), out2 + + ld [%fp+92], local4 ! ivec + or out2, %lo(.des_and), out2 + cmp local3, 0 ! enc + + be,pn %icc, .ede3.dec + st in4, KS2 + + st in5, KS3 + + load_little_endian(local4, in5, out5, local3, .LLE6) ! ivec + + addcc in2, -8, in2 ! bytes missing after next block + + bl,pn %icc, .ede3.enc.seven.or.less + st in3, KS1 + +.ede3.enc.next.block: + + load_little_endian(in0, out4, global4, local3, .LLE7) + +.ede3.enc.next.block_1: + + ld KS2, in4 + xor in5, out4, in5 ! iv xor + xor out5, global4, out5 ! iv xor + + ld KS1, in3 + add in4, 120, in4 ! for decryption we use last subkey first + nop + + ip_macro(in5, out5, in5, out5, in3) + +.ede3.enc.next.block_2: + + call .des_enc ! ks1 in3 + nop + + call .des_dec ! ks2 in4 + ld KS3, in3 + + call .des_enc ! ks3 in3 compares in2 to 8 + nop + + bl,pn %icc, .ede3.enc.next.block_fp + add in0, 8, in0 + + ! If 8 or more bytes are to be encrypted after this block, + ! we combine final permutation for this block with initial + ! permutation for next block. Load next block: + + load_little_endian(in0, global3, global4, local5, .LLE11) + + ! parameter 1 original left + ! parameter 2 original right + ! parameter 3 left ip + ! parameter 4 right ip + ! parameter 5 1: load ks1/ks2 to in3/in4, add 120 to in4 + ! 2: mov in4 to in3 + ! + ! also adds -8 to length in2 and loads loop counter to out4 + + fp_ip_macro(out0, out1, global3, global4, 1) + + store_little_endian(in1, out0, out1, local3, .SLE9) ! block + + mov in5, local1 + xor global3, out5, in5 ! iv xor next block + + ld [in3], out0 ! key 7531 + add global1, 512, global3 ! address sbox 3 + xor global4, local1, out5 ! iv xor next block + + ld [in3+4], out1 ! key 8642 + add global1, 768, global4 ! address sbox 4 + ba,pt %icc, .ede3.enc.next.block_2 + add in1, 8, in1 + +.ede3.enc.next.block_fp: + + fp_macro(in5, out5) + + store_little_endian(in1, in5, out5, local3, .SLE5) ! block + + addcc in2, -8, in2 ! bytes missing when next block done + + bpos,pt %icc, .ede3.enc.next.block + add in1, 8, in1 + +.ede3.enc.seven.or.less: + + cmp in2, -8 + + ble,pt %icc, .ede3.enc.finish + nop + + add in2, 8, local1 ! bytes to load + + ! addr, length, dest left, dest right, temp, temp2, label, ret label + load_n_bytes(in0, local1, global4, out4, local2, local3, .LNB2, .ede3.enc.next.block_1) + +.ede3.enc.finish: + + ld [%fp+92], local4 ! ivec + store_little_endian(local4, in5, out5, local5, .SLE6) ! ivec + + return in7+8 + nop + + +.ede3.dec: + + st in0, INPUT + add in5, 120, in5 + + st in1, OUTPUT + mov in0, local5 + add in3, 120, in3 + + st in3, KS1 + cmp in2, 0 + + ble %icc, .ede3.dec.finish + st in5, KS3 + + ld [%fp+92], local7 ! iv + load_little_endian(local7, in0, in1, local3, .LLE8) + +.ede3.dec.next.block: + + load_little_endian(local5, in5, out5, local3, .LLE9) + + ! parameter 6 1/2 for include encryption/decryption + ! parameter 7 1 for mov in1 to in3 + ! parameter 8 1 for mov in3 to in4 + ! parameter 9 1 for load ks3 and ks2 to in4 and in3 + + ip_macro(in5, out5, out5, in5, in4, 2, 0, 0, 1) ! inc .des_dec ks3 in4 + + call .des_enc ! ks2 in3 + ld KS1, in4 + + call .des_dec ! ks1 in4 + nop + + fp_macro(out5, in5, 0, 1) ! 1 for input and output address local5/7 + + ! in2 is bytes left to be stored + ! in2 is compared to 8 in the rounds + + xor out5, in0, out4 + bl,pn %icc, .ede3.dec.seven.or.less + xor in5, in1, global4 + + load_little_endian_inc(local5, in0, in1, local3, .LLE10) ! iv next block + + store_little_endian(local7, out4, global4, local3, .SLE7) ! block + + st local5, INPUT + addcc in2, -8, in2 + add local7, 8, local7 + + bg,pt %icc, .ede3.dec.next.block + st local7, OUTPUT + +.ede3.dec.store.iv: + + ld [%fp+92], local4 ! ivec + store_little_endian(local4, in0, in1, local5, .SLE8) ! ivec + +.ede3.dec.finish: + + return in7+8 + nop + + +.ede3.dec.seven.or.less: + + load_little_endian_inc(local5, in0, in1, local3, .LLE14) ! iv + + store_n_bytes(local7, in2, global4, out4, local3, local4, .SNB2, .ede3.dec.store.iv) + + +.des_ede3_cbc_encrypt.end: + .size des_ede3_cbc_encrypt,.des_ede3_cbc_encrypt.end-des_ede3_cbc_encrypt