#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# [Endian-neutral] AES for C64x+.
#
# Even though SPLOOPs are scheduled for 13 cycles, and thus expected
# performance is ~8.5 cycles per byte processed with 128-bit key,
# measured performance turned out to be ~10 cycles per byte. The
# discrepancy must be caused by limitations of L1D memory banking(*),
# see SPRU871 TI publication for further details. If it's any
# consolation, it's still ~20% faster than TI's linear assembly module
# anyway... Compared to aes_core.c compiled with cl6x 6.0 with -mv6400+
# -o2 options this code is 3.75x faster and almost 3x smaller (tables
# included).
#
# (*)	This means that there might be subtle correlation between data
#	and timing and one can wonder if it can be ... attacked:-(
#	On the other hand this also means that *if* one chooses to
#	implement *4* T-tables variant [instead of 1 T-table as in
#	this implementation, or in addition to], then one ought to
#	*interleave* them. Even though it complicates addressing,
#	references to interleaved tables would be guaranteed not to
#	clash. I reckon that it should be possible to break 8 cycles
#	per byte "barrier," i.e. improve by ~20%, naturally at the
#	cost of 8x increased pressure on L1D. 8x because you'd have
#	to interleave both Te and Td tables...

# Consume command-line arguments until one looks like a file name with an
# extension ("name.ext"); that argument becomes the output file.  If no
# argument qualifies, $output is left false and the generated code goes to
# the inherited STDOUT.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the chosen file.  The three-argument form keeps
# characters in the name from being interpreted as an open mode, and a
# failure is fatal instead of silently falling back to the inherited
# STDOUT.
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Register allocation interpolated into the assembly templates below.
# Names come in A-side/B-side pairs so that both register files can be
# used in parallel.
$TEA = "A5";				# lookup table base, A side
$TEB = "B5";				# lookup table base, B side
$KPA = "A3";				# key schedule pointer, A side
$KPB = "B1";				# key schedule pointer, B side
@K   = qw(A6 B6 A7 B7);			# round key words
@s   = qw(A8 B8 A9 B9);			# state words

# Scratch registers for table look-ups.  Encryption (Te*) and decryption
# (Td*) use the very same registers.
@Td0 = qw(A16 B16 A17 B17);	@Te0 = @Td0;
@Td1 = qw(A18 B18 A19 B19);	@Te1 = @Td1;
@Td2 = qw(A20 B20 A21 B21);	@Te2 = @Td2;
@Td3 = qw(A22 B22 A23 B23);	@Te3 = @Td3;

# Assembly template for AES_encrypt and AES_decrypt.
#
# The template starts with assembler-level set-up: symbol aliases for the
# __TI_EABI__ case, register aliases (RA, INP, OUT, KEY, RET, SP) and the
# EXTn/TBLn constants used as EXTU/ROTL operands.  Under .BIG_ENDIAN these
# constants are mirrored (24-EXTn, 32-TBLn), which is how one instruction
# stream serves both byte orders.
#
# Each routine has two entry points.  The public one sets B2 to 1 and falls
# through to __encrypt/__decrypt, where the [B2]-predicated LDNDW/STNDW
# pairs read the block from INP and write the result to OUT.  Entering at
# __encrypt with B2 cleared (as AES_ctr32_encrypt does) skips both, so the
# block is taken from and left in A9:A8/B9:B8.
#
# Inside a routine: the table address is formed PC-relatively (ADDKPC plus
# MVKL/MVKH offset), $KPA/$KPB walk the key schedule two words at a time
# starting at KEY+0 and KEY+4, the round count is read from word 60 of the
# key schedule, and ILC is loaded with rounds-2 for the SPLOOPD 13 loop.
# The code following SPKERNEL finishes the block: it advances the table
# pointers by 1024 bytes (commented as Te4/Td4), does byte-wide LDBU
# look-ups, reassembles words with PACK2/PACKL4, XORs them with the last
# loaded round key words and returns through RA.
$code=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.nocmp
	.asg	AES_encrypt,_AES_encrypt
	.asg	AES_decrypt,_AES_decrypt
	.asg	AES_set_encrypt_key,_AES_set_encrypt_key
	.asg	AES_set_decrypt_key,_AES_set_decrypt_key
	.asg	AES_ctr32_encrypt,_AES_ctr32_encrypt
	.endif

	.asg	B3,RA
	.asg	A4,INP
	.asg	B4,OUT
	.asg	A6,KEY
	.asg	A4,RET
	.asg	B15,SP

	.eval	24,EXT0
	.eval	16,EXT1
	.eval	8,EXT2
	.eval	0,EXT3
	.eval	8,TBL1
	.eval	16,TBL2
	.eval	24,TBL3

	.if	.BIG_ENDIAN
	.eval	24-EXT0,EXT0
	.eval	24-EXT1,EXT1
	.eval	24-EXT2,EXT2
	.eval	24-EXT3,EXT3
	.eval	32-TBL1,TBL1
	.eval	32-TBL2,TBL2
	.eval	32-TBL3,TBL3
	.endif

	.global	_AES_encrypt
_AES_encrypt:
	.asmfunc
	MVK	1,B2
__encrypt:
	.if	__TI_EABI__
   [B2]	LDNDW	*INP++,A9:A8			; load input
||	MVKL	\$PCR_OFFSET(AES_Te,__encrypt),$TEA
||	ADDKPC	__encrypt,B0
   [B2]	LDNDW	*INP++,B9:B8
||	MVKH	\$PCR_OFFSET(AES_Te,__encrypt),$TEA
||	ADD	0,KEY,$KPA
||	ADD	4,KEY,$KPB
	.else
   [B2]	LDNDW	*INP++,A9:A8			; load input
||	MVKL	(AES_Te-__encrypt),$TEA
||	ADDKPC	__encrypt,B0
   [B2]	LDNDW	*INP++,B9:B8
||	MVKH	(AES_Te-__encrypt),$TEA
||	ADD	0,KEY,$KPA
||	ADD	4,KEY,$KPB
	.endif
	LDW	*$KPA++[2],$Te0[0]		; zero round key
||	LDW	*$KPB++[2],$Te0[1]
||	MVK	60,A0
||	ADD	B0,$TEA,$TEA			; AES_Te
	LDW	*KEY[A0],B0			; rounds
||	MVK	1024,A0				; sizeof(AES_Te)
	LDW	*$KPA++[2],$Te0[2]
||	LDW	*$KPB++[2],$Te0[3]
||	MV	$TEA,$TEB
	NOP
	.if	.BIG_ENDIAN
	MV	A9,$s[0]
||	MV	A8,$s[1]
||	MV	B9,$s[2]
||	MV	B8,$s[3]
	.else
	MV	A8,$s[0]
||	MV	A9,$s[1]
||	MV	B8,$s[2]
||	MV	B9,$s[3]
	.endif
	XOR	$Te0[0],$s[0],$s[0]
||	XOR	$Te0[1],$s[1],$s[1]
||	LDW	*$KPA++[2],$K[0]		; 1st round key
||	LDW	*$KPB++[2],$K[1]
	SUB	B0,2,B0

	SPLOOPD	13
||	MVC	B0,ILC
||	LDW	*$KPA++[2],$K[2]
||	LDW	*$KPB++[2],$K[3]
;;====================================================================
	EXTU	$s[1],EXT1,24,$Te1[1]
||	EXTU	$s[0],EXT3,24,$Te3[0]
	LDW	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],	t0
||	LDW	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],	t1
||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
||	XOR	$s[3],$Te0[3],$s[3]		; modulo-scheduled
||	EXTU	$s[1],EXT3,24,$Te3[1]
||	EXTU	$s[0],EXT1,24,$Te1[0]
	LDW	*${TEB}[$Te3[1]],$Te3[1]	; Te3[s1>>24],	t2
||	LDW	*${TEA}[$Te1[0]],$Te1[0]	; Te1[s0>>8],	t3
||	EXTU	$s[2],EXT2,24,$Te2[2]
||	EXTU	$s[3],EXT2,24,$Te2[3]
	LDW	*${TEA}[$Te2[2]],$Te2[2]	; Te2[s2>>16],	t0
||	LDW	*${TEB}[$Te2[3]],$Te2[3]	; Te2[s3>>16],	t1
||	EXTU	$s[3],EXT3,24,$Te3[3]
||	EXTU	$s[2],EXT1,24,$Te1[2]
	LDW	*${TEB}[$Te3[3]],$Te3[3]	; Te3[s3>>24],	t0
||	LDW	*${TEA}[$Te1[2]],$Te1[2]	; Te1[s2>>8],	t1
||	EXTU	$s[0],EXT2,24,$Te2[0]
||	EXTU	$s[1],EXT2,24,$Te2[1]
	LDW	*${TEA}[$Te2[0]],$Te2[0]	; Te2[s0>>16],	t2
||	LDW	*${TEB}[$Te2[1]],$Te2[1]	; Te2[s1>>16],	t3
||	EXTU	$s[3],EXT1,24,$Te1[3]
||	EXTU	$s[2],EXT3,24,$Te3[2]
	LDW	*${TEB}[$Te1[3]],$Te1[3]	; Te1[s3>>8],	t2
||	LDW	*${TEA}[$Te3[2]],$Te3[2]	; Te3[s2>>24],	t3
||	ROTL	$Te1[1],TBL1,$Te3[0]		; t0
||	ROTL	$Te3[0],TBL3,$Te1[1]		; t1
||	EXTU	$s[0],EXT0,24,$Te0[0]
||	EXTU	$s[1],EXT0,24,$Te0[1]
	LDW	*${TEA}[$Te0[0]],$Te0[0]	; Te0[s0],	t0
||	LDW	*${TEB}[$Te0[1]],$Te0[1]	; Te0[s1],	t1
||	ROTL	$Te3[1],TBL3,$Te1[0]		; t2
||	ROTL	$Te1[0],TBL1,$Te3[1]		; t3
||	EXTU	$s[2],EXT0,24,$Te0[2]
||	EXTU	$s[3],EXT0,24,$Te0[3]
	LDW	*${TEA}[$Te0[2]],$Te0[2]	; Te0[s2],	t2
||	LDW	*${TEB}[$Te0[3]],$Te0[3]	; Te0[s3],	t3
||	ROTL	$Te2[2],TBL2,$Te2[2]		; t0
||	ROTL	$Te2[3],TBL2,$Te2[3]		; t1
||	XOR	$K[0],$Te3[0],$s[0]
||	XOR	$K[1],$Te1[1],$s[1]
	ROTL	$Te3[3],TBL3,$Te1[2]		; t0
||	ROTL	$Te1[2],TBL1,$Te3[3]		; t1
||	XOR	$K[2],$Te1[0],$s[2]
||	XOR	$K[3],$Te3[1],$s[3]
||	LDW	*$KPA++[2],$K[0]		; next round key
||	LDW	*$KPB++[2],$K[1]
	ROTL	$Te2[0],TBL2,$Te2[0]		; t2
||	ROTL	$Te2[1],TBL2,$Te2[1]		; t3
||	XOR	$s[0],$Te2[2],$s[0]
||	XOR	$s[1],$Te2[3],$s[1]
||	LDW	*$KPA++[2],$K[2]
||	LDW	*$KPB++[2],$K[3]
	ROTL	$Te1[3],TBL1,$Te3[2]		; t2
||	ROTL	$Te3[2],TBL3,$Te1[3]		; t3
||	XOR	$s[0],$Te1[2],$s[0]
||	XOR	$s[1],$Te3[3],$s[1]
	XOR	$s[2],$Te2[0],$s[2]
||	XOR	$s[3],$Te2[1],$s[3]
||	XOR	$s[0],$Te0[0],$s[0]
||	XOR	$s[1],$Te0[1],$s[1]
	SPKERNEL
||	XOR.L	$s[2],$Te3[2],$s[2]
||	XOR.L	$s[3],$Te1[3],$s[3]
;;====================================================================
	ADD.D	${TEA},A0,${TEA}		; point to Te4
||	ADD.D	${TEB},A0,${TEB}
||	EXTU	$s[1],EXT1,24,$Te1[1]
||	EXTU	$s[0],EXT3,24,$Te3[0]
	LDBU	*${TEB}[$Te1[1]],$Te1[1]	; Te1[s1>>8],	t0
||	LDBU	*${TEA}[$Te3[0]],$Te3[0]	; Te3[s0>>24],	t1
||	XOR	$s[2],$Te0[2],$s[2]		; modulo-scheduled
||	XOR	$s[3],$Te0[3],$s[3]		; modulo-scheduled
||	EXTU	$s[0],EXT0,24,$Te0[0]
||	EXTU	$s[1],EXT0,24,$Te0[1]
	LDBU	*${TEA}[$Te0[0]],$Te0[0]	; Te0[s0],	t0
||	LDBU	*${TEB}[$Te0[1]],$Te0[1]	; Te0[s1],	t1
||	EXTU	$s[3],EXT3,24,$Te3[3]
||	EXTU	$s[2],EXT1,24,$Te1[2]
	LDBU	*${TEB}[$Te3[3]],$Te3[3]	; Te3[s3>>24],	t0
||	LDBU	*${TEA}[$Te1[2]],$Te1[2]	; Te1[s2>>8],	t1
||	EXTU	$s[2],EXT2,24,$Te2[2]
||	EXTU	$s[3],EXT2,24,$Te2[3]
	LDBU	*${TEA}[$Te2[2]],$Te2[2]	; Te2[s2>>16],	t0
||	LDBU	*${TEB}[$Te2[3]],$Te2[3]	; Te2[s3>>16],	t1
||	EXTU	$s[1],EXT3,24,$Te3[1]
||	EXTU	$s[0],EXT1,24,$Te1[0]
	LDBU	*${TEB}[$Te3[1]],$Te3[1]	; Te3[s1>>24],	t2
||	LDBU	*${TEA}[$Te1[0]],$Te1[0]	; Te1[s0>>8],	t3
||	EXTU	$s[3],EXT1,24,$Te1[3]
||	EXTU	$s[2],EXT3,24,$Te3[2]
	LDBU	*${TEB}[$Te1[3]],$Te1[3]	; Te1[s3>>8],	t2
||	LDBU	*${TEA}[$Te3[2]],$Te3[2]	; Te3[s2>>24],	t3
||	EXTU	$s[2],EXT0,24,$Te0[2]
||	EXTU	$s[3],EXT0,24,$Te0[3]
	LDBU	*${TEA}[$Te0[2]],$Te0[2]	; Te0[s2],	t2
||	LDBU	*${TEB}[$Te0[3]],$Te0[3]	; Te0[s3],	t3
||	EXTU	$s[0],EXT2,24,$Te2[0]
||	EXTU	$s[1],EXT2,24,$Te2[1]
	LDBU	*${TEA}[$Te2[0]],$Te2[0]	; Te2[s0>>16],	t2
||	LDBU	*${TEB}[$Te2[1]],$Te2[1]	; Te2[s1>>16],	t3

	.if	.BIG_ENDIAN
	PACK2	$Te0[0],$Te1[1],$Te0[0]
||	PACK2	$Te0[1],$Te1[2],$Te0[1]
	PACK2	$Te2[2],$Te3[3],$Te2[2]
||	PACK2	$Te2[3],$Te3[0],$Te2[3]
	PACKL4	$Te0[0],$Te2[2],$Te0[0]
||	PACKL4	$Te0[1],$Te2[3],$Te0[1]
	XOR	$K[0],$Te0[0],$Te0[0]		; s[0]
||	XOR	$K[1],$Te0[1],$Te0[1]		; s[1]

	PACK2	$Te0[2],$Te1[3],$Te0[2]
||	PACK2	$Te0[3],$Te1[0],$Te0[3]
	PACK2	$Te2[0],$Te3[1],$Te2[0]
||	PACK2	$Te2[1],$Te3[2],$Te2[1]
||	BNOP	RA
	PACKL4	$Te0[2],$Te2[0],$Te0[2]
||	PACKL4	$Te0[3],$Te2[1],$Te0[3]
	XOR	$K[2],$Te0[2],$Te0[2]		; s[2]
||	XOR	$K[3],$Te0[3],$Te0[3]		; s[3]

	MV	$Te0[0],A9
||	MV	$Te0[1],A8
	MV	$Te0[2],B9
||	MV	$Te0[3],B8
|| [B2]	STNDW	A9:A8,*OUT++
   [B2]	STNDW	B9:B8,*OUT++
	.else
	PACK2	$Te1[1],$Te0[0],$Te1[1]
||	PACK2	$Te1[2],$Te0[1],$Te1[2]
	PACK2	$Te3[3],$Te2[2],$Te3[3]
||	PACK2	$Te3[0],$Te2[3],$Te3[0]
	PACKL4	$Te3[3],$Te1[1],$Te1[1]
||	PACKL4	$Te3[0],$Te1[2],$Te1[2]
	XOR	$K[0],$Te1[1],$Te1[1]		; s[0]
||	XOR	$K[1],$Te1[2],$Te1[2]		; s[1]

	PACK2	$Te1[3],$Te0[2],$Te1[3]
||	PACK2	$Te1[0],$Te0[3],$Te1[0]
	PACK2	$Te3[1],$Te2[0],$Te3[1]
||	PACK2	$Te3[2],$Te2[1],$Te3[2]
||	BNOP	RA
	PACKL4	$Te3[1],$Te1[3],$Te1[3]
||	PACKL4	$Te3[2],$Te1[0],$Te1[0]
	XOR	$K[2],$Te1[3],$Te1[3]		; s[2]
||	XOR	$K[3],$Te1[0],$Te1[0]		; s[3]

	MV	$Te1[1],A8
||	MV	$Te1[2],A9
	MV	$Te1[3],B8
||	MV	$Te1[0],B9
|| [B2]	STNDW	A9:A8,*OUT++
   [B2]	STNDW	B9:B8,*OUT++
	.endif
	.endasmfunc

	.global	_AES_decrypt
_AES_decrypt:
	.asmfunc
	MVK	1,B2
__decrypt:
	.if	__TI_EABI__
   [B2]	LDNDW	*INP++,A9:A8			; load input
||	MVKL	\$PCR_OFFSET(AES_Td,__decrypt),$TEA
||	ADDKPC	__decrypt,B0
   [B2]	LDNDW	*INP++,B9:B8
||	MVKH	\$PCR_OFFSET(AES_Td,__decrypt),$TEA
||	ADD	0,KEY,$KPA
||	ADD	4,KEY,$KPB
	.else
   [B2]	LDNDW	*INP++,A9:A8			; load input
||	MVKL	(AES_Td-__decrypt),$TEA
||	ADDKPC	__decrypt,B0
   [B2]	LDNDW	*INP++,B9:B8
||	MVKH	(AES_Td-__decrypt),$TEA
||	ADD	0,KEY,$KPA
||	ADD	4,KEY,$KPB
	.endif
	LDW	*$KPA++[2],$Td0[0]		; zero round key
||	LDW	*$KPB++[2],$Td0[1]
||	MVK	60,A0
||	ADD	B0,$TEA,$TEA			; AES_Td
	LDW	*KEY[A0],B0			; rounds
||	MVK	1024,A0				; sizeof(AES_Td)
	LDW	*$KPA++[2],$Td0[2]
||	LDW	*$KPB++[2],$Td0[3]
||	MV	$TEA,$TEB
	NOP
	.if	.BIG_ENDIAN
	MV	A9,$s[0]
||	MV	A8,$s[1]
||	MV	B9,$s[2]
||	MV	B8,$s[3]
	.else
	MV	A8,$s[0]
||	MV	A9,$s[1]
||	MV	B8,$s[2]
||	MV	B9,$s[3]
	.endif
	XOR	$Td0[0],$s[0],$s[0]
||	XOR	$Td0[1],$s[1],$s[1]
||	LDW	*$KPA++[2],$K[0]		; 1st round key
||	LDW	*$KPB++[2],$K[1]
	SUB	B0,2,B0

	SPLOOPD	13
||	MVC	B0,ILC
||	LDW	*$KPA++[2],$K[2]
||	LDW	*$KPB++[2],$K[3]
;;====================================================================
	EXTU	$s[1],EXT3,24,$Td3[1]
||	EXTU	$s[0],EXT1,24,$Td1[0]
	LDW	*${TEB}[$Td3[1]],$Td3[1]	; Td3[s1>>24],	t0
||	LDW	*${TEA}[$Td1[0]],$Td1[0]	; Td1[s0>>8],	t1
||	XOR	$s[2],$Td0[2],$s[2]		; modulo-scheduled
||	XOR	$s[3],$Td0[3],$s[3]		; modulo-scheduled
||	EXTU	$s[1],EXT1,24,$Td1[1]
||	EXTU	$s[0],EXT3,24,$Td3[0]
	LDW	*${TEB}[$Td1[1]],$Td1[1]	; Td1[s1>>8],	t2
||	LDW	*${TEA}[$Td3[0]],$Td3[0]	; Td3[s0>>24],	t3
||	EXTU	$s[2],EXT2,24,$Td2[2]
||	EXTU	$s[3],EXT2,24,$Td2[3]
	LDW	*${TEA}[$Td2[2]],$Td2[2]	; Td2[s2>>16],	t0
||	LDW	*${TEB}[$Td2[3]],$Td2[3]	; Td2[s3>>16],	t1
||	EXTU	$s[3],EXT1,24,$Td1[3]
||	EXTU	$s[2],EXT3,24,$Td3[2]
	LDW	*${TEB}[$Td1[3]],$Td1[3]	; Td1[s3>>8],	t0
||	LDW	*${TEA}[$Td3[2]],$Td3[2]	; Td3[s2>>24],	t1
||	EXTU	$s[0],EXT2,24,$Td2[0]
||	EXTU	$s[1],EXT2,24,$Td2[1]
	LDW	*${TEA}[$Td2[0]],$Td2[0]	; Td2[s0>>16],	t2
||	LDW	*${TEB}[$Td2[1]],$Td2[1]	; Td2[s1>>16],	t3
||	EXTU	$s[3],EXT3,24,$Td3[3]
||	EXTU	$s[2],EXT1,24,$Td1[2]
	LDW	*${TEB}[$Td3[3]],$Td3[3]	; Td3[s3>>24],	t2
||	LDW	*${TEA}[$Td1[2]],$Td1[2]	; Td1[s2>>8],	t3
||	ROTL	$Td3[1],TBL3,$Td1[0]		; t0
||	ROTL	$Td1[0],TBL1,$Td3[1]		; t1
||	EXTU	$s[0],EXT0,24,$Td0[0]
||	EXTU	$s[1],EXT0,24,$Td0[1]
	LDW	*${TEA}[$Td0[0]],$Td0[0]	; Td0[s0],	t0
||	LDW	*${TEB}[$Td0[1]],$Td0[1]	; Td0[s1],	t1
||	ROTL	$Td1[1],TBL1,$Td3[0]		; t2
||	ROTL	$Td3[0],TBL3,$Td1[1]		; t3
||	EXTU	$s[2],EXT0,24,$Td0[2]
||	EXTU	$s[3],EXT0,24,$Td0[3]
	LDW	*${TEA}[$Td0[2]],$Td0[2]	; Td0[s2],	t2
||	LDW	*${TEB}[$Td0[3]],$Td0[3]	; Td0[s3],	t3
||	ROTL	$Td2[2],TBL2,$Td2[2]		; t0
||	ROTL	$Td2[3],TBL2,$Td2[3]		; t1
||	XOR	$K[0],$Td1[0],$s[0]
||	XOR	$K[1],$Td3[1],$s[1]
	ROTL	$Td1[3],TBL1,$Td3[2]		; t0
||	ROTL	$Td3[2],TBL3,$Td1[3]		; t1
||	XOR	$K[2],$Td3[0],$s[2]
||	XOR	$K[3],$Td1[1],$s[3]
||	LDW	*$KPA++[2],$K[0]		; next round key
||	LDW	*$KPB++[2],$K[1]
	ROTL	$Td2[0],TBL2,$Td2[0]		; t2
||	ROTL	$Td2[1],TBL2,$Td2[1]		; t3
||	XOR	$s[0],$Td2[2],$s[0]
||	XOR	$s[1],$Td2[3],$s[1]
||	LDW	*$KPA++[2],$K[2]
||	LDW	*$KPB++[2],$K[3]
	ROTL	$Td3[3],TBL3,$Td1[2]		; t2
||	ROTL	$Td1[2],TBL1,$Td3[3]		; t3
||	XOR	$s[0],$Td3[2],$s[0]
||	XOR	$s[1],$Td1[3],$s[1]
	XOR	$s[2],$Td2[0],$s[2]
||	XOR	$s[3],$Td2[1],$s[3]
||	XOR	$s[0],$Td0[0],$s[0]
||	XOR	$s[1],$Td0[1],$s[1]
	SPKERNEL
||	XOR.L	$s[2],$Td1[2],$s[2]
||	XOR.L	$s[3],$Td3[3],$s[3]
;;====================================================================
	ADD.D	${TEA},A0,${TEA}		; point to Td4
||	ADD.D	${TEB},A0,${TEB}
||	EXTU	$s[1],EXT3,24,$Td3[1]
||	EXTU	$s[0],EXT1,24,$Td1[0]
	LDBU	*${TEB}[$Td3[1]],$Td3[1]	; Td3[s1>>24],	t0
||	LDBU	*${TEA}[$Td1[0]],$Td1[0]	; Td1[s0>>8],	t1
||	XOR	$s[2],$Td0[2],$s[2]		; modulo-scheduled
||	XOR	$s[3],$Td0[3],$s[3]		; modulo-scheduled
||	EXTU	$s[0],EXT0,24,$Td0[0]
||	EXTU	$s[1],EXT0,24,$Td0[1]
	LDBU	*${TEA}[$Td0[0]],$Td0[0]	; Td0[s0],	t0
||	LDBU	*${TEB}[$Td0[1]],$Td0[1]	; Td0[s1],	t1
||	EXTU	$s[2],EXT2,24,$Td2[2]
||	EXTU	$s[3],EXT2,24,$Td2[3]
	LDBU	*${TEA}[$Td2[2]],$Td2[2]	; Td2[s2>>16],	t0
||	LDBU	*${TEB}[$Td2[3]],$Td2[3]	; Td2[s3>>16],	t1
||	EXTU	$s[3],EXT1,24,$Td1[3]
||	EXTU	$s[2],EXT3,24,$Td3[2]
	LDBU	*${TEB}[$Td1[3]],$Td1[3]	; Td1[s3>>8],	t0
||	LDBU	*${TEA}[$Td3[2]],$Td3[2]	; Td3[s2>>24],	t1
||	EXTU	$s[1],EXT1,24,$Td1[1]
||	EXTU	$s[0],EXT3,24,$Td3[0]
	LDBU	*${TEB}[$Td1[1]],$Td1[1]	; Td1[s1>>8],	t2
||	LDBU	*${TEA}[$Td3[0]],$Td3[0]	; Td3[s0>>24],	t3
||	EXTU	$s[0],EXT2,24,$Td2[0]
||	EXTU	$s[1],EXT2,24,$Td2[1]
	LDBU	*${TEA}[$Td2[0]],$Td2[0]	; Td2[s0>>16],	t2
||	LDBU	*${TEB}[$Td2[1]],$Td2[1]	; Td2[s1>>16],	t3
||	EXTU	$s[3],EXT3,24,$Td3[3]
||	EXTU	$s[2],EXT1,24,$Td1[2]
	LDBU	*${TEB}[$Td3[3]],$Td3[3]	; Td3[s3>>24],	t2
||	LDBU	*${TEA}[$Td1[2]],$Td1[2]	; Td1[s2>>8],	t3
||	EXTU	$s[2],EXT0,24,$Td0[2]
||	EXTU	$s[3],EXT0,24,$Td0[3]
	LDBU	*${TEA}[$Td0[2]],$Td0[2]	; Td0[s2],	t2
||	LDBU	*${TEB}[$Td0[3]],$Td0[3]	; Td0[s3],	t3

	.if	.BIG_ENDIAN
	PACK2	$Td0[0],$Td1[3],$Td0[0]
||	PACK2	$Td0[1],$Td1[0],$Td0[1]
	PACK2	$Td2[2],$Td3[1],$Td2[2]
||	PACK2	$Td2[3],$Td3[2],$Td2[3]
	PACKL4	$Td0[0],$Td2[2],$Td0[0]
||	PACKL4	$Td0[1],$Td2[3],$Td0[1]
	XOR	$K[0],$Td0[0],$Td0[0]		; s[0]
||	XOR	$K[1],$Td0[1],$Td0[1]		; s[1]

	PACK2	$Td0[2],$Td1[1],$Td0[2]
||	PACK2	$Td0[3],$Td1[2],$Td0[3]
	PACK2	$Td2[0],$Td3[3],$Td2[0]
||	PACK2	$Td2[1],$Td3[0],$Td2[1]
||	BNOP	RA
	PACKL4	$Td0[2],$Td2[0],$Td0[2]
||	PACKL4	$Td0[3],$Td2[1],$Td0[3]
	XOR	$K[2],$Td0[2],$Td0[2]		; s[2]
||	XOR	$K[3],$Td0[3],$Td0[3]		; s[3]

	MV	$Td0[0],A9
||	MV	$Td0[1],A8
	MV	$Td0[2],B9
||	MV	$Td0[3],B8
|| [B2]	STNDW	A9:A8,*OUT++
   [B2]	STNDW	B9:B8,*OUT++
	.else
	PACK2	$Td1[3],$Td0[0],$Td1[3]
||	PACK2	$Td1[0],$Td0[1],$Td1[0]
	PACK2	$Td3[1],$Td2[2],$Td3[1]
||	PACK2	$Td3[2],$Td2[3],$Td3[2]
	PACKL4	$Td3[1],$Td1[3],$Td1[3]
||	PACKL4	$Td3[2],$Td1[0],$Td1[0]
	XOR	$K[0],$Td1[3],$Td1[3]		; s[0]
||	XOR	$K[1],$Td1[0],$Td1[0]		; s[1]

	PACK2	$Td1[1],$Td0[2],$Td1[1]
||	PACK2	$Td1[2],$Td0[3],$Td1[2]
	PACK2	$Td3[3],$Td2[0],$Td3[3]
||	PACK2	$Td3[0],$Td2[1],$Td3[0]
||	BNOP	RA
	PACKL4	$Td3[3],$Td1[1],$Td1[1]
||	PACKL4	$Td3[0],$Td1[2],$Td1[2]
	XOR	$K[2],$Td1[1],$Td1[1]		; s[2]
||	XOR	$K[3],$Td1[2],$Td1[2]		; s[3]

	MV	$Td1[3],A8
||	MV	$Td1[0],A9
	MV	$Td1[1],B8
||	MV	$Td1[2],B9
|| [B2]	STNDW	A9:A8,*OUT++
   [B2]	STNDW	B9:B8,*OUT++
	.endif
	.endasmfunc
___
# Key schedule set-up: AES_set_encrypt_key and AES_set_decrypt_key.
#
# AES_set_encrypt_key (INP = user key, BITS = key length in bits, aliased
# to OUT, KEY = key schedule) returns in RET:
#    0	on success;
#   -1	when INP or KEY is zero;
#   -2	when BITS>>5 is none of 4, 6 or 8.
# B0 holds the number of rounds (10, 12 or 14) on success and zero on
# failure, which AES_set_decrypt_key relies upon.  Each key size has its
# own expansion code (key128?, key192?, key256?); all of them do byte-wide
# look-ups relative to AES_Te4 and read the round constants from 256 bytes
# past it.
#
# AES_set_decrypt_key branches to __set_encrypt_key with RA redirected to
# ret? (the caller's RA is kept in B31 and KEY in B30), swaps the round
# keys end for end, and then rewrites the round keys between the first and
# the last one using GMPY4 multiplications by 0x09, 0x0B, 0x0D and 0x0E
# (InvMixColumns, per the in-line comment).  GFPGFR is set to 0x0700001B
# for this and restored from B30 before returning.
{
my @K=(@K,@s);			# extended key
				# (8 registers: the round key words followed
				# by the state registers, so that 192- and
				# 256-bit keys fit)
my @Te4=map("B$_",(16..19));	# B16..B19, scratch for the byte look-ups

my @Kx9=@Te0;			# used in AES_set_decrypt_key
my @KxB=@Te1;			# (GMPY4 products by 0x09, 0x0B, 0x0D and
my @KxD=@Te2;			# 0x0E; they reuse the registers of the
my @KxE=@Te3;			# look-up scratch arrays)

$code.=<<___;
	.asg	OUT,BITS

	.global	_AES_set_encrypt_key
_AES_set_encrypt_key:
__set_encrypt_key:
	.asmfunc
	MV	INP,A0
||	SHRU	BITS,5,BITS			; 128-192-256 -> 4-6-8
||	MV	KEY,A1
  [!A0]	B	RA
||[!A0]	MVK	-1,RET
||[!A0]	MVK	1,A1				; only one B RA
  [!A1]	B	RA
||[!A1]	MVK	-1,RET
||[!A1]	MVK	0,A0
||	MVK	0,B0
||	MVK	0,A1
   [A0]	LDNDW	*INP++,A9:A8
|| [A0]	CMPEQ	4,BITS,B0
|| [A0]	CMPLT	3,BITS,A1
   [B0]	B	key128?
|| [A1]	LDNDW	*INP++,B9:B8
|| [A0]	CMPEQ	6,BITS,B0
|| [A0]	CMPLT	5,BITS,A1
   [B0]	B	key192?
|| [A1]	LDNDW	*INP++,B17:B16
|| [A0]	CMPEQ	8,BITS,B0
|| [A0]	CMPLT	7,BITS,A1
   [B0]	B	key256?
|| [A1]	LDNDW	*INP++,B19:B18

	.if	__TI_EABI__
   [A0]	ADD	0,KEY,$KPA
|| [A0]	ADD	4,KEY,$KPB
|| [A0]	MVKL	\$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
|| [A0]	ADDKPC	__set_encrypt_key,B6
   [A0]	MVKH	\$PCR_OFFSET(AES_Te4,__set_encrypt_key),$TEA
   [A0]	ADD	B6,$TEA,$TEA			; AES_Te4
	.else
   [A0]	ADD	0,KEY,$KPA
|| [A0]	ADD	4,KEY,$KPB
|| [A0]	MVKL	(AES_Te4-__set_encrypt_key),$TEA
|| [A0]	ADDKPC	__set_encrypt_key,B6
   [A0]	MVKH	(AES_Te4-__set_encrypt_key),$TEA
   [A0]	ADD	B6,$TEA,$TEA			; AES_Te4
	.endif
	NOP
	NOP

	BNOP	RA,5
||	MVK	-2,RET				; unknown bit lenght
||	MVK	0,B0				; redundant
;;====================================================================
;;====================================================================
key128?:
	.if	.BIG_ENDIAN
	MV	A9,$K[0]
||	MV	A8,$K[1]
||	MV	B9,$Te4[2]
||	MV	B8,$K[3]
	.else
	MV	A8,$K[0]
||	MV	A9,$K[1]
||	MV	B8,$Te4[2]
||	MV	B9,$K[3]
	.endif

	MVK	256,A0
||	MVK	9,B0

	SPLOOPD	14
||	MVC	B0,ILC
||	MV	$TEA,$TEB
||	ADD	$TEA,A0,A30			; rcon
;;====================================================================
	LDW	*A30++[1],A31			; rcon[i]
||	MV	$Te4[2],$K[2]
||	EXTU	$K[3],EXT1,24,$Te4[0]
	LDBU	*${TEB}[$Te4[0]],$Te4[0]
||	MV	$K[3],A0
||	EXTU	$K[3],EXT2,24,$Te4[1]
	LDBU	*${TEB}[$Te4[1]],$Te4[1]
||	EXTU	A0,EXT3,24,A0
||	EXTU	$K[3],EXT0,24,$Te4[3]
	.if	.BIG_ENDIAN
	LDBU	*${TEA}[A0],$Te4[3]
||	LDBU	*${TEB}[$Te4[3]],A0
	.else
	LDBU	*${TEA}[A0],A0
||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
	.endif

	STW	$K[0],*$KPA++[2]
||	STW	$K[1],*$KPB++[2]
	STW	$K[2],*$KPA++[2]
||	STW	$K[3],*$KPB++[2]

	XOR	A31,$K[0],$K[0]			; ^=rcon[i]
	.if	.BIG_ENDIAN
	PACK2	$Te4[0],$Te4[1],$Te4[1]
	PACK2	$Te4[3],A0,$Te4[3]
	PACKL4	$Te4[1],$Te4[3],$Te4[3]
	.else
	PACK2	$Te4[1],$Te4[0],$Te4[1]
	PACK2	$Te4[3],A0,$Te4[3]
	PACKL4	$Te4[3],$Te4[1],$Te4[3]
	.endif
	XOR	$Te4[3],$K[0],$Te4[0]		; K[0]
	XOR	$Te4[0],$K[1],$K[1]		; K[1]
	MV	$Te4[0],$K[0]
||	XOR	$K[1],$K[2],$Te4[2]		; K[2]
	XOR	$Te4[2],$K[3],$K[3]		; K[3]
	SPKERNEL
;;====================================================================
	BNOP	RA
	MV	$Te4[2],$K[2]
||	STW	$K[0],*$KPA++[2]
||	STW	$K[1],*$KPB++[2]
	STW	$K[2],*$KPA++[2]
||	STW	$K[3],*$KPB++[2]
	MVK	10,B0				; rounds
	STW	B0,*++${KPB}[15]
	MVK	0,RET
;;====================================================================
;;====================================================================
key192?:
	.if	.BIG_ENDIAN
	MV	A9,$K[0]
||	MV	A8,$K[1]
||	MV	B9,$K[2]
||	MV	B8,$K[3]
	MV	B17,$Te4[2]
||	MV	B16,$K[5]
	.else
	MV	A8,$K[0]
||	MV	A9,$K[1]
||	MV	B8,$K[2]
||	MV	B9,$K[3]
	MV	B16,$Te4[2]
||	MV	B17,$K[5]
	.endif

	MVK	256,A0
||	MVK	6,B0
	MV	$TEA,$TEB
||	ADD	$TEA,A0,A30			; rcon
;;====================================================================
loop192?:
	LDW	*A30++[1],A31			; rcon[i]
||	MV	$Te4[2],$K[4]
||	EXTU	$K[5],EXT1,24,$Te4[0]
	LDBU	*${TEB}[$Te4[0]],$Te4[0]
||	MV	$K[5],A0
||	EXTU	$K[5],EXT2,24,$Te4[1]
	LDBU	*${TEB}[$Te4[1]],$Te4[1]
||	EXTU	A0,EXT3,24,A0
||	EXTU	$K[5],EXT0,24,$Te4[3]
	.if	.BIG_ENDIAN
	LDBU	*${TEA}[A0],$Te4[3]
||	LDBU	*${TEB}[$Te4[3]],A0
	.else
	LDBU	*${TEA}[A0],A0
||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
	.endif

	STW	$K[0],*$KPA++[2]
||	STW	$K[1],*$KPB++[2]
	STW	$K[2],*$KPA++[2]
||	STW	$K[3],*$KPB++[2]
	STW	$K[4],*$KPA++[2]
||	STW	$K[5],*$KPB++[2]

	XOR	A31,$K[0],$K[0]			; ^=rcon[i]
	.if	.BIG_ENDIAN
	PACK2	$Te4[0],$Te4[1],$Te4[1]
||	PACK2	$Te4[3],A0,$Te4[3]
	PACKL4	$Te4[1],$Te4[3],$Te4[3]
	.else
	PACK2	$Te4[1],$Te4[0],$Te4[1]
||	PACK2	$Te4[3],A0,$Te4[3]
	PACKL4	$Te4[3],$Te4[1],$Te4[3]
	.endif
	BDEC	loop192?,B0
||	XOR	$Te4[3],$K[0],$Te4[0]		; K[0]
	XOR	$Te4[0],$K[1],$K[1]		; K[1]
	MV	$Te4[0],$K[0]
||	XOR	$K[1],$K[2],$Te4[2]		; K[2]
	XOR	$Te4[2],$K[3],$K[3]		; K[3]
	MV	$Te4[2],$K[2]
||	XOR	$K[3],$K[4],$Te4[2]		; K[4]
	XOR	$Te4[2],$K[5],$K[5]		; K[5]
;;====================================================================
	BNOP	RA
	STW	$K[0],*$KPA++[2]
||	STW	$K[1],*$KPB++[2]
	STW	$K[2],*$KPA++[2]
||	STW	$K[3],*$KPB++[2]
	MVK	12,B0				; rounds
	STW	B0,*++${KPB}[7]
	MVK	0,RET
;;====================================================================
;;====================================================================
key256?:
	.if	.BIG_ENDIAN
	MV	A9,$K[0]
||	MV	A8,$K[1]
||	MV	B9,$K[2]
||	MV	B8,$K[3]
	MV	B17,$K[4]
||	MV	B16,$K[5]
||	MV	B19,$Te4[2]
||	MV	B18,$K[7]
	.else
	MV	A8,$K[0]
||	MV	A9,$K[1]
||	MV	B8,$K[2]
||	MV	B9,$K[3]
	MV	B16,$K[4]
||	MV	B17,$K[5]
||	MV	B18,$Te4[2]
||	MV	B19,$K[7]
	.endif

	MVK	256,A0
||	MVK	6,B0
	MV	$TEA,$TEB
||	ADD	$TEA,A0,A30			; rcon
;;====================================================================
loop256?:
	LDW	*A30++[1],A31			; rcon[i]
||	MV	$Te4[2],$K[6]
||	EXTU	$K[7],EXT1,24,$Te4[0]
	LDBU	*${TEB}[$Te4[0]],$Te4[0]
||	MV	$K[7],A0
||	EXTU	$K[7],EXT2,24,$Te4[1]
	LDBU	*${TEB}[$Te4[1]],$Te4[1]
||	EXTU	A0,EXT3,24,A0
||	EXTU	$K[7],EXT0,24,$Te4[3]
	.if	.BIG_ENDIAN
	LDBU	*${TEA}[A0],$Te4[3]
||	LDBU	*${TEB}[$Te4[3]],A0
	.else
	LDBU	*${TEA}[A0],A0
||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
	.endif

	STW	$K[0],*$KPA++[2]
||	STW	$K[1],*$KPB++[2]
	STW	$K[2],*$KPA++[2]
||	STW	$K[3],*$KPB++[2]
	STW	$K[4],*$KPA++[2]
||	STW	$K[5],*$KPB++[2]
	STW	$K[6],*$KPA++[2]
||	STW	$K[7],*$KPB++[2]
||	XOR	A31,$K[0],$K[0]			; ^=rcon[i]
	.if	.BIG_ENDIAN
	PACK2	$Te4[0],$Te4[1],$Te4[1]
||	PACK2	$Te4[3],A0,$Te4[3]
	PACKL4	$Te4[1],$Te4[3],$Te4[3]
||[!B0]	B	done256?
	.else
	PACK2	$Te4[1],$Te4[0],$Te4[1]
||	PACK2	$Te4[3],A0,$Te4[3]
	PACKL4	$Te4[3],$Te4[1],$Te4[3]
||[!B0]	B	done256?
	.endif
	XOR	$Te4[3],$K[0],$Te4[0]		; K[0]
	XOR	$Te4[0],$K[1],$K[1]		; K[1]
	MV	$Te4[0],$K[0]
||	XOR	$K[1],$K[2],$Te4[2]		; K[2]
	XOR	$Te4[2],$K[3],$K[3]		; K[3]

	MV	$Te4[2],$K[2]
|| [B0]	EXTU	$K[3],EXT0,24,$Te4[0]
|| [B0]	SUB	B0,1,B0
	LDBU	*${TEB}[$Te4[0]],$Te4[0]
||	MV	$K[3],A0
||	EXTU	$K[3],EXT1,24,$Te4[1]
	LDBU	*${TEB}[$Te4[1]],$Te4[1]
||	EXTU	A0,EXT2,24,A0
||	EXTU	$K[3],EXT3,24,$Te4[3]

	.if	.BIG_ENDIAN
	LDBU	*${TEA}[A0],$Te4[3]
||	LDBU	*${TEB}[$Te4[3]],A0
	NOP	3
	PACK2	$Te4[0],$Te4[1],$Te4[1]
	PACK2	$Te4[3],A0,$Te4[3]
||	B	loop256?
	PACKL4	$Te4[1],$Te4[3],$Te4[3]
	.else
	LDBU	*${TEA}[A0],A0
||	LDBU	*${TEB}[$Te4[3]],$Te4[3]
	NOP	3
	PACK2	$Te4[1],$Te4[0],$Te4[1]
	PACK2	$Te4[3],A0,$Te4[3]
||	B	loop256?
	PACKL4	$Te4[3],$Te4[1],$Te4[3]
	.endif

	XOR	$Te4[3],$K[4],$Te4[0]		; K[4]
	XOR	$Te4[0],$K[5],$K[5]		; K[5]
	MV	$Te4[0],$K[4]
||	XOR	$K[5],$K[6],$Te4[2]		; K[6]
	XOR	$Te4[2],$K[7],$K[7]		; K[7]
;;====================================================================
done256?:
	BNOP	RA
	STW	$K[0],*$KPA++[2]
||	STW	$K[1],*$KPB++[2]
	STW	$K[2],*$KPA++[2]
||	STW	$K[3],*$KPB++[2]
	MVK	14,B0				; rounds
	STW	B0,*--${KPB}[1]
	MVK	0,RET
	.endasmfunc

	.global	_AES_set_decrypt_key
_AES_set_decrypt_key:
	.asmfunc
	B	__set_encrypt_key		; guarantee local call
	MV	KEY,B30				; B30 is not modified
	MV	RA, B31				; B31 is not modified
	ADDKPC	ret?,RA,2
ret?:						; B0 holds rounds or zero
  [!B0]	BNOP	B31				; return if zero
   [B0]	SHL	B0,4,A0				; offset to last round key
   [B0]	SHRU	B0,1,B1
   [B0]	SUB	B1,1,B1
   [B0]	MVK	0x0000001B,B3			; AES polynomial
   [B0]	MVKH	0x07000000,B3

	SPLOOPD	9				; flip round keys
||	MVC	B1,ILC
||	MV	B30,$KPA
||	ADD	B30,A0,$KPB
||	MVK	16,A0				; sizeof(round key)
;;====================================================================
	LDW	*${KPA}[0],A16
||	LDW	*${KPB}[0],B16
	LDW	*${KPA}[1],A17
||	LDW	*${KPB}[1],B17
	LDW	*${KPA}[2],A18
||	LDW	*${KPB}[2],B18
	LDW	*${KPA}[3],A19
||	ADD	$KPA,A0,$KPA
||	LDW	*${KPB}[3],B19
||	SUB	$KPB,A0,$KPB
	NOP
	STW	B16,*${KPA}[-4]
||	STW	A16,*${KPB}[4]
	STW	B17,*${KPA}[-3]
||	STW	A17,*${KPB}[5]
	STW	B18,*${KPA}[-2]
||	STW	A18,*${KPB}[6]
	STW	B19,*${KPA}[-1]
||	STW	A19,*${KPB}[7]
	SPKERNEL
;;====================================================================
	SUB	B0,1,B0				; skip last round
||	ADD	B30,A0,$KPA			; skip first round
||	ADD	B30,A0,$KPB
||	MVC	GFPGFR,B30			; save GFPGFR
	LDW	*${KPA}[0],$K[0]
||	LDW	*${KPB}[1],$K[1]
||	MVC	B3,GFPGFR
	LDW	*${KPA}[2],$K[2]
||	LDW	*${KPB}[3],$K[3]
	MVK	0x00000909,A24
||	MVK	0x00000B0B,B24
	MVKH	0x09090000,A24
||	MVKH	0x0B0B0000,B24
	MVC	B0,ILC
||	SUB	B0,1,B0

	GMPY4	$K[0],A24,$Kx9[0]		; �0x09
||	GMPY4	$K[1],A24,$Kx9[1]
||	MVK	0x00000D0D,A25
||	MVK	0x00000E0E,B25
	GMPY4	$K[2],A24,$Kx9[2]
||	GMPY4	$K[3],A24,$Kx9[3]
||	MVKH	0x0D0D0000,A25
||	MVKH	0x0E0E0000,B25

	GMPY4	$K[0],B24,$KxB[0]		; �0x0B
||	GMPY4	$K[1],B24,$KxB[1]
	GMPY4	$K[2],B24,$KxB[2]
||	GMPY4	$K[3],B24,$KxB[3]

	SPLOOP	11				; InvMixColumns
;;====================================================================
	GMPY4	$K[0],A25,$KxD[0]		; �0x0D
||	GMPY4	$K[1],A25,$KxD[1]
||	SWAP2	$Kx9[0],$Kx9[0]			; rotate by 16
||	SWAP2	$Kx9[1],$Kx9[1]
||	MV	$K[0],$s[0]			; this or DINT
||	MV	$K[1],$s[1]
|| [B0]	LDW	*${KPA}[4],$K[0]
|| [B0]	LDW	*${KPB}[5],$K[1]
	GMPY4	$K[2],A25,$KxD[2]
||	GMPY4	$K[3],A25,$KxD[3]
||	SWAP2	$Kx9[2],$Kx9[2]
||	SWAP2	$Kx9[3],$Kx9[3]
||	MV	$K[2],$s[2]
||	MV	$K[3],$s[3]
|| [B0]	LDW	*${KPA}[6],$K[2]
|| [B0]	LDW	*${KPB}[7],$K[3]

	GMPY4	$s[0],B25,$KxE[0]		; �0x0E
||	GMPY4	$s[1],B25,$KxE[1]
||	XOR	$Kx9[0],$KxB[0],$KxB[0]
||	XOR	$Kx9[1],$KxB[1],$KxB[1]
	GMPY4	$s[2],B25,$KxE[2]
||	GMPY4	$s[3],B25,$KxE[3]
||	XOR	$Kx9[2],$KxB[2],$KxB[2]
||	XOR	$Kx9[3],$KxB[3],$KxB[3]

	ROTL	$KxB[0],TBL3,$KxB[0]
||	ROTL	$KxB[1],TBL3,$KxB[1]
||	SWAP2	$KxD[0],$KxD[0]			; rotate by 16
||	SWAP2	$KxD[1],$KxD[1]
	ROTL	$KxB[2],TBL3,$KxB[2]
||	ROTL	$KxB[3],TBL3,$KxB[3]
||	SWAP2	$KxD[2],$KxD[2]
||	SWAP2	$KxD[3],$KxD[3]

	XOR	$KxE[0],$KxD[0],$KxE[0]
||	XOR	$KxE[1],$KxD[1],$KxE[1]
|| [B0]	GMPY4	$K[0],A24,$Kx9[0]		; �0x09
|| [B0]	GMPY4	$K[1],A24,$Kx9[1]
||	ADDAW	$KPA,4,$KPA
	XOR	$KxE[2],$KxD[2],$KxE[2]
||	XOR	$KxE[3],$KxD[3],$KxE[3]
|| [B0]	GMPY4	$K[2],A24,$Kx9[2]
|| [B0]	GMPY4	$K[3],A24,$Kx9[3]
||	ADDAW	$KPB,4,$KPB

	XOR	$KxB[0],$KxE[0],$KxE[0]
||	XOR	$KxB[1],$KxE[1],$KxE[1]
|| [B0]	GMPY4	$K[0],B24,$KxB[0]		; �0x0B
|| [B0]	GMPY4	$K[1],B24,$KxB[1]
	XOR	$KxB[2],$KxE[2],$KxE[2]
||	XOR	$KxB[3],$KxE[3],$KxE[3]
|| [B0]	GMPY4	$K[2],B24,$KxB[2]
|| [B0]	GMPY4	$K[3],B24,$KxB[3]
||	STW	$KxE[0],*${KPA}[-4]
||	STW	$KxE[1],*${KPB}[-3]
	STW	$KxE[2],*${KPA}[-2]
||	STW	$KxE[3],*${KPB}[-1]
|| [B0]	SUB	B0,1,B0
	SPKERNEL
;;====================================================================
	BNOP	B31,3
	MVC	B30,GFPGFR			; restore GFPGFR(*)
	MVK	0,RET
	.endasmfunc
___
# (*)	Even though ABI doesn't specify GFPGFR as non-volatile, there
#	are code samples out there that *assume* its default value.
}
# AES_ctr32_encrypt: counter-mode processing built on top of __encrypt.
#
# The 16-byte counter block is loaded from $ivp into A31:A30/B31:B30.  On
# every pass through ctr32_loop? the counter block is copied to
# A9:A8/B9:B8 and __encrypt is entered with B2 cleared, so it neither
# loads input nor stores output; RA is pointed back at ctr32_loop?.  The
# input block is loaded into A29:A28/B29:B28, and the XOR with the
# encrypted counter plus the store to OUT happen on the following pass
# (A1 is zero on the first pass, which postpones output).  The counter
# increment is a plain ADD on one 32-bit word: B30 on big-endian, B31 on
# little-endian, where that word is kept byte-swapped into host order and
# swapped back when it is passed on.  The loop ends, returning through
# the saved RA in B27, once the block count in A2 reaches zero.
{
# Argument registers.  Note that the template below addresses the input
# and output pointers through the INP/OUT assembler aliases (A4/B4), not
# through $inp/$out.
my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8");
$code.=<<___;
	.global	_AES_ctr32_encrypt
_AES_ctr32_encrypt:
	.asmfunc
	LDNDW	*${ivp}[0],A31:A30	; load counter value
||	MV	$blocks,A2		; reassign $blocks
||	DMV	RA,$key,B27:B26		; reassign RA and $key
	LDNDW	*${ivp}[1],B31:B30
||	MVK	0,B2			; don't let __encrypt load input
||	MVK	0,A1			; and postpone writing output
	.if	.BIG_ENDIAN
	NOP
	.else
	NOP	4
	SWAP2	B31,B31			; keep least significant 32 bits
	SWAP4	B31,B31			; in host byte order
	.endif
ctr32_loop?:
   [A2]	BNOP	__encrypt
|| [A1]	XOR	A29,A9,A9		; input^Ek(counter)
|| [A1]	XOR	A28,A8,A8
|| [A2]	LDNDW	*INP++,A29:A28		; load input
  [!A2]	BNOP	B27			; return
|| [A1]	XOR	B29,B9,B9
|| [A1]	XOR	B28,B8,B8
|| [A2]	LDNDW	*INP++,B29:B28
	.if	.BIG_ENDIAN
   [A1]	STNDW	A9:A8,*OUT++		; save output
|| [A2]	DMV	A31,A30,A9:A8		; pass counter value to __encrypt
   [A1]	STNDW	B9:B8,*OUT++
|| [A2]	DMV	B31,B30,B9:B8
|| [A2]	ADD	B30,1,B30		; counter++
	.else
   [A1]	STNDW	A9:A8,*OUT++		; save output
|| [A2]	DMV	A31,A30,A9:A8
|| [A2]	SWAP2	B31,B0
|| [A2]	ADD	B31,1,B31		; counter++
   [A1]	STNDW	B9:B8,*OUT++
|| [A2]	MV	B30,B8
|| [A2]	SWAP4	B0,B9
	.endif
   [A2]	ADDKPC	ctr32_loop?,RA		; return to ctr32_loop?
|| [A2]	MV	B26,KEY			; pass $key
|| [A2]	SUB	A2,1,A2			; $blocks--
||[!A1]	MVK	1,A1
	NOP
	NOP
	.endasmfunc
___
}
# The tables are emitted as byte sequences, i.e. kept in an endian-neutral
# manner.
$code.=<<___;
	.if	__TI_EABI__
	.sect	".text:aes_asm.const"
	.else
	.sect	".const:aes_asm"
	.endif
	.align	128
AES_Te:
	.byte	0xc6,0x63,0x63,0xa5,	0xf8,0x7c,0x7c,0x84
	.byte	0xee,0x77,0x77,0x99,	0xf6,0x7b,0x7b,0x8d
	.byte	0xff,0xf2,0xf2,0x0d,	0xd6,0x6b,0x6b,0xbd
	.byte	0xde,0x6f,0x6f,0xb1,	0x91,0xc5,0xc5,0x54
	.byte	0x60,0x30,0x30,0x50,	0x02,0x01,0x01,0x03
	.byte	0xce,0x67,0x67,0xa9,	0x56,0x2b,0x2b,0x7d
	.byte	0xe7,0xfe,0xfe,0x19,	0xb5,0xd7,0xd7,0x62
	.byte	0x4d,0xab,0xab,0xe6,	0xec,0x76,0x76,0x9a
	.byte	0x8f,0xca,0xca,0x45,	0x1f,0x82,0x82,0x9d
	.byte	0x89,0xc9,0xc9,0x40,	0xfa,0x7d,0x7d,0x87
	.byte	0xef,0xfa,0xfa,0x15,	0xb2,0x59,0x59,0xeb
	.byte	0x8e,0x47,0x47,0xc9,	0xfb,0xf0,0xf0,0x0b
	.byte	0x41,0xad,0xad,0xec,	0xb3,0xd4,0xd4,0x67
	.byte	0x5f,0xa2,0xa2,0xfd,	0x45,0xaf,0xaf,0xea
	.byte	0x23,0x9c,0x9c,0xbf,	0x53,0xa4,0xa4,0xf7
	.byte	0xe4,0x72,0x72,0x96,	0x9b,0xc0,0xc0,0x5b
	.byte	0x75,0xb7,0xb7,0xc2,	0xe1,0xfd,0xfd,0x1c
	.byte	0x3d,0x93,0x93,0xae,	0x4c,0x26,0x26,0x6a
	.byte	0x6c,0x36,0x36,0x5a,	0x7e,0x3f,0x3f,0x41
	.byte	0xf5,0xf7,0xf7,0x02,	0x83,0xcc,0xcc,0x4f
	.byte	0x68,0x34,0x34,0x5c,	0x51,0xa5,0xa5,0xf4
	.byte	0xd1,0xe5,0xe5,0x34,	0xf9,0xf1,0xf1,0x08
	.byte	0xe2,0x71,0x71,0x93,	0xab,0xd8,0xd8,0x73
	.byte	0x62,0x31,0x31,0x53,	0x2a,0x15,0x15,0x3f
	.byte	0x08,0x04,0x04,0x0c,	0x95,0xc7,0xc7,0x52
	.byte	0x46,0x23,0x23,0x65,	0x9d,0xc3,0xc3,0x5e
	.byte	0x30,0x18,0x18,0x28,	0x37,0x96,0x96,0xa1
	.byte	0x0a,0x05,0x05,0x0f,	0x2f,0x9a,0x9a,0xb5
	.byte	0x0e,0x07,0x07,0x09,	0x24,0x12,0x12,0x36
	.byte	0x1b,0x80,0x80,0x9b,	0xdf,0xe2,0xe2,0x3d
	.byte	0xcd,0xeb,0xeb,0x26,	0x4e,0x27,0x27,0x69
	.byte	0x7f,0xb2,0xb2,0xcd,	0xea,0x75,0x75,0x9f
	.byte	0x12,0x09,0x09,0x1b,	0x1d,0x83,0x83,0x9e
	.byte	0x58,0x2c,0x2c,0x74,	0x34,0x1a,0x1a,0x2e
	.byte	0x36,0x1b,0x1b,0x2d,	0xdc,0x6e,0x6e,0xb2
	.byte	0xb4,0x5a,0x5a,0xee,	0x5b,0xa0,0xa0,0xfb
	.byte	0xa4,0x52,0x52,0xf6,	0x76,0x3b,0x3b,0x4d
	.byte	0xb7,0xd6,0xd6,0x61,	0x7d,0xb3,0xb3,0xce
	.byte	0x52,0x29,0x29,0x7b,	0xdd,0xe3,0xe3,0x3e
	.byte	0x5e,0x2f,0x2f,0x71,	0x13,0x84,0x84,0x97
	.byte	0xa6,0x53,0x53,0xf5,	0xb9,0xd1,0xd1,0x68
	.byte	0x00,0x00,0x00,0x00,	0xc1,0xed,0xed,0x2c
	.byte	0x40,0x20,0x20,0x60,	0xe3,0xfc,0xfc,0x1f
	.byte	0x79,0xb1,0xb1,0xc8,	0xb6,0x5b,0x5b,0xed
	.byte	0xd4,0x6a,0x6a,0xbe,	0x8d,0xcb,0xcb,0x46
	.byte	0x67,0xbe,0xbe,0xd9,	0x72,0x39,0x39,0x4b
	.byte	0x94,0x4a,0x4a,0xde,	0x98,0x4c,0x4c,0xd4
	.byte	0xb0,0x58,0x58,0xe8,	0x85,0xcf,0xcf,0x4a
	.byte	0xbb,0xd0,0xd0,0x6b,	0xc5,0xef,0xef,0x2a
	.byte	0x4f,0xaa,0xaa,0xe5,	0xed,0xfb,0xfb,0x16
	.byte	0x86,0x43,0x43,0xc5,	0x9a,0x4d,0x4d,0xd7
	.byte	0x66,0x33,0x33,0x55,	0x11,0x85,0x85,0x94
	.byte	0x8a,0x45,0x45,0xcf,	0xe9,0xf9,0xf9,0x10
	.byte	0x04,0x02,0x02,0x06,	0xfe,0x7f,0x7f,0x81
	.byte	0xa0,0x50,0x50,0xf0,	0x78,0x3c,0x3c,0x44
	.byte	0x25,0x9f,0x9f,0xba,	0x4b,0xa8,0xa8,0xe3
	.byte	0xa2,0x51,0x51,0xf3,	0x5d,0xa3,0xa3,0xfe
	.byte	0x80,0x40,0x40,0xc0,	0x05,0x8f,0x8f,0x8a
	.byte	0x3f,0x92,0x92,0xad,	0x21,0x9d,0x9d,0xbc
	.byte	0x70,0x38,0x38,0x48,	0xf1,0xf5,0xf5,0x04
	.byte	0x63,0xbc,0xbc,0xdf,	0x77,0xb6,0xb6,0xc1
	.byte	0xaf,0xda,0xda,0x75,	0x42,0x21,0x21,0x63
	.byte	0x20,0x10,0x10,0x30,	0xe5,0xff,0xff,0x1a
	.byte	0xfd,0xf3,0xf3,0x0e,	0xbf,0xd2,0xd2,0x6d
	.byte	0x81,0xcd,0xcd,0x4c,	0x18,0x0c,0x0c,0x14
	.byte	0x26,0x13,0x13,0x35,	0xc3,0xec,0xec,0x2f
	.byte	0xbe,0x5f,0x5f,0xe1,	0x35,0x97,0x97,0xa2
	.byte	0x88,0x44,0x44,0xcc,	0x2e,0x17,0x17,0x39
	.byte	0x93,0xc4,0xc4,0x57,	0x55,0xa7,0xa7,0xf2
	.byte	0xfc,0x7e,0x7e,0x82,	0x7a,0x3d,0x3d,0x47
	.byte	0xc8,0x64,0x64,0xac,	0xba,0x5d,0x5d,0xe7
	.byte	0x32,0x19,0x19,0x2b,	0xe6,0x73,0x73,0x95
	.byte	0xc0,0x60,0x60,0xa0,	0x19,0x81,0x81,0x98
	.byte	0x9e,0x4f,0x4f,0xd1,	0xa3,0xdc,0xdc,0x7f
	.byte	0x44,0x22,0x22,0x66,	0x54,0x2a,0x2a,0x7e
	.byte	0x3b,0x90,0x90,0xab,	0x0b,0x88,0x88,0x83
	.byte	0x8c,0x46,0x46,0xca,	0xc7,0xee,0xee,0x29
	.byte	0x6b,0xb8,0xb8,0xd3,	0x28,0x14,0x14,0x3c
	.byte	0xa7,0xde,0xde,0x79,	0xbc,0x5e,0x5e,0xe2
	.byte	0x16,0x0b,0x0b,0x1d,	0xad,0xdb,0xdb,0x76
	.byte	0xdb,0xe0,0xe0,0x3b,	0x64,0x32,0x32,0x56
	.byte	0x74,0x3a,0x3a,0x4e,	0x14,0x0a,0x0a,0x1e
	.byte	0x92,0x49,0x49,0xdb,	0x0c,0x06,0x06,0x0a
	.byte	0x48,0x24,0x24,0x6c,	0xb8,0x5c,0x5c,0xe4
	.byte	0x9f,0xc2,0xc2,0x5d,	0xbd,0xd3,0xd3,0x6e
	.byte	0x43,0xac,0xac,0xef,	0xc4,0x62,0x62,0xa6
	.byte	0x39,0x91,0x91,0xa8,	0x31,0x95,0x95,0xa4
	.byte	0xd3,0xe4,0xe4,0x37,	0xf2,0x79,0x79,0x8b
	.byte	0xd5,0xe7,0xe7,0x32,	0x8b,0xc8,0xc8,0x43
	.byte	0x6e,0x37,0x37,0x59,	0xda,0x6d,0x6d,0xb7
	.byte	0x01,0x8d,0x8d,0x8c,	0xb1,0xd5,0xd5,0x64
	.byte	0x9c,0x4e,0x4e,0xd2,	0x49,0xa9,0xa9,0xe0
	.byte	0xd8,0x6c,0x6c,0xb4,	0xac,0x56,0x56,0xfa
	.byte	0xf3,0xf4,0xf4,0x07,	0xcf,0xea,0xea,0x25
	.byte	0xca,0x65,0x65,0xaf,	0xf4,0x7a,0x7a,0x8e
	.byte	0x47,0xae,0xae,0xe9,	0x10,0x08,0x08,0x18
	.byte	0x6f,0xba,0xba,0xd5,	0xf0,0x78,0x78,0x88
	.byte	0x4a,0x25,0x25,0x6f,	0x5c,0x2e,0x2e,0x72
	.byte	0x38,0x1c,0x1c,0x24,	0x57,0xa6,0xa6,0xf1
	.byte	0x73,0xb4,0xb4,0xc7,	0x97,0xc6,0xc6,0x51
	.byte	0xcb,0xe8,0xe8,0x23,	0xa1,0xdd,0xdd,0x7c
	.byte	0xe8,0x74,0x74,0x9c,	0x3e,0x1f,0x1f,0x21
	.byte	0x96,0x4b,0x4b,0xdd,	0x61,0xbd,0xbd,0xdc
	.byte	0x0d,0x8b,0x8b,0x86,	0x0f,0x8a,0x8a,0x85
	.byte	0xe0,0x70,0x70,0x90,	0x7c,0x3e,0x3e,0x42
	.byte	0x71,0xb5,0xb5,0xc4,	0xcc,0x66,0x66,0xaa
	.byte	0x90,0x48,0x48,0xd8,	0x06,0x03,0x03,0x05
	.byte	0xf7,0xf6,0xf6,0x01,	0x1c,0x0e,0x0e,0x12
	.byte	0xc2,0x61,0x61,0xa3,	0x6a,0x35,0x35,0x5f
	.byte	0xae,0x57,0x57,0xf9,	0x69,0xb9,0xb9,0xd0
	.byte	0x17,0x86,0x86,0x91,	0x99,0xc1,0xc1,0x58
	.byte	0x3a,0x1d,0x1d,0x27,	0x27,0x9e,0x9e,0xb9
	.byte	0xd9,0xe1,0xe1,0x38,	0xeb,0xf8,0xf8,0x13
	.byte	0x2b,0x98,0x98,0xb3,	0x22,0x11,0x11,0x33
	.byte	0xd2,0x69,0x69,0xbb,	0xa9,0xd9,0xd9,0x70
	.byte	0x07,0x8e,0x8e,0x89,	0x33,0x94,0x94,0xa7
	.byte	0x2d,0x9b,0x9b,0xb6,	0x3c,0x1e,0x1e,0x22
	.byte	0x15,0x87,0x87,0x92,	0xc9,0xe9,0xe9,0x20
	.byte	0x87,0xce,0xce,0x49,	0xaa,0x55,0x55,0xff
	.byte	0x50,0x28,0x28,0x78,	0xa5,0xdf,0xdf,0x7a
	.byte	0x03,0x8c,0x8c,0x8f,	0x59,0xa1,0xa1,0xf8
	.byte	0x09,0x89,0x89,0x80,	0x1a,0x0d,0x0d,0x17
	.byte	0x65,0xbf,0xbf,0xda,	0xd7,0xe6,0xe6,0x31
	.byte	0x84,0x42,0x42,0xc6,	0xd0,0x68,0x68,0xb8
	.byte	0x82,0x41,0x41,0xc3,	0x29,0x99,0x99,0xb0
	.byte	0x5a,0x2d,0x2d,0x77,	0x1e,0x0f,0x0f,0x11
	.byte	0x7b,0xb0,0xb0,0xcb,	0xa8,0x54,0x54,0xfc
	.byte	0x6d,0xbb,0xbb,0xd6,	0x2c,0x16,0x16,0x3a
AES_Te4:
	.byte	0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
	.byte	0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
	.byte	0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
	.byte	0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
	.byte	0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
	.byte	0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
	.byte	0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
	.byte	0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
	.byte	0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
	.byte	0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
	.byte	0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
	.byte	0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
	.byte	0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
	.byte	0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
	.byte	0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
	.byte	0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
	.byte	0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
	.byte	0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
	.byte	0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
	.byte	0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
	.byte	0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
	.byte	0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
	.byte	0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
	.byte	0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
	.byte	0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
	.byte	0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
	.byte	0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
	.byte	0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
	.byte	0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
	.byte	0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
	.byte	0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
	.byte	0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
rcon:
	.byte	0x01,0x00,0x00,0x00,	0x02,0x00,0x00,0x00
	.byte	0x04,0x00,0x00,0x00,	0x08,0x00,0x00,0x00
	.byte	0x10,0x00,0x00,0x00,	0x20,0x00,0x00,0x00
	.byte	0x40,0x00,0x00,0x00,	0x80,0x00,0x00,0x00
	.byte	0x1B,0x00,0x00,0x00,	0x36,0x00,0x00,0x00
	.align	128
AES_Td:
	.byte	0x51,0xf4,0xa7,0x50,	0x7e,0x41,0x65,0x53
	.byte	0x1a,0x17,0xa4,0xc3,	0x3a,0x27,0x5e,0x96
	.byte	0x3b,0xab,0x6b,0xcb,	0x1f,0x9d,0x45,0xf1
	.byte	0xac,0xfa,0x58,0xab,	0x4b,0xe3,0x03,0x93
	.byte	0x20,0x30,0xfa,0x55,	0xad,0x76,0x6d,0xf6
	.byte	0x88,0xcc,0x76,0x91,	0xf5,0x02,0x4c,0x25
	.byte	0x4f,0xe5,0xd7,0xfc,	0xc5,0x2a,0xcb,0xd7
	.byte	0x26,0x35,0x44,0x80,	0xb5,0x62,0xa3,0x8f
	.byte	0xde,0xb1,0x5a,0x49,	0x25,0xba,0x1b,0x67
	.byte	0x45,0xea,0x0e,0x98,	0x5d,0xfe,0xc0,0xe1
	.byte	0xc3,0x2f,0x75,0x02,	0x81,0x4c,0xf0,0x12
	.byte	0x8d,0x46,0x97,0xa3,	0x6b,0xd3,0xf9,0xc6
	.byte	0x03,0x8f,0x5f,0xe7,	0x15,0x92,0x9c,0x95
	.byte	0xbf,0x6d,0x7a,0xeb,	0x95,0x52,0x59,0xda
	.byte	0xd4,0xbe,0x83,0x2d,	0x58,0x74,0x21,0xd3
	.byte	0x49,0xe0,0x69,0x29,	0x8e,0xc9,0xc8,0x44
	.byte	0x75,0xc2,0x89,0x6a,	0xf4,0x8e,0x79,0x78
	.byte	0x99,0x58,0x3e,0x6b,	0x27,0xb9,0x71,0xdd
	.byte	0xbe,0xe1,0x4f,0xb6,	0xf0,0x88,0xad,0x17
	.byte	0xc9,0x20,0xac,0x66,	0x7d,0xce,0x3a,0xb4
	.byte	0x63,0xdf,0x4a,0x18,	0xe5,0x1a,0x31,0x82
	.byte	0x97,0x51,0x33,0x60,	0x62,0x53,0x7f,0x45
	.byte	0xb1,0x64,0x77,0xe0,	0xbb,0x6b,0xae,0x84
	.byte	0xfe,0x81,0xa0,0x1c,	0xf9,0x08,0x2b,0x94
	.byte	0x70,0x48,0x68,0x58,	0x8f,0x45,0xfd,0x19
	.byte	0x94,0xde,0x6c,0x87,	0x52,0x7b,0xf8,0xb7
	.byte	0xab,0x73,0xd3,0x23,	0x72,0x4b,0x02,0xe2
	.byte	0xe3,0x1f,0x8f,0x57,	0x66,0x55,0xab,0x2a
	.byte	0xb2,0xeb,0x28,0x07,	0x2f,0xb5,0xc2,0x03
	.byte	0x86,0xc5,0x7b,0x9a,	0xd3,0x37,0x08,0xa5
	.byte	0x30,0x28,0x87,0xf2,	0x23,0xbf,0xa5,0xb2
	.byte	0x02,0x03,0x6a,0xba,	0xed,0x16,0x82,0x5c
	.byte	0x8a,0xcf,0x1c,0x2b,	0xa7,0x79,0xb4,0x92
	.byte	0xf3,0x07,0xf2,0xf0,	0x4e,0x69,0xe2,0xa1
	.byte	0x65,0xda,0xf4,0xcd,	0x06,0x05,0xbe,0xd5
	.byte	0xd1,0x34,0x62,0x1f,	0xc4,0xa6,0xfe,0x8a
	.byte	0x34,0x2e,0x53,0x9d,	0xa2,0xf3,0x55,0xa0
	.byte	0x05,0x8a,0xe1,0x32,	0xa4,0xf6,0xeb,0x75
	.byte	0x0b,0x83,0xec,0x39,	0x40,0x60,0xef,0xaa
	.byte	0x5e,0x71,0x9f,0x06,	0xbd,0x6e,0x10,0x51
	.byte	0x3e,0x21,0x8a,0xf9,	0x96,0xdd,0x06,0x3d
	.byte	0xdd,0x3e,0x05,0xae,	0x4d,0xe6,0xbd,0x46
	.byte	0x91,0x54,0x8d,0xb5,	0x71,0xc4,0x5d,0x05
	.byte	0x04,0x06,0xd4,0x6f,	0x60,0x50,0x15,0xff
	.byte	0x19,0x98,0xfb,0x24,	0xd6,0xbd,0xe9,0x97
	.byte	0x89,0x40,0x43,0xcc,	0x67,0xd9,0x9e,0x77
	.byte	0xb0,0xe8,0x42,0xbd,	0x07,0x89,0x8b,0x88
	.byte	0xe7,0x19,0x5b,0x38,	0x79,0xc8,0xee,0xdb
	.byte	0xa1,0x7c,0x0a,0x47,	0x7c,0x42,0x0f,0xe9
	.byte	0xf8,0x84,0x1e,0xc9,	0x00,0x00,0x00,0x00
	.byte	0x09,0x80,0x86,0x83,	0x32,0x2b,0xed,0x48
	.byte	0x1e,0x11,0x70,0xac,	0x6c,0x5a,0x72,0x4e
	.byte	0xfd,0x0e,0xff,0xfb,	0x0f,0x85,0x38,0x56
	.byte	0x3d,0xae,0xd5,0x1e,	0x36,0x2d,0x39,0x27
	.byte	0x0a,0x0f,0xd9,0x64,	0x68,0x5c,0xa6,0x21
	.byte	0x9b,0x5b,0x54,0xd1,	0x24,0x36,0x2e,0x3a
	.byte	0x0c,0x0a,0x67,0xb1,	0x93,0x57,0xe7,0x0f
	.byte	0xb4,0xee,0x96,0xd2,	0x1b,0x9b,0x91,0x9e
	.byte	0x80,0xc0,0xc5,0x4f,	0x61,0xdc,0x20,0xa2
	.byte	0x5a,0x77,0x4b,0x69,	0x1c,0x12,0x1a,0x16
	.byte	0xe2,0x93,0xba,0x0a,	0xc0,0xa0,0x2a,0xe5
	.byte	0x3c,0x22,0xe0,0x43,	0x12,0x1b,0x17,0x1d
	.byte	0x0e,0x09,0x0d,0x0b,	0xf2,0x8b,0xc7,0xad
	.byte	0x2d,0xb6,0xa8,0xb9,	0x14,0x1e,0xa9,0xc8
	.byte	0x57,0xf1,0x19,0x85,	0xaf,0x75,0x07,0x4c
	.byte	0xee,0x99,0xdd,0xbb,	0xa3,0x7f,0x60,0xfd
	.byte	0xf7,0x01,0x26,0x9f,	0x5c,0x72,0xf5,0xbc
	.byte	0x44,0x66,0x3b,0xc5,	0x5b,0xfb,0x7e,0x34
	.byte	0x8b,0x43,0x29,0x76,	0xcb,0x23,0xc6,0xdc
	.byte	0xb6,0xed,0xfc,0x68,	0xb8,0xe4,0xf1,0x63
	.byte	0xd7,0x31,0xdc,0xca,	0x42,0x63,0x85,0x10
	.byte	0x13,0x97,0x22,0x40,	0x84,0xc6,0x11,0x20
	.byte	0x85,0x4a,0x24,0x7d,	0xd2,0xbb,0x3d,0xf8
	.byte	0xae,0xf9,0x32,0x11,	0xc7,0x29,0xa1,0x6d
	.byte	0x1d,0x9e,0x2f,0x4b,	0xdc,0xb2,0x30,0xf3
	.byte	0x0d,0x86,0x52,0xec,	0x77,0xc1,0xe3,0xd0
	.byte	0x2b,0xb3,0x16,0x6c,	0xa9,0x70,0xb9,0x99
	.byte	0x11,0x94,0x48,0xfa,	0x47,0xe9,0x64,0x22
	.byte	0xa8,0xfc,0x8c,0xc4,	0xa0,0xf0,0x3f,0x1a
	.byte	0x56,0x7d,0x2c,0xd8,	0x22,0x33,0x90,0xef
	.byte	0x87,0x49,0x4e,0xc7,	0xd9,0x38,0xd1,0xc1
	.byte	0x8c,0xca,0xa2,0xfe,	0x98,0xd4,0x0b,0x36
	.byte	0xa6,0xf5,0x81,0xcf,	0xa5,0x7a,0xde,0x28
	.byte	0xda,0xb7,0x8e,0x26,	0x3f,0xad,0xbf,0xa4
	.byte	0x2c,0x3a,0x9d,0xe4,	0x50,0x78,0x92,0x0d
	.byte	0x6a,0x5f,0xcc,0x9b,	0x54,0x7e,0x46,0x62
	.byte	0xf6,0x8d,0x13,0xc2,	0x90,0xd8,0xb8,0xe8
	.byte	0x2e,0x39,0xf7,0x5e,	0x82,0xc3,0xaf,0xf5
	.byte	0x9f,0x5d,0x80,0xbe,	0x69,0xd0,0x93,0x7c
	.byte	0x6f,0xd5,0x2d,0xa9,	0xcf,0x25,0x12,0xb3
	.byte	0xc8,0xac,0x99,0x3b,	0x10,0x18,0x7d,0xa7
	.byte	0xe8,0x9c,0x63,0x6e,	0xdb,0x3b,0xbb,0x7b
	.byte	0xcd,0x26,0x78,0x09,	0x6e,0x59,0x18,0xf4
	.byte	0xec,0x9a,0xb7,0x01,	0x83,0x4f,0x9a,0xa8
	.byte	0xe6,0x95,0x6e,0x65,	0xaa,0xff,0xe6,0x7e
	.byte	0x21,0xbc,0xcf,0x08,	0xef,0x15,0xe8,0xe6
	.byte	0xba,0xe7,0x9b,0xd9,	0x4a,0x6f,0x36,0xce
	.byte	0xea,0x9f,0x09,0xd4,	0x29,0xb0,0x7c,0xd6
	.byte	0x31,0xa4,0xb2,0xaf,	0x2a,0x3f,0x23,0x31
	.byte	0xc6,0xa5,0x94,0x30,	0x35,0xa2,0x66,0xc0
	.byte	0x74,0x4e,0xbc,0x37,	0xfc,0x82,0xca,0xa6
	.byte	0xe0,0x90,0xd0,0xb0,	0x33,0xa7,0xd8,0x15
	.byte	0xf1,0x04,0x98,0x4a,	0x41,0xec,0xda,0xf7
	.byte	0x7f,0xcd,0x50,0x0e,	0x17,0x91,0xf6,0x2f
	.byte	0x76,0x4d,0xd6,0x8d,	0x43,0xef,0xb0,0x4d
	.byte	0xcc,0xaa,0x4d,0x54,	0xe4,0x96,0x04,0xdf
	.byte	0x9e,0xd1,0xb5,0xe3,	0x4c,0x6a,0x88,0x1b
	.byte	0xc1,0x2c,0x1f,0xb8,	0x46,0x65,0x51,0x7f
	.byte	0x9d,0x5e,0xea,0x04,	0x01,0x8c,0x35,0x5d
	.byte	0xfa,0x87,0x74,0x73,	0xfb,0x0b,0x41,0x2e
	.byte	0xb3,0x67,0x1d,0x5a,	0x92,0xdb,0xd2,0x52
	.byte	0xe9,0x10,0x56,0x33,	0x6d,0xd6,0x47,0x13
	.byte	0x9a,0xd7,0x61,0x8c,	0x37,0xa1,0x0c,0x7a
	.byte	0x59,0xf8,0x14,0x8e,	0xeb,0x13,0x3c,0x89
	.byte	0xce,0xa9,0x27,0xee,	0xb7,0x61,0xc9,0x35
	.byte	0xe1,0x1c,0xe5,0xed,	0x7a,0x47,0xb1,0x3c
	.byte	0x9c,0xd2,0xdf,0x59,	0x55,0xf2,0x73,0x3f
	.byte	0x18,0x14,0xce,0x79,	0x73,0xc7,0x37,0xbf
	.byte	0x53,0xf7,0xcd,0xea,	0x5f,0xfd,0xaa,0x5b
	.byte	0xdf,0x3d,0x6f,0x14,	0x78,0x44,0xdb,0x86
	.byte	0xca,0xaf,0xf3,0x81,	0xb9,0x68,0xc4,0x3e
	.byte	0x38,0x24,0x34,0x2c,	0xc2,0xa3,0x40,0x5f
	.byte	0x16,0x1d,0xc3,0x72,	0xbc,0xe2,0x25,0x0c
	.byte	0x28,0x3c,0x49,0x8b,	0xff,0x0d,0x95,0x41
	.byte	0x39,0xa8,0x01,0x71,	0x08,0x0c,0xb3,0xde
	.byte	0xd8,0xb4,0xe4,0x9c,	0x64,0x56,0xc1,0x90
	.byte	0x7b,0xcb,0x84,0x61,	0xd5,0x32,0xb6,0x70
	.byte	0x48,0x6c,0x5c,0x74,	0xd0,0xb8,0x57,0x42
AES_Td4:
	.byte	0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
	.byte	0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
	.byte	0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
	.byte	0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
	.byte	0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
	.byte	0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
	.byte	0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
	.byte	0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
	.byte	0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
	.byte	0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
	.byte	0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
	.byte	0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
	.byte	0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
	.byte	0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
	.byte	0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
	.byte	0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
	.byte	0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
	.byte	0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
	.byte	0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
	.byte	0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
	.byte	0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
	.byte	0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
	.byte	0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
	.byte	0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
	.byte	0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
	.byte	0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
	.byte	0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
	.byte	0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
	.byte	0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
	.byte	0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
	.byte	0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
	.byte	0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
	.cstring "AES for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

# Emit the generated assembly. STDOUT was re-opened onto $output at the top
# of the script, so this is what actually writes the target file.
#
# Output is buffered: a failed write (full disk, I/O error, ...) may only be
# reported when the handle is flushed at close time. Check both calls and
# abort with a non-zero status so the build does not silently consume a
# truncated assembly file.
print $code or die "error writing output: $!";
close STDOUT or die "error closing STDOUT: $!";