isa-l/igzip/aarch64/igzip_isal_adler32_neon.S
commit 1500db751d: Fix a handful of spelling mistakes and typos (Colin Ian King)

There are quite a few spelling mistakes and typos in comments and
user-facing message literal strings, as found using codespell. Fix
these.

Signed-off-by: Colin Ian King <colin.i.king@gmail.com>
Signed-off-by: Pablo de Lara <pablo.de.lara.guarch@intel.com>
2024-02-06 15:03:14 +00:00


/**********************************************************************
Copyright(c) 2019 Arm Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
* Neither the name of Arm Corporation nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************/
#include "../include/aarch64_label.h"
        .arch   armv8-a+crypto
        .text
        .align  3

/*
        Macros
*/
.macro  declare_var_vector_reg name:req, reg:req
        \name\()_q      .req    q\reg
        \name\()_v      .req    v\reg
        \name\()_s      .req    s\reg
        \name\()_d      .req    d\reg
.endm
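
/*
        mod_adler reduces \dest modulo 65521 (the Adler-32 modulus) without
        a division: const_div1 = 0x80078071 = ceil(2^47 / 65521), so
        (\dest * const_div1) >> 47 equals \dest / 65521 for any 32-bit
        \dest, and msub then subtracts that many copies of const_div2 = 65521.
*/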
.macro  mod_adler dest:req, tmp:req
        umull   \tmp\()_x, \dest, const_div1
        lsr     \tmp\()_x, \tmp\()_x, 47
        msub    \dest, \tmp, const_div2, \dest
.endm
/*
        uint32_t adler32_neon(uint32_t adler32, uint8_t *start, uint32_t length);
*/
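/*
        Adler-32 is defined by two running sums modulo 65521:
            A = 1 + d0 + d1 + ... + d(n-1)
            B = the sum of A after each byte
        The checksum is (B << 16) | A; the incoming adler32 argument carries
        A and B in its low and high halves.
*/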
/*
        Argument list
*/
adler32         .req    w0
start           .req    x1
length          .req    x2
        .global cdecl(adler32_neon)
#ifndef __APPLE__
        .type   adler32_neon, %function
#endif
cdecl(adler32_neon):
/*
        Local variables
*/
        declare_var_vector_reg  factor0, 6
        declare_var_vector_reg  factor1, 7
        declare_var_vector_reg  d0,      4
        declare_var_vector_reg  d1,      5
        declare_var_vector_reg  adacc,   2
        declare_var_vector_reg  s2acc,   3
        declare_var_vector_reg  zero,    16
        declare_var_vector_reg  adler,   17
        declare_var_vector_reg  sum2,    20
        declare_var_vector_reg  tmp2,    20
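        /* sum2 and tmp2 share v20; this is safe because tmp2 is consumed
           before sum2 is first written in each loop iteration. */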
adler0          .req    w4
adler1          .req    w5
adler0_x        .req    x4
adler1_x        .req    x5
end             .req    x0
tmp             .req    w8
tmp_x           .req    x8
tmp1_x          .req    x9
loop_cnt        .req    x10
loop_const      .req    x11
const_div1      .req    w6
const_div2      .req    w7
        mov     const_div1, 32881
        movk    const_div1, 0x8007, lsl 16
        mov     const_div2, 65521
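        /* Unpack the running checksum: A into adler0, B into adler1. */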
        and     adler0, adler32, 0xffff
        lsr     adler1, adler32, 16
        lsr     loop_cnt, length, 5
#ifndef __APPLE__
        adrp    x3, factors
        add     x3, x3, :lo12:factors
#else
        adrp    x3, factors@PAGE
        add     x3, x3, factors@PAGEOFF
#endif
        ld1     {factor0_v.16b-factor1_v.16b}, [x3]
        add     end, start, length
        cbz     loop_cnt, final_accum32
        mov     loop_const, 173
        movi    v16.4s, 0
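/*
        Outer loop: accumulate up to 173 blocks of 32 bytes (5536 bytes)
        between reductions.  173 appears chosen to stay just under zlib's
        NMAX of 5552, the longest run of 0xff bytes that can be summed
        before a 32-bit Adler-32 accumulator overflows.
*/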
great_than_32:
        cmp     loop_cnt, 173
        csel    loop_const, loop_cnt, loop_const, le
        mov     adacc_v.16b, zero_v.16b
        mov     s2acc_v.16b, zero_v.16b
        ins     adacc_v.s[0], adler0
        ins     s2acc_v.s[0], adler1
        add     tmp_x, start, loop_const, lsl 5
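/*
        Inner loop: one 32-byte block d[0..31] per iteration.  Per block,
            A += d[0] + d[1] + ... + d[31]
            B += 32*A_old + 32*d[0] + 31*d[1] + ... + 1*d[31]
        The shl by 5 supplies the 32*A_old term; the umull/umlal chain
        applies the 32..1 byte weights held in the factors table.
*/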
accum32_neon:
        ld1     {d0_v.16b-d1_v.16b}, [start]
        add     start, start, 32
        shl     tmp2_v.4s, adacc_v.4s, 5
        add     s2acc_v.4s, s2acc_v.4s, tmp2_v.4s
        uaddlp  adler_v.8h, d0_v.16b
        uadalp  adler_v.8h, d1_v.16b
        uadalp  adacc_v.4s, adler_v.8h
        umull   sum2_v.8h, factor0_v.8b,  d0_v.8b
        umlal2  sum2_v.8h, factor0_v.16b, d0_v.16b
        umlal   sum2_v.8h, factor1_v.8b,  d1_v.8b
        umlal2  sum2_v.8h, factor1_v.16b, d1_v.16b
        uadalp  s2acc_v.4s, sum2_v.8h
        cmp     start, tmp_x
        bne     accum32_neon
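        /* Fold the four 32-bit lanes of each accumulator into a scalar,
           then reduce both sums mod 65521. */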
        uaddlv  adacc_d, adacc_v.4s
        uaddlv  s2acc_d, s2acc_v.4s
        fmov    adler0_x, adacc_d
        fmov    adler1_x, s2acc_d
        mod_adler adler0, tmp
        mod_adler adler1, tmp
        sub     loop_cnt, loop_cnt, loop_const
        cbnz    loop_cnt, great_than_32
final_accum32:
        and     length, length, 31
        cbz     length, end_func
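/*
        Scalar tail: at most 31 leftover bytes, added one at a time.  With
        so few bytes neither sum can overflow 32 bits, so a single
        reduction at end_func suffices.
*/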
accum32_body:
        cmp     start, end
        beq     end_func
        ldrb    tmp, [start], 1
        add     adler0, adler0, tmp
        add     adler1, adler1, adler0
        b       accum32_body
end_func:
        mod_adler adler0, tmp
        mod_adler adler1, tmp
        orr     w0, adler0, adler1, lsl 16
        ret
#ifndef __APPLE__
        .size   adler32_neon, .-adler32_neon
        .section .rodata.cst16, "aM", @progbits, 16
#else
        .section __TEXT,__const
#endif
        .align  4
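/*
        Byte weights 32,31,...,1, stored as little-endian quads so that
        weight (32 - i) lines up with byte d[i] of each 32-byte block.
*/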
factors:
        .quad   0x191a1b1c1d1e1f20
        .quad   0x1112131415161718
        .quad   0x090a0b0c0d0e0f10
        .quad   0x0102030405060708