optimized sqrt in general and for android
Review URL: http://webrtc-codereview.appspot.com/42001 git-svn-id: http://webrtc.googlecode.com/svn/trunk@95 4adac7df-926f-26a2-2b94-8c16560cd09d
This commit is contained in:
parent
023abafa4e
commit
d99fa58ced
@ -53,7 +53,6 @@ LOCAL_SRC_FILES := add_sat_w16.c \
|
|||||||
resample_fractional.c \
|
resample_fractional.c \
|
||||||
sin_table.c \
|
sin_table.c \
|
||||||
sin_table_1024.c \
|
sin_table_1024.c \
|
||||||
spl_sqrt.c \
|
|
||||||
spl_version.c \
|
spl_version.c \
|
||||||
splitting_filter.c \
|
splitting_filter.c \
|
||||||
sqrt_of_one_minus_x_squared.c \
|
sqrt_of_one_minus_x_squared.c \
|
||||||
@ -61,6 +60,14 @@ LOCAL_SRC_FILES := add_sat_w16.c \
|
|||||||
sub_sat_w32.c \
|
sub_sat_w32.c \
|
||||||
vector_scaling_operations.c
|
vector_scaling_operations.c
|
||||||
|
|
||||||
|
ifeq ($(TARGET_ARCH), arm)
|
||||||
|
LOCAL_SRC_FILES += \
|
||||||
|
spl_sqrt.s
|
||||||
|
else
|
||||||
|
LOCAL_SRC_FILES += \
|
||||||
|
spl_sqrt.c
|
||||||
|
endif
|
||||||
|
|
||||||
# Flags passed to both C and C++ files.
|
# Flags passed to both C and C++ files.
|
||||||
MY_CFLAGS :=
|
MY_CFLAGS :=
|
||||||
MY_CFLAGS_C :=
|
MY_CFLAGS_C :=
|
||||||
|
@ -8,7 +8,6 @@
|
|||||||
* be found in the AUTHORS file in the root of the source tree.
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This file contains the function WebRtcSpl_Sqrt().
|
* This file contains the function WebRtcSpl_Sqrt().
|
||||||
* The description header can be found in signal_processing_library.h
|
* The description header can be found in signal_processing_library.h
|
||||||
@ -17,168 +16,23 @@
|
|||||||
|
|
||||||
#include "signal_processing_library.h"
|
#include "signal_processing_library.h"
|
||||||
|
|
||||||
WebRtc_Word32 WebRtcSpl_SqrtLocal(WebRtc_Word32 in);
|
#define iter1(N) \
|
||||||
|
try1 = root + (1 << (N)); \
|
||||||
|
if (value >= try1 << (N)) \
|
||||||
|
{ \
|
||||||
|
value -= try1 << (N); \
|
||||||
|
root |= 2 << (N); \
|
||||||
|
}
|
||||||
|
|
||||||
WebRtc_Word32 WebRtcSpl_SqrtLocal(WebRtc_Word32 in)
|
// (out) Square root of input parameter
|
||||||
{
|
WebRtc_Word32 WebRtcSpl_Sqrt(WebRtc_Word32 value) {
|
||||||
|
// new routine for performance, 4 cycles/bit in ARM
|
||||||
|
// output precision is 16 bits
|
||||||
|
|
||||||
WebRtc_Word16 x_half, t16;
|
WebRtc_Word32 root = 0, try1;
|
||||||
WebRtc_Word32 A, B, x2;
|
iter1 (15); iter1 (14); iter1 (13); iter1 (12);
|
||||||
|
iter1 (11); iter1 (10); iter1 ( 9); iter1 ( 8);
|
||||||
/* The following block performs:
|
iter1 ( 7); iter1 ( 6); iter1 ( 5); iter1 ( 4);
|
||||||
y=in/2
|
iter1 ( 3); iter1 ( 2); iter1 ( 1); iter1 ( 0);
|
||||||
x=y-2^30
|
return root >> 1;
|
||||||
x_half=x/2^31
|
|
||||||
t = 1 + (x_half) - 0.5*((x_half)^2) + 0.5*((x_half)^3) - 0.625*((x_half)^4)
|
|
||||||
+ 0.875*((x_half)^5)
|
|
||||||
*/
|
|
||||||
|
|
||||||
B = in;
|
|
||||||
|
|
||||||
B = WEBRTC_SPL_RSHIFT_W32(B, 1); // B = in/2
|
|
||||||
B = B - ((WebRtc_Word32)0x40000000); // B = in/2 - 1/2
|
|
||||||
x_half = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(B, 16);// x_half = x/2 = (in-1)/2
|
|
||||||
B = B + ((WebRtc_Word32)0x40000000); // B = 1 + x/2
|
|
||||||
B = B + ((WebRtc_Word32)0x40000000); // Add 0.5 twice (since 1.0 does not exist in Q31)
|
|
||||||
|
|
||||||
x2 = ((WebRtc_Word32)x_half) * ((WebRtc_Word32)x_half) * 2; // A = (x/2)^2
|
|
||||||
A = -x2; // A = -(x/2)^2
|
|
||||||
B = B + (A >> 1); // B = 1 + x/2 - 0.5*(x/2)^2
|
|
||||||
|
|
||||||
A = WEBRTC_SPL_RSHIFT_W32(A, 16);
|
|
||||||
A = A * A * 2; // A = (x/2)^4
|
|
||||||
t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16);
|
|
||||||
B = B + WEBRTC_SPL_MUL_16_16(-20480, t16) * 2; // B = B - 0.625*A
|
|
||||||
// After this, B = 1 + x/2 - 0.5*(x/2)^2 - 0.625*(x/2)^4
|
|
||||||
|
|
||||||
t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16);
|
|
||||||
A = WEBRTC_SPL_MUL_16_16(x_half, t16) * 2; // A = (x/2)^5
|
|
||||||
t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16);
|
|
||||||
B = B + WEBRTC_SPL_MUL_16_16(28672, t16) * 2; // B = B + 0.875*A
|
|
||||||
// After this, B = 1 + x/2 - 0.5*(x/2)^2 - 0.625*(x/2)^4 + 0.875*(x/2)^5
|
|
||||||
|
|
||||||
t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(x2, 16);
|
|
||||||
A = WEBRTC_SPL_MUL_16_16(x_half, t16) * 2; // A = x/2^3
|
|
||||||
|
|
||||||
B = B + (A >> 1); // B = B + 0.5*A
|
|
||||||
// After this, B = 1 + x/2 - 0.5*(x/2)^2 + 0.5*(x/2)^3 - 0.625*(x/2)^4 + 0.875*(x/2)^5
|
|
||||||
|
|
||||||
B = B + ((WebRtc_Word32)32768); // Round off bit
|
|
||||||
|
|
||||||
return B;
|
|
||||||
}
|
|
||||||
|
|
||||||
WebRtc_Word32 WebRtcSpl_Sqrt(WebRtc_Word32 value)
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
Algorithm:
|
|
||||||
|
|
||||||
Six term Taylor Series is used here to compute the square root of a number
|
|
||||||
y^0.5 = (1+x)^0.5 where x = y-1
|
|
||||||
= 1+(x/2)-0.5*((x/2)^2+0.5*((x/2)^3-0.625*((x/2)^4+0.875*((x/2)^5)
|
|
||||||
0.5 <= x < 1
|
|
||||||
|
|
||||||
Example of how the algorithm works, with ut=sqrt(in), and
|
|
||||||
with in=73632 and ut=271 (even shift value case):
|
|
||||||
|
|
||||||
in=73632
|
|
||||||
y= in/131072
|
|
||||||
x=y-1
|
|
||||||
t = 1 + (x/2) - 0.5*((x/2)^2) + 0.5*((x/2)^3) - 0.625*((x/2)^4) + 0.875*((x/2)^5)
|
|
||||||
ut=t*(1/sqrt(2))*512
|
|
||||||
|
|
||||||
or:
|
|
||||||
|
|
||||||
in=73632
|
|
||||||
in2=73632*2^14
|
|
||||||
y= in2/2^31
|
|
||||||
x=y-1
|
|
||||||
t = 1 + (x/2) - 0.5*((x/2)^2) + 0.5*((x/2)^3) - 0.625*((x/2)^4) + 0.875*((x/2)^5)
|
|
||||||
ut=t*(1/sqrt(2))
|
|
||||||
ut2=ut*2^9
|
|
||||||
|
|
||||||
which gives:
|
|
||||||
|
|
||||||
in = 73632
|
|
||||||
in2 = 1206386688
|
|
||||||
y = 0.56176757812500
|
|
||||||
x = -0.43823242187500
|
|
||||||
t = 0.74973506527313
|
|
||||||
ut = 0.53014274874797
|
|
||||||
ut2 = 2.714330873589594e+002
|
|
||||||
|
|
||||||
or:
|
|
||||||
|
|
||||||
in=73632
|
|
||||||
in2=73632*2^14
|
|
||||||
y=in2/2
|
|
||||||
x=y-2^30
|
|
||||||
x_half=x/2^31
|
|
||||||
t = 1 + (x_half) - 0.5*((x_half)^2) + 0.5*((x_half)^3) - 0.625*((x_half)^4)
|
|
||||||
+ 0.875*((x_half)^5)
|
|
||||||
ut=t*(1/sqrt(2))
|
|
||||||
ut2=ut*2^9
|
|
||||||
|
|
||||||
which gives:
|
|
||||||
|
|
||||||
in = 73632
|
|
||||||
in2 = 1206386688
|
|
||||||
y = 603193344
|
|
||||||
x = -470548480
|
|
||||||
x_half = -0.21911621093750
|
|
||||||
t = 0.74973506527313
|
|
||||||
ut = 0.53014274874797
|
|
||||||
ut2 = 2.714330873589594e+002
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|
||||||
WebRtc_Word16 x_norm, nshift, t16, sh;
|
|
||||||
WebRtc_Word32 A;
|
|
||||||
|
|
||||||
WebRtc_Word16 k_sqrt_2 = 23170; // 1/sqrt2 (==5a82)
|
|
||||||
|
|
||||||
A = value;
|
|
||||||
|
|
||||||
if (A == 0)
|
|
||||||
return (WebRtc_Word32)0; // sqrt(0) = 0
|
|
||||||
|
|
||||||
sh = WebRtcSpl_NormW32(A); // # shifts to normalize A
|
|
||||||
A = WEBRTC_SPL_LSHIFT_W32(A, sh); // Normalize A
|
|
||||||
if (A < (WEBRTC_SPL_WORD32_MAX - 32767))
|
|
||||||
{
|
|
||||||
A = A + ((WebRtc_Word32)32768); // Round off bit
|
|
||||||
} else
|
|
||||||
{
|
|
||||||
A = WEBRTC_SPL_WORD32_MAX;
|
|
||||||
}
|
|
||||||
|
|
||||||
x_norm = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16); // x_norm = AH
|
|
||||||
|
|
||||||
nshift = WEBRTC_SPL_RSHIFT_W16(sh, 1); // nshift = sh>>1
|
|
||||||
nshift = -nshift; // Negate the power for later de-normalization
|
|
||||||
|
|
||||||
A = (WebRtc_Word32)WEBRTC_SPL_LSHIFT_W32((WebRtc_Word32)x_norm, 16);
|
|
||||||
A = WEBRTC_SPL_ABS_W32(A); // A = abs(x_norm<<16)
|
|
||||||
A = WebRtcSpl_SqrtLocal(A); // A = sqrt(A)
|
|
||||||
|
|
||||||
if ((-2 * nshift) == sh)
|
|
||||||
{ // Even shift value case
|
|
||||||
|
|
||||||
t16 = (WebRtc_Word16)WEBRTC_SPL_RSHIFT_W32(A, 16); // t16 = AH
|
|
||||||
|
|
||||||
A = WEBRTC_SPL_MUL_16_16(k_sqrt_2, t16) * 2; // A = 1/sqrt(2)*t16
|
|
||||||
A = A + ((WebRtc_Word32)32768); // Round off
|
|
||||||
A = A & ((WebRtc_Word32)0x7fff0000); // Round off
|
|
||||||
|
|
||||||
A = WEBRTC_SPL_RSHIFT_W32(A, 15); // A = A>>16
|
|
||||||
|
|
||||||
} else
|
|
||||||
{
|
|
||||||
A = WEBRTC_SPL_RSHIFT_W32(A, 16); // A = A>>16
|
|
||||||
}
|
|
||||||
|
|
||||||
A = A & ((WebRtc_Word32)0x0000ffff);
|
|
||||||
A = (WebRtc_Word32)WEBRTC_SPL_SHIFT_W32(A, nshift); // De-normalize the result
|
|
||||||
|
|
||||||
return A;
|
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,93 @@
|
|||||||
|
@
|
||||||
|
@ Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
|
||||||
|
@
|
||||||
|
@ Use of this source code is governed by a BSD-style license
|
||||||
|
@ that can be found in the LICENSE file in the root of the source
|
||||||
|
@ tree. An additional intellectual property rights grant can be found
|
||||||
|
@ in the file PATENTS. All contributing project authors may
|
||||||
|
@ be found in the AUTHORS file in the root of the source tree.
|
||||||
|
|
||||||
|
@ sqrt() routine. 3 cycles/bit, total 51 cycles.
|
||||||
|
@ IN : r0 32 bit unsigned integer
|
||||||
|
@ OUT: r0 = INT (SQRT (r0)), precision is 16 bits
|
||||||
|
@ TMP: r1, r2
|
||||||
|
|
||||||
|
.global WebRtcSpl_Sqrt
|
||||||
|
|
||||||
|
.align 2
|
||||||
|
.section .text.WebRtcSpl_Sqrt:
|
||||||
|
WebRtcSpl_Sqrt:
|
||||||
|
.fnstart
|
||||||
|
|
||||||
|
MOV r1, #3 << 30
|
||||||
|
MOV r2, #1 << 30
|
||||||
|
|
||||||
|
@ unroll for i = 0 .. 15
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 0
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 0
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 1
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 1
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 2
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 2
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 3
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 3
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 4
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 4
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 5
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 5
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 6
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 6
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 7
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 7
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 8
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 8
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 9
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 9
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 10
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 10
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 11
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 11
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 12
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 12
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 13
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 13
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 14
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 14
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
CMP r0, r2, ROR #2 * 15
|
||||||
|
SUBHS r0, r0, r2, ROR #2 * 15
|
||||||
|
ADC r2, r1, r2, LSL #1
|
||||||
|
|
||||||
|
BIC r0, r2, #3 << 30 @ for rounding add: CMP r0, r2 ADC r2, #1
|
||||||
|
|
||||||
|
.fnend
|
Loading…
x
Reference in New Issue
Block a user