/*
 *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/*
 * lattice.c
 *
 * Contains the normalized lattice filter routines (MA and AR) for iSAC codec
 *
 */

#include "codec.h"
#include "settings.h"

/*
 * Normalized lattice MA filter.
 *
 * Processes SUBFRAMES consecutive sub-frames of HALF_SUBFRAMELEN samples
 * each. For every sub-frame u the routine
 *   1. reads orderCoef coefficients filt_coefQ15[u*orderCoef ..] and uses
 *      them as the "sth" terms; the matching "cth" terms are obtained from
 *      WebRtcSpl_SqrtOfOneMinusXSquared(),
 *   2. forms a gain from gain_lo_hiQ17[2*u + lo_hi] multiplied by all cth
 *      terms,
 *   3. runs the lattice recursion stage by stage over the sub-frame, and
 *   4. writes the gain-scaled final-stage f signal to lat_outQ9.
 *
 * Parameters (Q formats as given by the parameter names):
 *   orderCoef     - number of lattice stages. The local coefficient arrays
 *                   hold MAX_AR_MODEL_ORDER entries, so the caller must not
 *                   pass a larger value (not checked here).
 *   stateGQ15     - in/out filter state; entries [0 .. orderCoef-1] are
 *                   read, entries [0 .. orderCoef] are written back after
 *                   each sub-frame.
 *   lat_inQ0      - input, SUBFRAMES*HALF_SUBFRAMELEN samples are read.
 *   filt_coefQ15  - SUBFRAMES*orderCoef lattice coefficients.
 *   gain_lo_hiQ17 - gains, indexed by 2*u + lo_hi.
 *   lo_hi         - offset into the interleaved gain array (presumably
 *                   0 = lower band, 1 = upper band; confirm with caller).
 *   lat_outQ9     - output, SUBFRAMES*HALF_SUBFRAMELEN samples are written.
 */
void WebRtcIsacfix_NormLatticeFilterMa(WebRtc_Word16 orderCoef,
                                       WebRtc_Word32 *stateGQ15,
                                       WebRtc_Word16 *lat_inQ0,
                                       WebRtc_Word16 *filt_coefQ15,
                                       WebRtc_Word32 *gain_lo_hiQ17,
                                       WebRtc_Word16 lo_hi,
                                       WebRtc_Word16 *lat_outQ9)
{
  WebRtc_Word16 sthQ15[MAX_AR_MODEL_ORDER];
  WebRtc_Word16 cthQ15[MAX_AR_MODEL_ORDER];

  int u, i, ii, k, n;
  WebRtc_Word16 temp2,temp3;
  WebRtc_Word16 ord_1 = orderCoef+1;
  WebRtc_Word32 inv_cthQ16[MAX_AR_MODEL_ORDER];

  WebRtc_Word32 gain32, fQtmp;
  WebRtc_Word16 gain16;
  WebRtc_Word16 gain_sh;

  WebRtc_Word32 tmp32, tmp32b;
  WebRtc_Word32 fQ15vec[HALF_SUBFRAMELEN];
  WebRtc_Word32 gQ15[MAX_AR_MODEL_ORDER+1][HALF_SUBFRAMELEN];
  WebRtc_Word16 sh;
  WebRtc_Word16 t16a;
  WebRtc_Word16 t16b;

#define LATTICE_MUL_32_32_RSFT16(a32a, a32b, b32)                  \
  ((WebRtc_Word32)(WEBRTC_SPL_MUL(a32a, b32) + (WEBRTC_SPL_MUL_16_32_RSFT16(a32b, b32))))
  /* This macro is FORBIDDEN to use elsewhere than in two places in this file
     since it might give unpredictable results, since a general WebRtc_Word32*WebRtc_Word32
     multiplication results in a 64 bit value. The result is then shifted just
     16 steps to the right, giving need for 48 bits, i.e. in the general case,
     it will NOT fit in a WebRtc_Word32. In the cases used in here, the WebRtc_Word32 will be
     enough, since (FOR SOME REASON!!!) the involved multiplicands aren't big
     enough to overflow a WebRtc_Word32 after shifting right 16 bits. I have compared
     the result of a multiplication between t32 and tmp32, done in two ways:

     1) Using (WebRtc_Word32) (((float)(tmp32))*((float)(tmp32b))/65536.0);

     2) Using LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);

     By running 25 files, I haven't found any bigger diff than 64 - this was in the
     case when  method 1) gave 650235648 and 2) gave 650235712.

     It might be good to investigate this further, in order to PROVE why it seems to
     work without any problems. This might be done, by using the properties of
     all reflection coefficients etc.

  */

  for (u=0;u<SUBFRAMES;u++)
  {
    /* index of the first input/output sample of this sub-frame */
    const WebRtc_Word32 frame_start = WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN);
    WebRtc_Word32 inQ15;

    /* set the Direct Form coefficients */
    temp2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(u, orderCoef);
    temp3 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(2, u)+lo_hi;

    /* compute lattice filter coefficients */
    for (ii=0; ii<orderCoef; ii++) {
      sthQ15[ii] = filt_coefQ15[temp2+ii];
    }

    WebRtcSpl_SqrtOfOneMinusXSquared(sthQ15, orderCoef, cthQ15);

    /* compute the gain */
    gain32 = gain_lo_hiQ17[temp3];
    gain_sh = WebRtcSpl_NormW32(gain32);
    gain32 = WEBRTC_SPL_LSHIFT_W32(gain32, gain_sh); //Q(17+gain_sh)

    for (k=0;k<orderCoef;k++)
    {
      gain32 = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[k], gain32); //Q15*Q(17+gain_sh)>>15 = Q(17+gain_sh)
      inv_cthQ16[k] = WebRtcSpl_DivW32W16((WebRtc_Word32)2147483647, cthQ15[k]); // 1/cth[k] in Q31/Q15 = Q16
    }
    gain16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(gain32, 16); //Q(1+gain_sh)

    /* number of shifts needed to bring the gain-scaled output to Q9;
       constant for the whole sub-frame */
    sh = 9-gain_sh;

    /* normalized lattice filter */
    /*****************************/

    /* initial conditions */
    for (i=0;i<HALF_SUBFRAMELEN;i++)
    {
      /* Q0 -> Q15. Multiply instead of left-shifting: the sample may be
         negative, and << on a negative signed value is undefined behaviour
         in C. A 16-bit sample times 2^15 always fits in 32 bits. */
      inQ15 = (WebRtc_Word32)lat_inQ0[i + frame_start] * (1 << 15); //Q15
      fQ15vec[i] = inQ15;
      gQ15[0][i] = inQ15;
    }


    fQtmp = fQ15vec[0];

    /* get the state of f&g for the first input, for all orders */
    for (i=1;i<ord_1;i++)
    {
      // Calculate f[i][0] = inv_cth[i-1]*(f[i-1][0] + sth[i-1]*stateG[i-1]);
      tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[i-1], stateGQ15[i-1]);//Q15*Q15>>15 = Q15
      tmp32b= fQtmp + tmp32; //Q15+Q15=Q15
      tmp32 = inv_cthQ16[i-1]; //Q16
      /* split the Q16 inverse into a high and a (signed) low 16-bit half
         so that the 32x32 product can be built from 16x32 products */
      t16a = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32, 16);
      t16b = (WebRtc_Word16) (tmp32-WEBRTC_SPL_LSHIFT_W32(((WebRtc_Word32)t16a), 16));
      if (t16b<0) t16a++;
      tmp32 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
      fQtmp = tmp32; // Q15

      // Calculate g[i][0] = cth[i-1]*stateG[i-1] + sth[i-1]* f[i][0];
      tmp32  = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[i-1], stateGQ15[i-1]); //Q15*Q15>>15 = Q15
      tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[i-1], fQtmp); //Q15*Q15>>15 = Q15
      tmp32  = tmp32 + tmp32b;//Q15+Q15 = Q15
      gQ15[i][0] = tmp32; // Q15
    }

    /* filtering: one lattice stage at a time over the rest of the sub-frame.
       fQ15vec[] is updated in place, so after stage k it holds f[k+1][.] */
    for(k=0;k<orderCoef;k++)
    {
      for(n=0;n<HALF_SUBFRAMELEN-1;n++)
      {
        // Calculate f[k+1][n+1] = inv_cth[k]*(f[k][n+1] + sth[k]*g[k][n]);
        tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[k], gQ15[k][n]);//Q15*Q15>>15 = Q15
        tmp32b= fQ15vec[n+1] + tmp32; //Q15+Q15=Q15
        tmp32 = inv_cthQ16[k]; //Q16
        t16a = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(tmp32, 16);
        t16b = (WebRtc_Word16) (tmp32-WEBRTC_SPL_LSHIFT_W32(((WebRtc_Word32)t16a), 16));
        if (t16b<0) t16a++;
        tmp32 = LATTICE_MUL_32_32_RSFT16(t16a, t16b, tmp32b);
        fQ15vec[n+1] = tmp32; // Q15

        // Calculate g[k+1][n+1] = cth[k]*g[k][n] + sth[k]* f[k+1][n+1];
        tmp32  = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[k], gQ15[k][n]); //Q15*Q15>>15 = Q15
        tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(sthQ15[k], fQ15vec[n+1]); //Q15*Q15>>15 = Q15
        tmp32  = tmp32 + tmp32b;//Q15+Q15 = Q15
        gQ15[k+1][n+1] = tmp32; // Q15
      }
    }

    /* the first sample of the final stage was computed separately above */
    fQ15vec[0] = fQtmp;

    /* apply the gain and convert to Q9 */
    for(n=0;n<HALF_SUBFRAMELEN;n++)
    {
      tmp32 = WEBRTC_SPL_MUL_16_32_RSFT16(gain16, fQ15vec[n]); //Q(1+gain_sh)*Q15>>16 = Q(gain_sh)
      t16a = (WebRtc_Word16) WEBRTC_SPL_SHIFT_W32(tmp32, sh);
      lat_outQ9[n + frame_start] = t16a;
    }

    /* save the states: last g sample of every stage */
    for (i=0;i<ord_1;i++)
    {
      stateGQ15[i] = gQ15[i][HALF_SUBFRAMELEN-1];
    }
    //process next frame
  }

  return;
}


/* ----------------AR filter-------------------------*/
/*
 * Normalized lattice AR filter.
 *
 * Processes SUBFRAMES consecutive sub-frames of HALF_SUBFRAMELEN samples
 * each. For every sub-frame u the routine
 *   1. reads orderCoef coefficients filt_coefQ15[u*orderCoef ..] and uses
 *      them as the "sth" terms; the matching "cth" terms are obtained from
 *      WebRtcSpl_SqrtOfOneMinusXSquared(),
 *   2. forms a gain from gain_lo_hiQ17[2*u + lo_hi] multiplied by all cth
 *      terms and multiplies the input by the inverse of that gain,
 *   3. runs the lattice recursion sample by sample, from the highest stage
 *      down to stage 0, saturating every intermediate value to 16 bits, and
 *   4. writes the result to lat_outQ0.
 *
 * Parameters (Q formats as given by the parameter names):
 *   orderCoef     - number of lattice stages. The local coefficient arrays
 *                   hold MAX_AR_MODEL_ORDER entries, so the caller must not
 *                   pass a larger value (not checked here).
 *   stateGQ0      - in/out filter state; entries [0 .. orderCoef-1] are
 *                   read, entries [0 .. orderCoef] are written back after
 *                   each sub-frame.
 *   lat_inQ25     - input, SUBFRAMES*HALF_SUBFRAMELEN samples are read.
 *   filt_coefQ15  - SUBFRAMES*orderCoef lattice coefficients.
 *   gain_lo_hiQ17 - gains, indexed by 2*u + lo_hi.
 *   lo_hi         - offset into the interleaved gain array (presumably
 *                   0 = lower band, 1 = upper band; confirm with caller).
 *   lat_outQ0     - output, SUBFRAMES*HALF_SUBFRAMELEN samples are written.
 */
void WebRtcIsacfix_NormLatticeFilterAr(WebRtc_Word16 orderCoef,
                                       WebRtc_Word16 *stateGQ0,
                                       WebRtc_Word32 *lat_inQ25,
                                       WebRtc_Word16 *filt_coefQ15,
                                       WebRtc_Word32 *gain_lo_hiQ17,
                                       WebRtc_Word16 lo_hi,
                                       WebRtc_Word16 *lat_outQ0)
{
  int ii,n,k,i,u;
  WebRtc_Word16 sthQ15[MAX_AR_MODEL_ORDER];
  WebRtc_Word16 cthQ15[MAX_AR_MODEL_ORDER];
  WebRtc_Word32 tmp32, tmp32_2;


  WebRtc_Word16 tmpAR;
  WebRtc_Word16 ARfQ0vec[HALF_SUBFRAMELEN];
  WebRtc_Word16 ARgQ0vec[MAX_AR_MODEL_ORDER+1];

  WebRtc_Word32 inv_gain32;
  WebRtc_Word16 inv_gain16;
  WebRtc_Word16 den16;
  WebRtc_Word16 sh;

  WebRtc_Word16 temp2,temp3;
  WebRtc_Word16 ord_1 = orderCoef+1;

  for (u=0;u<SUBFRAMES;u++)
  {
    /* index of the first input/output sample of this sub-frame */
    const WebRtc_Word32 frame_start = WEBRTC_SPL_MUL_16_16(u, HALF_SUBFRAMELEN);

    //set the denominator and numerator of the Direct Form
    temp2 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(u, orderCoef);
    temp3 = (WebRtc_Word16)WEBRTC_SPL_MUL_16_16(2, u) + lo_hi;

    for (ii=0; ii<orderCoef; ii++) {
      sthQ15[ii] = filt_coefQ15[temp2+ii];
    }

    WebRtcSpl_SqrtOfOneMinusXSquared(sthQ15, orderCoef, cthQ15);

    /* Simulation of the 25 files shows that maximum value in
       the vector gain_lo_hiQ17[] is 441344, which means that
       it is log2((2^31)/441344) = 12.2 shifting bits from
       saturation. Therefore, it should be safe to use Q27 instead
       of Q17. */

    tmp32 = WEBRTC_SPL_LSHIFT_W32(gain_lo_hiQ17[temp3], 10); // Q27

    for (k=0;k<orderCoef;k++) {
      tmp32 = WEBRTC_SPL_MUL_16_32_RSFT15(cthQ15[k], tmp32); // Q15*Q27>>15 = Q27
    }

    sh = WebRtcSpl_NormW32(tmp32); // tmp32 is the gain
    den16 = (WebRtc_Word16) WEBRTC_SPL_SHIFT_W32(tmp32, sh-16); //Q(27+sh-16) = Q(sh+11) (all 16 bits are value bits)
    inv_gain32 = WebRtcSpl_DivW32W16((WebRtc_Word32)2147483647, den16); // 1/gain in Q31/Q(sh+11) = Q(20-sh)

    //initial conditions
    inv_gain16 = (WebRtc_Word16) WEBRTC_SPL_RSHIFT_W32(inv_gain32, 2); // 1/gain in Q(20-sh-2) = Q(18-sh)

    for (i=0;i<HALF_SUBFRAMELEN;i++)
    {
      /* Q25 -> Q26. Multiply instead of left-shifting: the sample may be
         negative, and << on a negative signed value is undefined behaviour
         in C. */
      tmp32 = lat_inQ25[i + frame_start] * 2; //Q25->Q26
      tmp32 = WEBRTC_SPL_MUL_16_32_RSFT16(inv_gain16, tmp32); //lat_in[]*inv_gain in (Q(18-sh)*Q26)>>16 = Q(28-sh)
      tmp32 = WEBRTC_SPL_SHIFT_W32(tmp32, -(28-sh)); // lat_in[]*inv_gain in Q0

      ARfQ0vec[i] = (WebRtc_Word16) WEBRTC_SPL_SAT(32767, tmp32, -32768); // Q0
    }

    for (i=orderCoef-1;i>=0;i--) //get the state of f&g for the first input, for all orders
    {
      /* f = cth*f - sth*stateG, rounded (+16384) and scaled back by 2^15 */
      tmp32 = WEBRTC_SPL_RSHIFT_W32(((WEBRTC_SPL_MUL_16_16(cthQ15[i],ARfQ0vec[0])) - (WEBRTC_SPL_MUL_16_16(sthQ15[i],stateGQ0[i])) + 16384), 15);
      tmpAR = (WebRtc_Word16) WEBRTC_SPL_SAT(32767, tmp32, -32768); // Q0

      /* g[i+1] = sth*f + cth*stateG, using f from before the update above */
      tmp32 = WEBRTC_SPL_RSHIFT_W32(((WEBRTC_SPL_MUL_16_16(sthQ15[i],ARfQ0vec[0])) + (WEBRTC_SPL_MUL_16_16(cthQ15[i], stateGQ0[i])) + 16384), 15);
      ARgQ0vec[i+1] = (WebRtc_Word16) WEBRTC_SPL_SAT(32767, tmp32, -32768); // Q0
      ARfQ0vec[0] = tmpAR;
    }
    ARgQ0vec[0] = ARfQ0vec[0];

    /* remaining samples: ARgQ0vec[] carries the g values of the previous
       sample and is overwritten stage by stage */
    for(n=0;n<HALF_SUBFRAMELEN-1;n++)
    {
      tmpAR = ARfQ0vec[n+1];
      for(k=orderCoef-1;k>=0;k--)
      {
        /* both results are computed from the old tmpAR before it is updated */
        tmp32 = WEBRTC_SPL_RSHIFT_W32(((WEBRTC_SPL_MUL_16_16(cthQ15[k], tmpAR)) - (WEBRTC_SPL_MUL_16_16(sthQ15[k], ARgQ0vec[k])) + 16384), 15);
        tmp32_2 = WEBRTC_SPL_RSHIFT_W32(((WEBRTC_SPL_MUL_16_16(sthQ15[k], tmpAR)) + (WEBRTC_SPL_MUL_16_16(cthQ15[k], ARgQ0vec[k])) + 16384), 15);
        tmpAR   = (WebRtc_Word16) WEBRTC_SPL_SAT(32767, tmp32, -32768); // Q0

        ARgQ0vec[k+1] = (WebRtc_Word16) WEBRTC_SPL_SAT(32767, tmp32_2, -32768); // Q0

      }
      ARfQ0vec[n+1] = tmpAR;
      ARgQ0vec[0] = tmpAR;
    }

    for(n=0;n<HALF_SUBFRAMELEN;n++)
    {
      lat_outQ0[n + frame_start] = ARfQ0vec[n];
    }


    /* cannot use memcpy in the following */

    for (i=0;i<ord_1;i++)
    {
      stateGQ0[i] = ARgQ0vec[i];
    }
  }

  return;
}