//*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include <cstdio>
#include <iostream>
#include <fstream>

using namespace std;

class CSMatrixGenerator {
public:
   typedef enum { PDT_GAUSS=1, PDT_BERNOULLI, PDT_DBFRIENDLY } PHI_DISTR_TYPE;
   ~CSMatrixGenerator();
   static float* getCSMatrix(int m, int n, PHI_DISTR_TYPE dt);     // do NOT free returned pointer


private:
   static float *cs_phi_;    // matrix for compressive sensing
   static int cs_phi_m_, cs_phi_n_;
};

float* CSMatrixGenerator::getCSMatrix(int m, int n, PHI_DISTR_TYPE dt)
{
   assert(m <= n);

   if (cs_phi_m_!=m || cs_phi_n_!=n || cs_phi_==NULL) {
      if (cs_phi_) delete [] cs_phi_;
      cs_phi_ = new float[m*n];
   }

   #if 0 // debug - load the random matrix from a file (for reproducability of results)
      //assert(m == 176);
      //assert(n == 500);
      //const char *phi = "/u/calonder/temp/dim_red/kpca_phi.txt";
      const char *phi = "/u/calonder/temp/dim_red/debug_phi.txt";
      std::ifstream ifs(phi);
      for (size_t i=0; i<m*n; ++i) {
         if (!ifs.good()) {
            printf("[ERROR] RandomizedTree::makeRandomMeasMatrix: problem reading '%s'\n", phi);
            exit(0);
         }
         ifs >> cs_phi[i];
      }
      ifs.close();

      static bool warned=false;
      if (!warned) {
         printf("[NOTE] RT: reading %ix%i PHI matrix from '%s'...\n", m, n, phi);
         warned=true;
      }

      return;
   #endif

   float *cs_phi = cs_phi_;

   if (m == n) {
      // special case - set to 0 for safety
      memset(cs_phi, 0, m*n*sizeof(float));
      printf("[WARNING] %s:%i: square CS matrix (-> no reduction)\n", __FILE__, __LINE__);
   }
   else {
       cv::RNG rng(23);

      // par is distr param, cf 'Favorable JL Distributions' (Baraniuk et al, 2006)
      if (dt == PDT_GAUSS) {
         float par = (float)(1./m);
         for (int i=0; i<m*n; ++i)
            *cs_phi++ = (float)rng.gaussian(par);
      }
      else if (dt == PDT_BERNOULLI) {
         float par = (float)(1./sqrt((float)m));
         for (int i=0; i<m*n; ++i)
            *cs_phi++ = (rng(2)==0 ? par : -par);
      }
      else if (dt == PDT_DBFRIENDLY) {
         float par = (float)sqrt(3./m);
         for (int i=0; i<m*n; ++i) {
            int r = rng(6);
            *cs_phi++ = (r==0 ? par : (r==1 ? -par : 0.f));
         }
      }
      else
         throw("PHI_DISTR_TYPE not implemented.");
   }

   return cs_phi_;
}

CSMatrixGenerator::~CSMatrixGenerator()
{
   if (cs_phi_) delete [] cs_phi_;
   cs_phi_ = NULL;
}

float *CSMatrixGenerator::cs_phi_   = NULL;
int    CSMatrixGenerator::cs_phi_m_ = 0;
int    CSMatrixGenerator::cs_phi_n_ = 0;


inline void addVec(int size, const float* src1, const float* src2, float* dst)
{
  while(--size >= 0) {
    *dst = *src1 + *src2;
    ++dst; ++src1; ++src2;
  }
}


// sum up 50 byte vectors of length 176
// assume 4 bits max for input vector values
// final shift is 2 bits right
// temp buffer should be twice as long as signature
// sig and buffer need not be initialized
inline void sum_50t_176c(uchar **pp, uchar *sig, unsigned short *temp)
{
#if CV_SSE2
  __m128i acc, *acc1, *acc2, *acc3, *acc4, tzero;
  __m128i *ssig, *ttemp;

  ssig = (__m128i *)sig;
  ttemp = (__m128i *)temp;

  // empty ttemp[]
  tzero = _mm_set_epi32(0, 0, 0, 0);
  for (int i=0; i<22; i++)
    ttemp[i] = tzero;

  for (int j=0; j<48; j+=16)
    {
      // empty ssig[]
      for (int i=0; i<11; i++)
    ssig[i] = tzero;

      for (int i=j; i<j+16; i+=4) // 4 columns at a time, to 16
    {
      acc1 = (__m128i *)pp[i];
      acc2 = (__m128i *)pp[i+1];
      acc3 = (__m128i *)pp[i+2];
      acc4 = (__m128i *)pp[i+3];

      // add next four columns
      acc = _mm_adds_epu8(acc1[0],acc2[0]);
      acc = _mm_adds_epu8(acc,acc3[0]);
      acc = _mm_adds_epu8(acc,acc4[1]);
      ssig[0] = _mm_adds_epu8(acc,ssig[0]);
      // add four columns
      acc = _mm_adds_epu8(acc1[1],acc2[1]);
      acc = _mm_adds_epu8(acc,acc3[1]);
      acc = _mm_adds_epu8(acc,acc4[1]);
      ssig[1] = _mm_adds_epu8(acc,ssig[1]);
      // add four columns
      acc = _mm_adds_epu8(acc1[2],acc2[2]);
      acc = _mm_adds_epu8(acc,acc3[2]);
      acc = _mm_adds_epu8(acc,acc4[2]);
      ssig[2] = _mm_adds_epu8(acc,ssig[2]);
      // add four columns
      acc = _mm_adds_epu8(acc1[3],acc2[3]);
      acc = _mm_adds_epu8(acc,acc3[3]);
      acc = _mm_adds_epu8(acc,acc4[3]);
      ssig[3] = _mm_adds_epu8(acc,ssig[3]);
      // add four columns
      acc = _mm_adds_epu8(acc1[4],acc2[4]);
      acc = _mm_adds_epu8(acc,acc3[4]);
      acc = _mm_adds_epu8(acc,acc4[4]);
      ssig[4] = _mm_adds_epu8(acc,ssig[4]);
      // add four columns
      acc = _mm_adds_epu8(acc1[5],acc2[5]);
      acc = _mm_adds_epu8(acc,acc3[5]);
      acc = _mm_adds_epu8(acc,acc4[5]);
      ssig[5] = _mm_adds_epu8(acc,ssig[5]);
      // add four columns
      acc = _mm_adds_epu8(acc1[6],acc2[6]);
      acc = _mm_adds_epu8(acc,acc3[6]);
      acc = _mm_adds_epu8(acc,acc4[6]);
      ssig[6] = _mm_adds_epu8(acc,ssig[6]);
      // add four columns
      acc = _mm_adds_epu8(acc1[7],acc2[7]);
      acc = _mm_adds_epu8(acc,acc3[7]);
      acc = _mm_adds_epu8(acc,acc4[7]);
      ssig[7] = _mm_adds_epu8(acc,ssig[7]);
      // add four columns
      acc = _mm_adds_epu8(acc1[8],acc2[8]);
      acc = _mm_adds_epu8(acc,acc3[8]);
      acc = _mm_adds_epu8(acc,acc4[8]);
      ssig[8] = _mm_adds_epu8(acc,ssig[8]);
      // add four columns
      acc = _mm_adds_epu8(acc1[9],acc2[9]);
      acc = _mm_adds_epu8(acc,acc3[9]);
      acc = _mm_adds_epu8(acc,acc4[9]);
      ssig[9] = _mm_adds_epu8(acc,ssig[9]);
      // add four columns
      acc = _mm_adds_epu8(acc1[10],acc2[10]);
      acc = _mm_adds_epu8(acc,acc3[10]);
      acc = _mm_adds_epu8(acc,acc4[10]);
      ssig[10] = _mm_adds_epu8(acc,ssig[10]);
    }

      // unpack to ttemp buffer and add
      ttemp[0] = _mm_add_epi16(_mm_unpacklo_epi8(ssig[0],tzero),ttemp[0]);
      ttemp[1] = _mm_add_epi16(_mm_unpackhi_epi8(ssig[0],tzero),ttemp[1]);
      ttemp[2] = _mm_add_epi16(_mm_unpacklo_epi8(ssig[1],tzero),ttemp[2]);
      ttemp[3] = _mm_add_epi16(_mm_unpackhi_epi8(ssig[1],tzero),ttemp[3]);
      ttemp[4] = _mm_add_epi16(_mm_unpacklo_epi8(ssig[2],tzero),ttemp[4]);
      ttemp[5] = _mm_add_epi16(_mm_unpackhi_epi8(ssig[2],tzero),ttemp[5]);
      ttemp[6] = _mm_add_epi16(_mm_unpacklo_epi8(ssig[3],tzero),ttemp[6]);
      ttemp[7] = _mm_add_epi16(_mm_unpackhi_epi8(ssig[3],tzero),ttemp[7]);
      ttemp[8] = _mm_add_epi16(_mm_unpacklo_epi8(ssig[4],tzero),ttemp[8]);
      ttemp[9] = _mm_add_epi16(_mm_unpackhi_epi8(ssig[4],tzero),ttemp[9]);
      ttemp[10] = _mm_add_epi16(_mm_unpacklo_epi8(ssig[5],tzero),ttemp[10]);
      ttemp[11] = _mm_add_epi16(_mm_unpackhi_epi8(ssig[5],tzero),ttemp[11]);
      ttemp[12] = _mm_add_epi16(_mm_unpacklo_epi8(ssig[6],tzero),ttemp[12]);
      ttemp[13] = _mm_add_epi16(_mm_unpackhi_epi8(ssig[6],tzero),ttemp[13]);
      ttemp[14] = _mm_add_epi16(_mm_unpacklo_epi8(ssig[7],tzero),ttemp[14]);
      ttemp[15] = _mm_add_epi16(_mm_unpackhi_epi8(ssig[7],tzero),ttemp[15]);
      ttemp[16] = _mm_add_epi16(_mm_unpacklo_epi8(ssig[8],tzero),ttemp[16]);
      ttemp[17] = _mm_add_epi16(_mm_unpackhi_epi8(ssig[8],tzero),ttemp[17]);
      ttemp[18] = _mm_add_epi16(_mm_unpacklo_epi8(ssig[9],tzero),ttemp[18]);
      ttemp[19] = _mm_add_epi16(_mm_unpackhi_epi8(ssig[9],tzero),ttemp[19]);
      ttemp[20] = _mm_add_epi16(_mm_unpacklo_epi8(ssig[10],tzero),ttemp[20]);
      ttemp[21] = _mm_add_epi16(_mm_unpackhi_epi8(ssig[10],tzero),ttemp[21]);
    }

  // create ssignature from 16-bit result
  ssig[0] =_mm_packus_epi16(_mm_srai_epi16(ttemp[0],2),_mm_srai_epi16(ttemp[1],2));
  ssig[1] =_mm_packus_epi16(_mm_srai_epi16(ttemp[2],2),_mm_srai_epi16(ttemp[3],2));
  ssig[2] =_mm_packus_epi16(_mm_srai_epi16(ttemp[4],2),_mm_srai_epi16(ttemp[5],2));
  ssig[3] =_mm_packus_epi16(_mm_srai_epi16(ttemp[6],2),_mm_srai_epi16(ttemp[7],2));
  ssig[4] =_mm_packus_epi16(_mm_srai_epi16(ttemp[8],2),_mm_srai_epi16(ttemp[9],2));
  ssig[5] =_mm_packus_epi16(_mm_srai_epi16(ttemp[10],2),_mm_srai_epi16(ttemp[11],2));
  ssig[6] =_mm_packus_epi16(_mm_srai_epi16(ttemp[12],2),_mm_srai_epi16(ttemp[13],2));
  ssig[7] =_mm_packus_epi16(_mm_srai_epi16(ttemp[14],2),_mm_srai_epi16(ttemp[15],2));
  ssig[8] =_mm_packus_epi16(_mm_srai_epi16(ttemp[16],2),_mm_srai_epi16(ttemp[17],2));
  ssig[9] =_mm_packus_epi16(_mm_srai_epi16(ttemp[18],2),_mm_srai_epi16(ttemp[19],2));
  ssig[10] =_mm_packus_epi16(_mm_srai_epi16(ttemp[20],2),_mm_srai_epi16(ttemp[21],2));
#else
  CV_Error( CV_StsNotImplemented, "Not supported without SSE2" );
#endif
}

namespace cv
{
RandomizedTree::RandomizedTree()
  : posteriors_(NULL), posteriors2_(NULL)
{
}

RandomizedTree::~RandomizedTree()
{
   freePosteriors(3);
}

void RandomizedTree::createNodes(int num_nodes, RNG &rng)
{
  nodes_.reserve(num_nodes);
  for (int i = 0; i < num_nodes; ++i) {
    nodes_.push_back( RTreeNode((uchar)rng(RandomizedTree::PATCH_SIZE),
                                (uchar)rng(RandomizedTree::PATCH_SIZE),
                                (uchar)rng(RandomizedTree::PATCH_SIZE),
                                (uchar)rng(RandomizedTree::PATCH_SIZE)) );
  }
}

int RandomizedTree::getIndex(uchar* patch_data) const
{
  int index = 0;
  for (int d = 0; d < depth_; ++d) {
    int child_offset = nodes_[index](patch_data);
    index = 2*index + 1 + child_offset;
  }
  return index - nodes_.size();
}

void RandomizedTree::train(std::vector<BaseKeypoint> const& base_set,
                           RNG &rng, int depth, int views, size_t reduced_num_dim,
                           int num_quant_bits)
{
  PatchGenerator make_patch;
  train(base_set, rng, make_patch, depth, views, reduced_num_dim, num_quant_bits);
}

void RandomizedTree::train(std::vector<BaseKeypoint> const& base_set,
                           RNG &rng, PatchGenerator &make_patch,
                           int depth, int views, size_t reduced_num_dim,
                           int num_quant_bits)
{
  init(base_set.size(), depth, rng);

  Mat patch;

  // Estimate posterior probabilities using random affine views
  std::vector<BaseKeypoint>::const_iterator keypt_it;
  int class_id = 0;
  Size patchSize(PATCH_SIZE, PATCH_SIZE);
  for (keypt_it = base_set.begin(); keypt_it != base_set.end(); ++keypt_it, ++class_id) {
    for (int i = 0; i < views; ++i) {
      make_patch( Mat(keypt_it->image), Point(keypt_it->y, keypt_it->x ), patch, patchSize, rng );
      IplImage iplPatch = patch;
      addExample(class_id, getData(&iplPatch));
    }
  }

  finalize(reduced_num_dim, num_quant_bits);
}

void RandomizedTree::allocPosteriorsAligned(int num_leaves, int num_classes)
{
  freePosteriors(3);

  posteriors_ = new float*[num_leaves]; //(float**) malloc(num_leaves*sizeof(float*));
  for (int i=0; i<num_leaves; ++i) {
    posteriors_[i] = (float*)cvAlloc(num_classes*sizeof(posteriors_[i][0]));
    memset(posteriors_[i], 0, num_classes*sizeof(float));
  }

  posteriors2_ = new uchar*[num_leaves];
  for (int i=0; i<num_leaves; ++i) {
    posteriors2_[i] = (uchar*)cvAlloc(num_classes*sizeof(posteriors2_[i][0]));
    memset(posteriors2_[i], 0, num_classes*sizeof(uchar));
  }

  classes_ = num_classes;
}

void RandomizedTree::freePosteriors(int which)
{
   if (posteriors_ && (which&1)) {
      for (int i=0; i<num_leaves_; ++i)
         if (posteriors_[i])
            cvFree( &posteriors_[i] );
      delete [] posteriors_;
      posteriors_ = NULL;
   }

   if (posteriors2_ && (which&2)) {
      for (int i=0; i<num_leaves_; ++i)
         cvFree( &posteriors2_[i] );
      delete [] posteriors2_;
      posteriors2_ = NULL;
   }

   classes_ = -1;
}

void RandomizedTree::init(int num_classes, int depth, RNG &rng)
{
  depth_ = depth;
  num_leaves_ = 1 << depth;        // 2**d
  int num_nodes = num_leaves_ - 1; // 2**d - 1

  // Initialize probabilities and counts to 0
  allocPosteriorsAligned(num_leaves_, num_classes);      // will set classes_ correctly
  for (int i = 0; i < num_leaves_; ++i)
    memset((void*)posteriors_[i], 0, num_classes*sizeof(float));
  leaf_counts_.resize(num_leaves_);

  for (int i = 0; i < num_leaves_; ++i)
    memset((void*)posteriors2_[i], 0, num_classes*sizeof(uchar));

  createNodes(num_nodes, rng);
}

void RandomizedTree::addExample(int class_id, uchar* patch_data)
{
  int index = getIndex(patch_data);
  float* posterior = getPosteriorByIndex(index);
  ++leaf_counts_[index];
  ++posterior[class_id];
}

// returns the p% percentile of data (length n vector)
static float percentile(float *data, int n, float p)
{
   assert(n>0);
   assert(p>=0 && p<=1);
   std::vector<float> vec(data, data+n);
   sort(vec.begin(), vec.end());
   int ix = (int)(p*(n-1));
   return vec[ix];
}

void RandomizedTree::finalize(size_t reduced_num_dim, int num_quant_bits)
{
   // Normalize by number of patches to reach each leaf
   for (int index = 0; index < num_leaves_; ++index) {
      float* posterior = posteriors_[index];
      assert(posterior != NULL);
      int count = leaf_counts_[index];
      if (count != 0) {
         float normalizer = 1.0f / count;
         for (int c = 0; c < classes_; ++c) {
            *posterior *= normalizer;
            ++posterior;
         }
      }
   }
   leaf_counts_.clear();

   // apply compressive sensing
   if ((int)reduced_num_dim != classes_)
      compressLeaves(reduced_num_dim);
   else {
      static bool notified = false;
      if (!notified)
         printf("\n[OK] NO compression to leaves applied, dim=%i\n", (int)reduced_num_dim);
      notified = true;
   }

   // convert float-posteriors to char-posteriors (quantization step)
   makePosteriors2(num_quant_bits);
}

void RandomizedTree::compressLeaves(size_t reduced_num_dim)
{
   static bool warned = false;
   if (!warned) {
      printf("\n[OK] compressing leaves with phi %i x %i\n", (int)reduced_num_dim, (int)classes_);
      warned = true;
   }

   static bool warned2 = false;
   if ((int)reduced_num_dim == classes_) {
     if (!warned2)
       printf("[WARNING] RandomizedTree::compressLeaves:  not compressing because reduced_dim == classes()\n");
     warned2 = true;
     return;
   }

   // DO NOT FREE RETURNED POINTER
   float *cs_phi = CSMatrixGenerator::getCSMatrix(reduced_num_dim, classes_, CSMatrixGenerator::PDT_BERNOULLI);

   float *cs_posteriors = new float[num_leaves_ * reduced_num_dim];         // temp, num_leaves_ x reduced_num_dim
   for (int i=0; i<num_leaves_; ++i) {
      float *post = getPosteriorByIndex(i);
      float *prod = &cs_posteriors[i*reduced_num_dim];
      Mat A( reduced_num_dim, classes_, CV_32FC1, cs_phi );
      Mat X( classes_, 1, CV_32FC1, post );
      Mat Y( reduced_num_dim, 1, CV_32FC1, prod );
      Y = A*X;
   }

   // copy new posteriors
   freePosteriors(3);
   allocPosteriorsAligned(num_leaves_, reduced_num_dim);
   for (int i=0; i<num_leaves_; ++i)
      memcpy(posteriors_[i], &cs_posteriors[i*reduced_num_dim], reduced_num_dim*sizeof(float));
   classes_ = reduced_num_dim;

   delete [] cs_posteriors;
}

void RandomizedTree::makePosteriors2(int num_quant_bits)
{
   int N = (1<<num_quant_bits) - 1;

   float perc[2];
   estimateQuantPercForPosteriors(perc);

   assert(posteriors_ != NULL);
   for (int i=0; i<num_leaves_; ++i)
      quantizeVector(posteriors_[i], classes_, N, perc, posteriors2_[i]);

   // printf("makePosteriors2 quantization bounds: %.3e, %.3e (num_leaves=%i, N=%i)\n",
   //        perc[0], perc[1], num_leaves_, N);
}

void RandomizedTree::estimateQuantPercForPosteriors(float perc[2])
{
   // _estimate_ percentiles for this tree
   // TODO: do this more accurately
   assert(posteriors_ != NULL);
   perc[0] = perc[1] = .0f;
   for (int i=0; i<num_leaves_; i++) {
      perc[0] += percentile(posteriors_[i], classes_, GET_LOWER_QUANT_PERC());
      perc[1] += percentile(posteriors_[i], classes_, GET_UPPER_QUANT_PERC());
   }
   perc[0] /= num_leaves_;
   perc[1] /= num_leaves_;
}


float* RandomizedTree::getPosterior(uchar* patch_data)
{
  return const_cast<float*>(const_cast<const RandomizedTree*>(this)->getPosterior(patch_data));
}

const float* RandomizedTree::getPosterior(uchar* patch_data) const
{
  return getPosteriorByIndex( getIndex(patch_data) );
}

uchar* RandomizedTree::getPosterior2(uchar* patch_data)
{
  return const_cast<uchar*>(const_cast<const RandomizedTree*>(this)->getPosterior2(patch_data));
}

const uchar* RandomizedTree::getPosterior2(uchar* patch_data) const
{
  return getPosteriorByIndex2( getIndex(patch_data) );
}

void RandomizedTree::quantizeVector(float *vec, int dim, int N, float bnds[2], int clamp_mode)
{
   float map_bnd[2] = {0.f,(float)N};          // bounds of quantized target interval we're mapping to
   for (int k=0; k<dim; ++k, ++vec) {
      *vec = float(int((*vec - bnds[0])/(bnds[1] - bnds[0])*(map_bnd[1] - map_bnd[0]) + map_bnd[0]));
      // 0: clamp both, lower and upper values
      if (clamp_mode == 0)      *vec = (*vec<map_bnd[0]) ? map_bnd[0] : ((*vec>map_bnd[1]) ? map_bnd[1] : *vec);
      // 1: clamp lower values only
      else if (clamp_mode == 1) *vec = (*vec<map_bnd[0]) ? map_bnd[0] : *vec;
      // 2: clamp upper values only
      else if (clamp_mode == 2) *vec = (*vec>map_bnd[1]) ? map_bnd[1] : *vec;
      // 4: no clamping
      else if (clamp_mode == 4) ; // yep, nothing
      else {
         printf("clamp_mode == %i is not valid (%s:%i).\n", clamp_mode, __FILE__, __LINE__);
         exit(1);
      }
   }

}

void RandomizedTree::quantizeVector(float *vec, int dim, int N, float bnds[2], uchar *dst)
{
   int map_bnd[2] = {0, N};          // bounds of quantized target interval we're mapping to
   int tmp;
   for (int k=0; k<dim; ++k) {
      tmp = int((*vec - bnds[0])/(bnds[1] - bnds[0])*(map_bnd[1] - map_bnd[0]) + map_bnd[0]);
      *dst = (uchar)((tmp<0) ? 0 : ((tmp>N) ? N : tmp));
      ++vec;
      ++dst;
   }
}


void RandomizedTree::read(const char* file_name, int num_quant_bits)
{
  std::ifstream file(file_name, std::ifstream::binary);
  read(file, num_quant_bits);
  file.close();
}

void RandomizedTree::read(std::istream &is, int num_quant_bits)
{
  is.read((char*)(&classes_), sizeof(classes_));
  is.read((char*)(&depth_), sizeof(depth_));

  num_leaves_ = 1 << depth_;
  int num_nodes = num_leaves_ - 1;

  nodes_.resize(num_nodes);
  is.read((char*)(&nodes_[0]), num_nodes * sizeof(nodes_[0]));

  //posteriors_.resize(classes_ * num_leaves_);
  //freePosteriors(3);
  //printf("[DEBUG] reading: %i leaves, %i classes\n", num_leaves_, classes_);
  allocPosteriorsAligned(num_leaves_, classes_);
  for (int i=0; i<num_leaves_; i++)
    is.read((char*)posteriors_[i], classes_ * sizeof(*posteriors_[0]));

  // make char-posteriors from float-posteriors
  makePosteriors2(num_quant_bits);
}

void RandomizedTree::write(const char* file_name) const
{
  std::ofstream file(file_name, std::ofstream::binary);
  write(file);
  file.close();
}

void RandomizedTree::write(std::ostream &os) const
{
  if (!posteriors_) {
    printf("WARNING: Cannot write float posteriors (posteriors_ = NULL).\n");
    return;
  }

  os.write((char*)(&classes_), sizeof(classes_));
  os.write((char*)(&depth_), sizeof(depth_));

  os.write((char*)(&nodes_[0]), nodes_.size() * sizeof(nodes_[0]));
  for (int i=0; i<num_leaves_; i++) {
    os.write((char*)posteriors_[i], classes_ * sizeof(*posteriors_[0]));
  }
}


void RandomizedTree::savePosteriors(std::string url, bool append)
{
   std::ofstream file(url.c_str(), (append?std::ios::app:std::ios::out));
   for (int i=0; i<num_leaves_; i++) {
      float *post = posteriors_[i];
      char buf[20];
      for (int i=0; i<classes_; i++) {
         sprintf(buf, "%.10e", *post++);
         file << buf << ((i<classes_-1) ? " " : "");
      }
      file << std::endl;
   }
   file.close();
}

void RandomizedTree::savePosteriors2(std::string url, bool append)
{
   std::ofstream file(url.c_str(), (append?std::ios::app:std::ios::out));
   for (int i=0; i<num_leaves_; i++) {
      uchar *post = posteriors2_[i];
      for (int i=0; i<classes_; i++)
         file << int(*post++) << (i<classes_-1?" ":"");
      file << std::endl;
   }
   file.close();
}

////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

RTreeClassifier::RTreeClassifier()
  : classes_(0)
{
  posteriors_ = NULL;
}

void RTreeClassifier::train(std::vector<BaseKeypoint> const& base_set,
                            RNG &rng, int num_trees, int depth,
                            int views, size_t reduced_num_dim,
                            int num_quant_bits)
{
  PatchGenerator make_patch;
  train(base_set, rng, make_patch, num_trees, depth, views, reduced_num_dim, num_quant_bits);
}

// Single-threaded version of train(), with progress output
void RTreeClassifier::train(std::vector<BaseKeypoint> const& base_set,
                            RNG &rng, PatchGenerator &make_patch, int num_trees,
                            int depth, int views, size_t reduced_num_dim,
                            int num_quant_bits)
{
  if (reduced_num_dim > base_set.size()) {
    printf("INVALID PARAMS in RTreeClassifier::train: reduced_num_dim{%i} > base_set.size(){%i}\n",
           (int)reduced_num_dim, (int)base_set.size());
    return;
  }

  num_quant_bits_ = num_quant_bits;
  classes_ = reduced_num_dim; // base_set.size();
  original_num_classes_ = base_set.size();
  trees_.resize(num_trees);

  printf("[OK] Training trees: base size=%i, reduced size=%i\n", (int)base_set.size(), (int)reduced_num_dim);

  int count = 1;
  printf("[OK] Trained 0 / %i trees", num_trees);  fflush(stdout);
  for( int ti = 0; ti < num_trees; ti++ ) {
    trees_[ti].train(base_set, rng, make_patch, depth, views, reduced_num_dim, num_quant_bits_);
    printf("\r[OK] Trained %i / %i trees", count++, num_trees);
    fflush(stdout);
  }

  printf("\n");
  countZeroElements();
  printf("\n\n");
}

void RTreeClassifier::getSignature(IplImage* patch, float *sig) const
{
  // Need pointer to 32x32 patch data
  uchar buffer[RandomizedTree::PATCH_SIZE * RandomizedTree::PATCH_SIZE];
  uchar* patch_data;
  if (patch->widthStep != RandomizedTree::PATCH_SIZE) {
    //printf("[INFO] patch is padded, data will be copied (%i/%i).\n",
    //       patch->widthStep, RandomizedTree::PATCH_SIZE);
    uchar* data = getData(patch);
    patch_data = buffer;
    for (int i = 0; i < RandomizedTree::PATCH_SIZE; ++i) {
      memcpy((void*)patch_data, (void*)data, RandomizedTree::PATCH_SIZE);
      data += patch->widthStep;
      patch_data += RandomizedTree::PATCH_SIZE;
    }
    patch_data = buffer;
  }
  else {
    patch_data = getData(patch);
  }

  memset((void*)sig, 0, classes_ * sizeof(float));
  std::vector<RandomizedTree>::const_iterator tree_it;

  // get posteriors
  float **posteriors = new float*[trees_.size()];  // TODO: move alloc outside this func
  float **pp = posteriors;
  for (tree_it = trees_.begin(); tree_it != trees_.end(); ++tree_it, pp++) {
    *pp = const_cast<float*>(tree_it->getPosterior(patch_data));
    assert(*pp != NULL);
  }

  // sum them up
  pp = posteriors;
  for (tree_it = trees_.begin(); tree_it != trees_.end(); ++tree_it, pp++)
    addVec(classes_, sig, *pp, sig);

  delete [] posteriors;
  posteriors = NULL;

  // full quantization (experimental)
  #if 0
    int n_max = 1<<8 - 1;
    int sum_max = (1<<4 - 1)*trees_.size();
    int shift = 0;
    while ((sum_max>>shift) > n_max) shift++;

    for (int i = 0; i < classes_; ++i) {
      sig[i] = int(sig[i] + .5) >> shift;
      if (sig[i]>n_max) sig[i] = n_max;
    }

    static bool warned = false;
    if (!warned) {
      printf("[WARNING] Using full quantization (RTreeClassifier::getSignature)! shift=%i\n", shift);
      warned = true;
    }
  #else
    // TODO: get rid of this multiply (-> number of trees is known at train
    // time, exploit it in RandomizedTree::finalize())
    float normalizer = 1.0f / trees_.size();
    for (int i = 0; i < classes_; ++i)
      sig[i] *= normalizer;
  #endif
}

void RTreeClassifier::getSignature(IplImage* patch, uchar *sig) const
{
  // Need pointer to 32x32 patch data
  uchar buffer[RandomizedTree::PATCH_SIZE * RandomizedTree::PATCH_SIZE];
  uchar* patch_data;
  if (patch->widthStep != RandomizedTree::PATCH_SIZE) {
    //printf("[INFO] patch is padded, data will be copied (%i/%i).\n",
    //       patch->widthStep, RandomizedTree::PATCH_SIZE);
    uchar* data = getData(patch);
    patch_data = buffer;
    for (int i = 0; i < RandomizedTree::PATCH_SIZE; ++i) {
      memcpy((void*)patch_data, (void*)data, RandomizedTree::PATCH_SIZE);
      data += patch->widthStep;
      patch_data += RandomizedTree::PATCH_SIZE;
    }
    patch_data = buffer;
  } else {
    patch_data = getData(patch);
  }

  std::vector<RandomizedTree>::const_iterator tree_it;

  // get posteriors
  if (posteriors_ == NULL)
    {
      posteriors_ = (uchar**)cvAlloc( trees_.size()*sizeof(posteriors_[0]) );
      ptemp_ = (unsigned short*)cvAlloc( classes_*sizeof(ptemp_[0]) );
    }
  /// @todo What is going on in the next 4 lines?
  uchar **pp = posteriors_;
  for (tree_it = trees_.begin(); tree_it != trees_.end(); ++tree_it, pp++)
    *pp = const_cast<uchar*>(tree_it->getPosterior2(patch_data));
  pp = posteriors_;

#if 1
     // SSE2 optimized code
     sum_50t_176c(pp, sig, ptemp_);    // sum them up
#else
     static bool warned = false;

     memset((void*)sig, 0, classes_ * sizeof(sig[0]));
     unsigned short *sig16 = new unsigned short[classes_];           // TODO: make member, no alloc here
     memset((void*)sig16, 0, classes_ * sizeof(sig16[0]));
     for (tree_it = trees_.begin(); tree_it != trees_.end(); ++tree_it, pp++)
       addVec(classes_, sig16, *pp, sig16);

     // squeeze signatures into an uchar
     const bool full_shifting = true;
     int shift;
     if (full_shifting) {
        float num_add_bits_f = log((float)trees_.size())/log(2.f);     // # additional bits required due to summation
        int num_add_bits = int(num_add_bits_f);
        if (num_add_bits_f != float(num_add_bits)) ++num_add_bits;
        shift = num_quant_bits_ + num_add_bits - 8*sizeof(uchar);
//shift = num_quant_bits_ + num_add_bits - 2;
//shift = 6;
        if (shift>0)
          for (int i = 0; i < classes_; ++i)
            sig[i] = (sig16[i] >> shift);      // &3 cut off all but lowest 2 bits, 3(dec) = 11(bin)

        if (!warned)
           printf("[OK] RTC: quantizing by FULL RIGHT SHIFT, shift = %i\n", shift);
     }
     else {
        printf("[ERROR] RTC: not implemented!\n");
        exit(0);
     }

     if (!warned)
        printf("[WARNING] RTC: unoptimized signature computation\n");
     warned = true;
#endif
}


void RTreeClassifier::getSparseSignature(IplImage *patch, float *sig, float thresh) const
{
   getFloatSignature(patch, sig);
   for (int i=0; i<classes_; ++i, sig++)
      if (*sig < thresh) *sig = 0.f;
}

int RTreeClassifier::countNonZeroElements(float *vec, int n, double tol)
{
   int res = 0;
   while (n-- > 0)
      res += (fabs(*vec++) > tol);
   return res;
}

void RTreeClassifier::read(const char* file_name)
{
  std::ifstream file(file_name, std::ifstream::binary);
  read(file);
  file.close();
}

void RTreeClassifier::read(std::istream &is)
{
  int num_trees = 0;
  is.read((char*)(&num_trees), sizeof(num_trees));
  is.read((char*)(&classes_), sizeof(classes_));
  is.read((char*)(&original_num_classes_), sizeof(original_num_classes_));
  is.read((char*)(&num_quant_bits_), sizeof(num_quant_bits_));

  if (num_quant_bits_<1 || num_quant_bits_>8) {
    printf("[WARNING] RTC: suspicious value num_quant_bits_=%i found; setting to %i.\n",
           num_quant_bits_, (int)DEFAULT_NUM_QUANT_BITS);
    num_quant_bits_ = DEFAULT_NUM_QUANT_BITS;
  }

  trees_.resize(num_trees);
  std::vector<RandomizedTree>::iterator tree_it;

  for (tree_it = trees_.begin(); tree_it != trees_.end(); ++tree_it) {
    tree_it->read(is, num_quant_bits_);
  }

  printf("[OK] Loaded RTC, quantization=%i bits\n", num_quant_bits_);

  countZeroElements();
}

void RTreeClassifier::write(const char* file_name) const
{
  std::ofstream file(file_name, std::ofstream::binary);
  write(file);
  file.close();
}

void RTreeClassifier::write(std::ostream &os) const
{
  int num_trees = trees_.size();
  os.write((char*)(&num_trees), sizeof(num_trees));
  os.write((char*)(&classes_), sizeof(classes_));
  os.write((char*)(&original_num_classes_), sizeof(original_num_classes_));
  os.write((char*)(&num_quant_bits_), sizeof(num_quant_bits_));
printf("RTreeClassifier::write: num_quant_bits_=%i\n", num_quant_bits_);

  std::vector<RandomizedTree>::const_iterator tree_it;
  for (tree_it = trees_.begin(); tree_it != trees_.end(); ++tree_it)
    tree_it->write(os);
}

void RTreeClassifier::saveAllFloatPosteriors(std::string url)
{
  printf("[DEBUG] writing all float posteriors to %s...\n", url.c_str());
  for (int i=0; i<(int)trees_.size(); ++i)
    trees_[i].savePosteriors(url, (i==0 ? false : true));
  printf("[DEBUG] done\n");
}

void RTreeClassifier::saveAllBytePosteriors(std::string url)
{
  printf("[DEBUG] writing all byte posteriors to %s...\n", url.c_str());
  for (int i=0; i<(int)trees_.size(); ++i)
    trees_[i].savePosteriors2(url, (i==0 ? false : true));
  printf("[DEBUG] done\n");
}


void RTreeClassifier::setFloatPosteriorsFromTextfile_176(std::string url)
{
   std::ifstream ifs(url.c_str());

   for (int i=0; i<(int)trees_.size(); ++i) {
      int num_classes = trees_[i].classes_;
      assert(num_classes == 176);     // TODO: remove this limitation (arose due to SSE2 optimizations)
      for (int k=0; k<trees_[i].num_leaves_; ++k) {
         float *post = trees_[i].getPosteriorByIndex(k);
         for (int j=0; j<num_classes; ++j, ++post)
            ifs >> *post;
         assert(ifs.good());
      }
   }
   classes_ = 176;

   //setQuantization(num_quant_bits_);

   ifs.close();
   printf("[EXPERIMENTAL] read entire tree from '%s'\n", url.c_str());
}


float RTreeClassifier::countZeroElements()
{
   int flt_zeros = 0;
   int ui8_zeros = 0;
   int num_elem = trees_[0].classes();
   for (int i=0; i<(int)trees_.size(); ++i)
      for (int k=0; k<(int)trees_[i].num_leaves_; ++k) {
         float *p = trees_[i].getPosteriorByIndex(k);
         uchar *p2 = trees_[i].getPosteriorByIndex2(k);
         assert(p); assert(p2);
         for (int j=0; j<num_elem; ++j, ++p, ++p2) {
            if (*p == 0.f) flt_zeros++;
            if (*p2 == 0) ui8_zeros++;
         }
      }
   num_elem = trees_.size()*trees_[0].num_leaves_*num_elem;
   float flt_perc = 100.f*flt_zeros/num_elem;
   float ui8_perc = 100.f*ui8_zeros/num_elem;
   printf("[OK] RTC: overall %i/%i (%.3f%%) zeros in float leaves\n", flt_zeros, num_elem, flt_perc);
   printf("          overall %i/%i (%.3f%%) zeros in uint8 leaves\n", ui8_zeros, num_elem, ui8_perc);

   return flt_perc;
}

void RTreeClassifier::setQuantization(int num_quant_bits)
{
   for (int i=0; i<(int)trees_.size(); ++i)
      trees_[i].applyQuantization(num_quant_bits);

   printf("[OK] signature quantization is now %i bits (before: %i)\n", num_quant_bits, num_quant_bits_);
   num_quant_bits_ = num_quant_bits;
}

void RTreeClassifier::discardFloatPosteriors()
{
   for (int i=0; i<(int)trees_.size(); ++i)
      trees_[i].discardFloatPosteriors();
   printf("[OK] RTC: discarded float posteriors of all trees\n");
}

#if 0
const int progressBarSize = 50;

CalonderClassifier::CalonderClassifier()
{
    verbose = false;
    clear();
}

CalonderClassifier::~CalonderClassifier()
{}

CalonderClassifier::CalonderClassifier( const vector<vector<Point2f> >& points, const vector<Mat>& refimgs,
                                        const vector<vector<int> >& labels, int _numClasses,
                                        int _pathSize, int _numTrees, int _treeDepth,
                                        int _numViews, int _compressedDim, int _compressType, int _numQuantBits,
                                        const PatchGenerator &patchGenerator )
{
    verbose = false;
    train( points, refimgs, labels, _numClasses, _pathSize, _numTrees, _treeDepth, _numViews,
           _compressedDim, _compressType, _numQuantBits, patchGenerator );
}

int CalonderClassifier::getPatchSize() const
{ return patchSize; }

int CalonderClassifier::getNumTrees() const
{ return numTrees; }

int CalonderClassifier::getTreeDepth() const
{ return treeDepth; }

int CalonderClassifier::getNumViews() const
{ return numViews; }

int CalonderClassifier::getSignatureSize() const
{ return signatureSize; }

int CalonderClassifier::getCompressType() const
{ return compressType; }

int CalonderClassifier::getNumQuantBits() const
{ return numQuantBits; }

int CalonderClassifier::getOrigNumClasses() const
{ return origNumClasses; }

void CalonderClassifier::setVerbose( bool _verbose )
{
    verbose = _verbose;
}

void CalonderClassifier::clear()
{
    patchSize = numTrees = origNumClasses = signatureSize = treeDepth = numViews = numQuantBits = 0;
    compressType = COMPRESS_NONE;

    nodes.clear();
    posteriors.clear();
#if QUANTIZATION_AVAILABLE
    quantizedPosteriors.clear();
#endif
}

bool CalonderClassifier::empty() const
{
    return posteriors.empty() && quantizedPosteriors.empty();
}

void CalonderClassifier::prepare( int _patchSize, int _signatureSize, int _numTrees, int _treeDepth, int _numViews )
{
    clear();

    patchSize = _patchSize;
    signatureSize = _signatureSize;
    numTrees = _numTrees;
    treeDepth = _treeDepth;
    numViews = _numViews;

    numLeavesPerTree = 1 << treeDepth;      // 2^d
    numNodesPerTree = numLeavesPerTree - 1; // 2^d - 1

    nodes = vector<Node>( numTrees*numNodesPerTree );
    posteriors = vector<float>( numTrees*numLeavesPerTree*signatureSize, 0.f );
}

static int calcNumPoints( const vector<vector<Point2f> >& points )
{
    int count = 0;
    for( size_t i = 0; i < points.size(); i++ )
        count += points[i].size();
    return count;
}

void CalonderClassifier::train( const vector<vector<Point2f> >& points, const vector<Mat>& refimgs,
                                const vector<vector<int> >& labels, int _numClasses,
                                int _patchSize, int _numTrees, int _treeDepth, int _numViews,
                                int _compressedDim, int _compressType, int _numQuantBits,
                                const PatchGenerator &patchGenerator )
{
    if( points.empty() || refimgs.size() != points.size() )
        CV_Error( CV_StsBadSize, "points vector must be no empty and refimgs must have the same size as points" );
    if( _patchSize < 5 || _patchSize >= 256 )
        CV_Error( CV_StsBadArg, "patchSize must be in [5, 255]");
    if( _numTrees <= 0 || _treeDepth <= 0 )
        CV_Error( CV_StsBadArg, "numTrees, treeDepth, numViews must be positive");
    int numPoints = calcNumPoints( points );
    if( !labels.empty() && ( labels.size() != points.size() || _numClasses <=0 || _numClasses > numPoints ) )
        CV_Error( CV_StsBadArg, "labels has incorrect size or _numClasses is not in [1, numPoints]");
    _numViews = std::max( 1, _numViews );

    int _origNumClasses = labels.empty() ? numPoints : _numClasses;

    if( verbose )
    {
        cout << "Using train parameters:" << endl;
        cout << "   patchSize=" << _patchSize << endl;
        cout << "   numTrees=" << _numTrees << endl;
        cout << "   treeDepth=" << _treeDepth << endl;
        cout << "   numViews=" << _numViews << endl;
        cout << "   compressedDim=" << _compressedDim << endl;
        cout << "   compressType=" << _compressType << endl;
        cout << "   numQuantBits=" << _numQuantBits << endl;
        cout << endl
             << "   numPoints=" << numPoints << endl;
        cout << "   origNumClasses=" << _origNumClasses << endl;
    }

    prepare( _patchSize, _origNumClasses, _numTrees, _treeDepth, _numViews );

    origNumClasses = _origNumClasses;
    vector<int> leafSampleCounters = vector<int>( numTrees*numLeavesPerTree, 0 );
    // generate nodes
    RNG rng = theRNG();
    for( int i = 0; i < numTrees*numNodesPerTree; i++ )
    {
        uchar x1 = rng(_patchSize);
        uchar y1 = rng(_patchSize);
        uchar x2 = rng(_patchSize);
        uchar y2 = rng(_patchSize);
        nodes[i] = Node(x1, y1, x2, y2);
    }

    Size size( patchSize, patchSize );
    Mat patch;
    if( verbose ) cout << "START training..." << endl;
    for( size_t treeIdx = 0; treeIdx < (size_t)numTrees; treeIdx++ )
    {
        if( verbose ) cout << "< tree " << treeIdx << endl;
        int globalPointIdx = 0;
        int* treeLeafSampleCounters = &leafSampleCounters[treeIdx*numLeavesPerTree];
        float* treePosteriors = &posteriors[treeIdx*numLeavesPerTree*signatureSize];
        for( size_t imgIdx = 0; imgIdx < points.size(); imgIdx++ )
        {
            const Point2f* imgPoints = &points[imgIdx][0];
            const int* imgLabels = labels.empty() ? 0 : &labels[imgIdx][0];
            int last = -1, cur;
            for( size_t pointIdx = 0; pointIdx < points[imgIdx].size(); pointIdx++, globalPointIdx++ )
            {
                int classID = imgLabels==0 ? globalPointIdx : imgLabels[pointIdx];
                Point2f pt = imgPoints[pointIdx];
                const Mat& src = refimgs[imgIdx];

                if( verbose && (cur = (int)((float)globalPointIdx/numPoints*progressBarSize)) != last )
                {
                    last = cur;
                    cout << ".";
                    cout.flush();
                }

                CV_Assert( classID >= 0 && classID < signatureSize );
                for( int v = 0; v < numViews; v++ )
                {
                    patchGenerator( src, pt, patch, size, rng );
                    // add sample
                    int leafIdx = getLeafIdx( treeIdx, patch );
                    treeLeafSampleCounters[leafIdx]++;
                    treePosteriors[leafIdx*signatureSize + classID]++;
                }
            }
        }

        if( verbose ) cout << endl << ">" << endl;
    }

    _compressedDim = std::max( 0, std::min(signatureSize, _compressedDim) );
    _numQuantBits = std::max( 0, std::min((int)MAX_NUM_QUANT_BITS, _numQuantBits) );
    finalize( _compressedDim, _compressType, _numQuantBits, leafSampleCounters );

    if( verbose ) cout << "END training." << endl;
}

int CalonderClassifier::getLeafIdx( int treeIdx, const Mat& patch ) const
{
    const Node* treeNodes = &nodes[treeIdx*numNodesPerTree];
    int idx = 0;
    for( int d = 0; d < treeDepth-1; d++ )
    {
        int offset = treeNodes[idx](patch);
        idx = 2*idx + 1 + offset;
    }
    return idx;
}

void CalonderClassifier::finalize( int _compressedDim, int _compressType, int _numQuantBits,
                                   const vector<int>& leafSampleCounters )
{
    for( int ti = 0; ti < numTrees; ti++ )
    {
        const int* treeLeafSampleCounters = &leafSampleCounters[ti*numLeavesPerTree];
        float* treePosteriors = &posteriors[ti*numLeavesPerTree*signatureSize];
        // Normalize by number of patches to reach each leaf
        for( int li = 0; li < numLeavesPerTree; li++ )
        {
            int sampleCount = treeLeafSampleCounters[li];
            if( sampleCount != 0 )
            {
                float normalizer = 1.0f / sampleCount;
                int leafPosteriorIdx = li*signatureSize;
                for( int ci = 0; ci < signatureSize; ci++ )
                   treePosteriors[leafPosteriorIdx + ci] *= normalizer;
             }
        }
    }

    // apply compressive sensing
    if( _compressedDim > 0 && _compressedDim < signatureSize )
        compressLeaves( _compressedDim, _compressType );
    else
    {
        if( verbose )
            cout << endl << "[WARNING] NO compression to leaves applied, because _compressedDim=" << _compressedDim << endl;
    }

    // convert float-posteriors to uchar-posteriors (quantization step)
#if QUANTIZATION_AVAILABLE
    if( _numQuantBits > 0 )
        quantizePosteriors( _numQuantBits );
    else
    {
        if( verbose )
            cout << endl << "[WARNING] NO quantization to posteriors, because _numQuantBits=" << _numQuantBits << endl;
    }
#endif
}

Mat createCompressionMatrix( int rows, int cols, int distrType )
{
    Mat mtr( rows, cols, CV_32FC1 );
    assert( rows <= cols );

    RNG rng(23);

    if( distrType == CalonderClassifier::COMPRESS_DISTR_GAUSS )
    {
        float sigma = 1./rows;
        for( int y = 0; y < rows; y++ )
            for( int x = 0; x < cols; x++ )
                mtr.at<float>(y,x) = rng.gaussian( sigma );
    }
    else if( distrType == CalonderClassifier::COMPRESS_DISTR_BERNOULLI )
    {
        float par = (float)(1./sqrt((float)rows));
        for( int y = 0; y < rows; y++ )
            for( int x = 0; x < cols; x++ )
                mtr.at<float>(y,x) = rng(2)==0 ? par : -par;
    }
    else if( distrType == CalonderClassifier::COMPRESS_DISTR_DBFRIENDLY )
    {
        float par = (float)sqrt(3./rows);
        for( int y = 0; y < rows; y++ )
            for( int x = 0; x < cols; x++ )
            {
                int rng6 = rng(6);
                mtr.at<float>(y,x) = rng6==0 ? par : (rng6==1 ? -par : 0.f);
            }
    }
    else
        CV_Assert( 0 );

    return mtr;
}

void CalonderClassifier::compressLeaves( int _compressedDim, int _compressType )
{
    if( verbose )
        cout << endl << "[OK] compressing leaves with matrix " << _compressedDim << " x " << signatureSize << endl;

    Mat compressionMtrT = (createCompressionMatrix( _compressedDim, signatureSize, _compressType )).t();

    vector<float> comprPosteriors( numTrees*numLeavesPerTree*_compressedDim, 0);
    Mat( numTrees*numLeavesPerTree, _compressedDim, CV_32FC1, &comprPosteriors[0] ) =
            Mat( numTrees*numLeavesPerTree, signatureSize, CV_32FC1, &posteriors[0]) * compressionMtrT;

    posteriors.resize( comprPosteriors.size() );
    copy( comprPosteriors.begin(), comprPosteriors.end(), posteriors.begin() );

    signatureSize = _compressedDim;
    compressType = _compressType;
}

#if QUANTIZATION_AVAILABLE
static float percentile( const float* data, int n, float p )
{
   assert( n>0 );
   assert( p>=0 && p<=1 );

   vector<float> vec( data, data+n );
   sort(vec.begin(), vec.end());
   int ix = (int)(p*(n-1));
   return vec[ix];
}

void quantizeVector( const float* src, int dim, float fbounds[2], uchar ubounds[2], uchar* dst )
{
    assert( fbounds[0] < fbounds[1] );
    assert( ubounds[0] < ubounds[1] );

    float normFactor = 1.f/(fbounds[1] - fbounds[0]);
    for( int i = 0; i < dim; i++ )
    {
        float part = (src[i] - fbounds[0]) * normFactor;
        assert( 0 <= part && part <= 1 ) ;
        uchar val = ubounds[0] +  (uchar)( part*ubounds[1] );
        dst[i] = std::max( 0, (int)std::min(ubounds[1], val) );
    }
}

void CalonderClassifier::quantizePosteriors( int _numQuantBits, bool isClearFloatPosteriors )
{
    uchar ubounds[] = { 0, (uchar)((1<<_numQuantBits)-1) };
    float fbounds[] = { 0.f, 0.f };

    int totalLeavesCount = numTrees*numLeavesPerTree;
    for( int li = 0; li < totalLeavesCount; li++ ) // TODO for some random choosen leaves !
    {
        fbounds[0] += percentile( &posteriors[li*signatureSize], signatureSize, GET_LOWER_QUANT_PERC() );
        fbounds[1] += percentile( &posteriors[li*signatureSize], signatureSize, GET_UPPER_QUANT_PERC() );
    }
    fbounds[0] /= totalLeavesCount;
    fbounds[1] /= totalLeavesCount;

    quantizedPosteriors.resize( posteriors.size() );
    quantizeVector( &posteriors[0], posteriors.size(), fbounds, ubounds, &quantizedPosteriors[0] );

    if( isClearFloatPosteriors )
        clearFloatPosteriors();
}

void CalonderClassifier::clearFloatPosteriors()
{
    quantizedPosteriors.clear();
}

#endif

void CalonderClassifier::operator()( const Mat& img, Point2f pt, vector<float>& signature, float thresh ) const
{
    if( img.empty() || img.type() != CV_8UC1 )
        return;

    Mat patch;
    getRectSubPix(img, Size(patchSize,patchSize), pt, patch, img.type());
    (*this)( patch, signature, thresh );
}

void CalonderClassifier::operator()( const Mat& patch, vector<float>& signature, float thresh ) const
{
    if( posteriors.empty() || patch.empty() || patch.type() != CV_8UC1 || patch.cols < patchSize || patch.rows < patchSize )
        return;

    int treePostSize = numLeavesPerTree*signatureSize;

    signature.resize( signatureSize, 0.f );
    float* sig = &signature[0];
    for( int ti = 0; ti < numTrees; ti++ )
    {
        int leafIdx = getLeafIdx( ti, patch );
        const float* post = &posteriors[ti*treePostSize + leafIdx*signatureSize];
        for( int ci = 0; ci < signatureSize; ci++ )
            sig[ci] += post[ci];
    }
    float coef = 1.f/numTrees;
    for( int ci = 0; ci < signatureSize; ci++ )
    {
        sig[ci] *= coef;
        if( sig[ci] < thresh )
            sig[ci] = 0;
    }
}

#if QUANTIZATION_AVAILABLE
void CalonderClassifier::operator()( const Mat& img, Point2f pt, vector<uchar>& signature, uchar thresh ) const
{
    if( img.empty() || img.type() != CV_8UC1 )
        return;

    Mat patch;
    getRectSubPix(img, Size(patchSize,patchSize), pt, patch, img.type());
    (*this)(patch, signature, thresh );
}

void CalonderClassifier::operator()( const Mat& patch, vector<uchar>& signature, uchar thresh ) const
{
    if( quantizedPosteriors.empty() || patch.empty() || patch.type() != CV_8UC1 || patch.cols > patchSize || patch.rows > patchSize )
        return;

    int treePostSize = numLeavesPerTree*signatureSize;

    vector<float> sum( signatureSize, 0.f );
    for( int ti = 0; ti < numTrees; ti++ )
    {
        int leafIdx = getLeafIdx( ti, patch );
        const uchar* post = &quantizedPosteriors[ti*treePostSize + leafIdx*signatureSize];
        for( int ci = 0; ci < signatureSize; ci++ )
            sum[ci] += post[ci];
    }
    float coef = 1.f/numTrees;
    signature.resize( signatureSize );
    uchar* sig = &signature[0];
    for( int ci = 0; ci < signatureSize; ci++ )
    {
        sig[ci] = (uchar)(sum[ci]*coef);
        if( sig[ci] < thresh )
            sig[ci] = 0;
    }
}
#endif

void CalonderClassifier::read( const FileNode& fn )
{
    prepare( fn["patchSize"], fn["signatureSize"], fn["numTrees"], fn["treeDepth"], fn["numViews"] );
    origNumClasses = fn["origNumClasses"];
    compressType = fn["compressType"];
    int _numQuantBits = fn["numQuantBits"];

    for( int ti = 0; ti < numTrees; ti++ )
    {
        stringstream treeName;
        treeName << "tree" << ti;
        FileNode treeFN = fn["trees"][treeName.str()];

        Node* treeNodes = &nodes[ti*numNodesPerTree];
        FileNodeIterator nodesFNIter = treeFN["nodes"].begin();
        for( int ni = 0; ni < numNodesPerTree; ni++ )
        {
            Node* node = treeNodes + ni;
            nodesFNIter >> node->x1 >> node->y1 >> node->x2 >> node->y2;
        }

        FileNode posteriorsFN = treeFN["posteriors"];
        for( int li = 0; li < numLeavesPerTree; li++ )
        {
            stringstream leafName;
            leafName << "leaf" << li;
            float* post = &posteriors[ti*numLeavesPerTree*signatureSize + li*signatureSize];
            FileNodeIterator leafFNIter = posteriorsFN[leafName.str()].begin();
            for( int ci = 0; ci < signatureSize; ci++ )
                leafFNIter >> post[ci];
        }
    }
#if QUANTIZATION_AVAILABLE
    if( _numQuantBits )
        quantizePosteriors(_numQuantBits);
#endif
}

void CalonderClassifier::write( FileStorage& fs ) const
{
    if( !fs.isOpened() )
        return;
    fs << "patchSize" << patchSize;
    fs << "numTrees" << numTrees;
    fs << "treeDepth" << treeDepth;
    fs << "numViews" << numViews;
    fs << "origNumClasses" << origNumClasses;
    fs << "signatureSize" << signatureSize;
    fs << "compressType" << compressType;
    fs << "numQuantBits" << numQuantBits;

    fs << "trees" << "{";
    for( int ti = 0; ti < numTrees; ti++ )
    {
        stringstream treeName;
        treeName << "tree" << ti;
        fs << treeName.str() << "{";

        fs << "nodes" << "[:";
        const Node* treeNodes = &nodes[ti*numNodesPerTree];
        for( int ni = 0; ni < numNodesPerTree; ni++ )
        {
            const Node* node = treeNodes + ni;
            fs << node->x1 << node->y1 << node->x2 << node->y2;
        }
        fs << "]"; // nodes

        fs << "posteriors" << "{";
        for( int li = 0; li < numLeavesPerTree; li++ )
        {
            stringstream leafName;
            leafName << "leaf" << li;
            fs << leafName.str() << "[:";
            const float* post = &posteriors[ti*numLeavesPerTree*signatureSize + li*signatureSize];
            for( int ci = 0; ci < signatureSize; ci++ )
            {
                fs << post[ci];
            }
            fs << "]"; // leaf
        }
        fs << "}"; // posteriors
        fs << "}"; // tree
    }
    fs << "}"; // trees
}

struct RTreeNode
{
    short offset1, offset2;
};

void CalonderClassifier::read( istream &is )
{
    int _patchSize, _numTrees, _treeDepth, _numViews, _signatureSize, _origNumClasses, _numQuantBits, _compressType;

    _patchSize = 32;
    _numViews = 0;
    _compressType = COMPRESS_DISTR_BERNOULLI;
    is.read((char*)(&_numTrees), sizeof(_numTrees));
    is.read((char*)(&_signatureSize), sizeof(_signatureSize));
    is.read((char*)(&_origNumClasses), sizeof(_origNumClasses));
    is.read((char*)(&_numQuantBits), sizeof(_numQuantBits));

    // 1st tree
    int _classes;
    is.read((char*)(&_classes), sizeof(_classes));
    CV_Assert( _signatureSize == _classes );
    is.read((char*)(&_treeDepth), sizeof(_treeDepth));

    prepare( _patchSize, _signatureSize, _numTrees, _treeDepth, _numViews );

    origNumClasses = _origNumClasses;
    compressType = _compressType;

    if( _numQuantBits>8 )
    {
        if( verbose )
            cout << "[WARNING] suspicious value numQuantBits=" << numQuantBits << " found; setting to " << DEFAULT_NUM_QUANT_BITS;
        _numQuantBits = DEFAULT_NUM_QUANT_BITS;
    }

    // 1st tree
    vector<RTreeNode> rtreeNodes(numNodesPerTree);
    is.read((char*)(&rtreeNodes[0]), numNodesPerTree * sizeof(rtreeNodes[0]));
    for( int ni = 0; ni < numNodesPerTree; ni ++ )
    {
        short offset1 = rtreeNodes[ni].offset1,
              offset2 = rtreeNodes[ni].offset2;
        nodes[ni] = Node(offset1 % _patchSize, offset1 / _patchSize, offset2 % _patchSize, offset2 / _patchSize );
    }
    for( int li = 0; li < numLeavesPerTree; li++ )
        is.read((char*)&posteriors[li*signatureSize], signatureSize * sizeof(float));

    // other trees
    for( int treeIdx = 1; treeIdx < numTrees; treeIdx++ )
    {
        is.read((char*)(&_classes), sizeof(_classes));
        CV_Assert( _classes == signatureSize );
        is.read((char*)(&_treeDepth), sizeof(_treeDepth));
        CV_Assert( _treeDepth == treeDepth );

        is.read((char*)(&rtreeNodes[0]), numNodesPerTree * sizeof(rtreeNodes[0]));

        Node* treeNodes = &nodes[treeIdx*numNodesPerTree];
        for( int ni = 0; ni < numNodesPerTree; ni ++ )
        {
            short offset1 = rtreeNodes[ni].offset1,
                  offset2 = rtreeNodes[ni].offset2;
            treeNodes[ni] = Node(offset1 % _patchSize, offset1 / _patchSize, offset2 % _patchSize, offset2 / _patchSize );
        }
        float* treePosteriors = &posteriors[treeIdx*numLeavesPerTree*signatureSize];
        for( int li = 0; li < numLeavesPerTree; li++ )
            is.read((char*)&treePosteriors[li*signatureSize], signatureSize * sizeof(float));

    }

#if QUANTIZATION_AVAILABLE
    if( _numQuantBits )
        quantizePosteriors(_numQuantBits);
#endif
}
#endif

}