Merged the trunk r8595:8668 (except iOS and new gpu functionality)

Andrey Kamaev
2012-06-15 13:21:26 +00:00
parent bd0e0b5800
commit f666bd221f
47 changed files with 457 additions and 436 deletions

View File

@@ -393,7 +393,7 @@ bool computeKsi( int transformType,
const Mat& image0, const Mat& cloud0,
const Mat& image1, const Mat& dI_dx1, const Mat& dI_dy1,
const Mat& corresps, int correspsCount,
double fx, double fy, double sobelScale, double normScale, double determinantThreshold,
double fx, double fy, double sobelScale, double determinantThreshold,
Mat& ksi )
{
int Cwidth = -1;
@@ -419,6 +419,7 @@ bool computeKsi( int transformType,
Mat C( correspsCount, Cwidth, CV_64FC1 );
Mat dI_dt( correspsCount, 1, CV_64FC1 );
double sigma = 0;
int pointCount = 0;
for( int v0 = 0; v0 < corresps.rows; v0++ )
{
@@ -428,14 +429,36 @@ bool computeKsi( int transformType,
{
int u1, v1;
get2shorts( corresps.at<int>(v0,u0), u1, v1 );
double diff = static_cast<double>(image1.at<uchar>(v1,u1)) -
static_cast<double>(image0.at<uchar>(v0,u0));
sigma += diff * diff;
pointCount++;
}
}
}
sigma = std::sqrt(sigma/pointCount);
pointCount = 0;
for( int v0 = 0; v0 < corresps.rows; v0++ )
{
for( int u0 = 0; u0 < corresps.cols; u0++ )
{
if( corresps.at<int>(v0,u0) != -1 )
{
int u1, v1;
get2shorts( corresps.at<int>(v0,u0), u1, v1 );
double diff = static_cast<double>(image1.at<uchar>(v1,u1)) -
static_cast<double>(image0.at<uchar>(v0,u0));
double w = sigma + std::abs(diff);
w = w > DBL_EPSILON ? 1./w : 1.;
(*computeCFuncPtr)( (double*)C.ptr(pointCount),
normScale * sobelScale * dI_dx1.at<short int>(v1,u1),
normScale * sobelScale * dI_dy1.at<short int>(v1,u1),
w * sobelScale * dI_dx1.at<short int>(v1,u1),
w * sobelScale * dI_dy1.at<short int>(v1,u1),
cloud0.at<Point3f>(v0,u0), fx, fy);
dI_dt.at<double>(pointCount) = normScale * (static_cast<double>(image1.at<uchar>(v1,u1)) -
static_cast<double>(image0.at<uchar>(v0,u0)));
dI_dt.at<double>(pointCount) = w * diff;
pointCount++;
}
}
@@ -556,8 +579,6 @@ bool cv::RGBDOdometry( cv::Mat& Rt, const Mat& initRt,
const double fx = levelCameraMatrix.at<double>(0,0);
const double fy = levelCameraMatrix.at<double>(1,1);
const double avgf = 0.5 *(fx + fy);
const double normScale = 1./(255*avgf);
const double determinantThreshold = 1e-6;
Mat corresps( levelImage0.size(), levelImage0.type(), CV_32SC1 );
@@ -576,7 +597,7 @@ bool cv::RGBDOdometry( cv::Mat& Rt, const Mat& initRt,
levelImage0, levelCloud0,
levelImage1, level_dI_dx1, level_dI_dy1,
corresps, correspsCount,
fx, fy, sobelScale, normScale, determinantThreshold,
fx, fy, sobelScale, determinantThreshold,
ksi );
if( !solutionExist )
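The computeKsi rework above drops the fixed normScale in favor of a per-correspondence robust weight: a first pass over the correspondences accumulates squared intensity residuals to estimate sigma, and a second pass scales each row of the linear system by w = 1/(sigma + |diff|). A minimal standalone sketch of that weighting, assuming a plain vector of residuals (not the library code):

#include <cmath>
#include <cfloat>
#include <vector>

// Pass 1: sigma is the RMS of the intensity residuals.
double rmsSigma(const std::vector<double>& residuals)
{
    double sum = 0.;
    for (double d : residuals)
        sum += d * d;
    return std::sqrt(sum / residuals.size());
}

// Pass 2: per-residual weight, mirroring the hunk's
// w = sigma + |diff|; w = w > DBL_EPSILON ? 1./w : 1.
double robustWeight(double diff, double sigma)
{
    double w = sigma + std::abs(diff);
    return w > DBL_EPSILON ? 1. / w : 1.;
}

Rows with large residuals (occlusions, specular pixels) now contribute little to ksi, whereas the old constant normScale weighted every correspondence equally.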

View File

@@ -90,9 +90,9 @@ public:
Distance d = Distance()) :
dataset_(input_data), index_params_(params), distance_(d)
{
table_number_ = get_param<unsigned int>(index_params_,"table_number",12);
key_size_ = get_param<unsigned int>(index_params_,"key_size",20);
multi_probe_level_ = get_param<unsigned int>(index_params_,"multi_probe_level",2);
table_number_ = get_param<int>(index_params_,"table_number",12);
key_size_ = get_param<int>(index_params_,"key_size",20);
multi_probe_level_ = get_param<int>(index_params_,"multi_probe_level",2);
feature_size_ = (unsigned)dataset_.cols;
fill_xor_mask(0, key_size_, multi_probe_level_, xor_masks_);
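For context: key_size_ is the hash-key width in bits and multi_probe_level_ bounds how far fill_xor_mask probes, enumerating the XOR masks for buckets near a query's own bucket. A rough sketch of the assumed semantics (all masks within the given Hamming distance), not flann's actual recursive implementation:

#include <cstdint>
#include <vector>

static int popcount32(uint32_t x) { int c = 0; while (x) { x &= x - 1; ++c; } return c; }

// Hypothetical: collect every XOR mask over key_size bits with at
// most 'level' bits set, i.e. buckets within that Hamming distance.
void collectMasks(uint32_t key_size, uint32_t level, std::vector<uint32_t>& masks)
{
    for (uint32_t m = 0; m < (1u << key_size); ++m)
        if (popcount32(m) <= (int)level)
            masks.push_back(m);
}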

View File

@@ -940,7 +940,7 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
//texture format descriptor
cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
cudaChannelFormatDesc ch_desc = cudaCreateChannelDesc<float>();
I0 = *img0Iter;
I1 = *img1Iter;
@@ -948,8 +948,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
++img0Iter;
++img1Iter;
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), channel_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
//compute derivatives
dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
@@ -989,20 +989,20 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy, Iy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), channel_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy, Iy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
// flow
ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
// flow increments
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);
@@ -1032,37 +1032,37 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
// linear system coefficients
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
//solve linear system
for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
{
float omega = 1.99f;
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
(du_new.ptr(),
@@ -1079,8 +1079,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
(du.ptr(),
@@ -1097,8 +1097,8 @@ NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), channel_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
}//end of solver loop
}// end of inner loop

View File

@@ -1622,16 +1622,16 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
continue;
}
NcvSize32s srcRoi, srcIIRoi, scaledIIRoi, searchRoi;
NcvSize32s srcRoi_, srcIIRo_i, scaledIIRoi, searchRoi;
srcRoi.width = d_srcImg.width();
srcRoi.height = d_srcImg.height();
srcRoi_.width = d_srcImg.width();
srcRoi_.height = d_srcImg.height();
srcIIRoi.width = srcRoi.width + 1;
srcIIRoi.height = srcRoi.height + 1;
srcIIRo_i.width = srcRoi_.width + 1;
srcIIRo_i.height = srcRoi_.height + 1;
scaledIIRoi.width = srcIIRoi.width / scale;
scaledIIRoi.height = srcIIRoi.height / scale;
scaledIIRoi.width = srcIIRo_i.width / scale;
scaledIIRoi.height = srcIIRo_i.height / scale;
searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;
@@ -1659,12 +1659,12 @@ NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
{
Ncv32u scale = scalesVector[i];
NcvSize32u srcRoi, scaledIIRoi, searchRoi;
NcvSize32u srcRoi_, scaledIIRoi, searchRoi;
NcvSize32u srcIIRoi;
srcRoi.width = d_srcImg.width();
srcRoi.height = d_srcImg.height();
srcIIRoi.width = srcRoi.width + 1;
srcIIRoi.height = srcRoi.height + 1;
srcRoi_.width = d_srcImg.width();
srcRoi_.height = d_srcImg.height();
srcIIRoi.width = srcRoi_.width + 1;
srcIIRoi.height = srcRoi_.height + 1;
scaledIIRoi.width = srcIIRoi.width / scale;
scaledIIRoi.height = srcIIRoi.height / scale;
searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;

View File

@@ -1414,17 +1414,17 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
//calculate hierarchical partial sums
for (Ncv32u i=1; i<partSumNums.size()-1; i++)
{
dim3 grid(partSumNums[i+1]);
if (grid.x > 65535)
dim3 grid_partial(partSumNums[i+1]);
if (grid_partial.x > 65535)
{
grid.y = (grid.x + 65534) / 65535;
grid.x = 65535;
grid_partial.y = (grid_partial.x + 65534) / 65535;
grid_partial.x = 65535;
}
if (grid.x != 1)
if (grid_partial.x != 1)
{
removePass1Scan
<false, true>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
<<<grid_partial, block, 0, nppStGetActiveCUDAstream()>>>
(d_hierSums.ptr() + partSumOffsets[i],
partSumNums[i], NULL,
d_hierSums.ptr() + partSumOffsets[i+1],
@@ -1434,7 +1434,7 @@ NCVStatus compactVector_32u_device(Ncv32u *d_src, Ncv32u srcLen,
{
removePass1Scan
<false, false>
<<<grid, block, 0, nppStGetActiveCUDAstream()>>>
<<<grid_partial, block, 0, nppStGetActiveCUDAstream()>>>
(d_hierSums.ptr() + partSumOffsets[i],
partSumNums[i], NULL,
NULL,
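Renames aside, the surrounding code shows the standard workaround for the 65535 limit on grid.x in pre-Kepler CUDA: an oversized 1-D launch is folded into a 2-D grid. The fold in isolation, a minimal sketch with hypothetical names:

// Fold a 1-D block count that may exceed 65535 into an (x, y)
// launch shape, exactly as the hunk does for dim3 grid_partial.
struct GridShape { unsigned x, y; };

GridShape foldGrid(unsigned blocks)
{
    GridShape g = { blocks, 1u };
    if (g.x > 65535) {
        g.y = (g.x + 65534) / 65535;  // ceil(blocks / 65535)
        g.x = 65535;
    }
    return g;
}

A kernel launched this way recovers its linear block id as blockIdx.y * gridDim.x + blockIdx.x and must bounds-check it against the real block count, since the fold rounds up.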

View File

@@ -723,16 +723,16 @@ static NCVStatus drawRectsWrapperHost(T *h_dst,
if (rect.x < dstWidth)
{
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
for (Ncv32u each=rect.y; each<rect.y+rect.height && each<dstHeight; each++)
{
h_dst[i*dstStride+rect.x] = color;
h_dst[each*dstStride+rect.x] = color;
}
}
if (rect.x+rect.width-1 < dstWidth)
{
for (Ncv32u i=rect.y; i<rect.y+rect.height && i<dstHeight; i++)
for (Ncv32u each=rect.y; each<rect.y+rect.height && each<dstHeight; each++)
{
h_dst[i*dstStride+rect.x+rect.width-1] = color;
h_dst[each*dstStride+rect.x+rect.width-1] = color;
}
}
if (rect.y < dstHeight)

View File

@@ -623,11 +623,11 @@ class NCVVectorAlloc : public NCVVector<T>
{
NCVVectorAlloc();
NCVVectorAlloc(const NCVVectorAlloc &);
NCVVectorAlloc& operator=(const NCVVectorAlloc<T>&);
public:
NCVVectorAlloc(INCVMemAllocator &allocator_, Ncv32u length)
NCVVectorAlloc(INCVMemAllocator &allocator_, Ncv32u length_)
:
allocator(allocator_)
{
@@ -636,11 +636,11 @@ public:
this->clear();
this->allocatedMem.clear();
ncvStat = allocator.alloc(this->allocatedMem, length * sizeof(T));
ncvStat = allocator.alloc(this->allocatedMem, length_ * sizeof(T));
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVVectorAlloc ctor:: alloc failed", );
this->_ptr = (T *)this->allocatedMem.begin.ptr;
this->_length = length;
this->_length = length_;
this->_memtype = this->allocatedMem.begin.memtype;
}
@@ -698,15 +698,15 @@ public:
this->bReused = true;
}
NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length)
NCVVectorReuse(const NCVMemSegment &memSegment, Ncv32u length_)
{
this->bReused = false;
this->clear();
ncvAssertPrintReturn(length * sizeof(T) <= memSegment.size, \
ncvAssertPrintReturn(length_ * sizeof(T) <= memSegment.size, \
"NCVVectorReuse ctor:: memory binding failed due to size mismatch", );
this->_length = length;
this->_length = length_;
this->_ptr = (T *)memSegment.begin.ptr;
this->_memtype = memSegment.begin.memtype;
@@ -841,34 +841,34 @@ class NCVMatrixAlloc : public NCVMatrix<T>
NCVMatrixAlloc& operator=(const NCVMatrixAlloc &);
public:
NCVMatrixAlloc(INCVMemAllocator &allocator, Ncv32u width, Ncv32u height, Ncv32u _pitch=0)
NCVMatrixAlloc(INCVMemAllocator &allocator_, Ncv32u width_, Ncv32u height_, Ncv32u pitch_=0)
:
allocator(allocator)
allocator(allocator_)
{
NCVStatus ncvStat;
this->clear();
this->allocatedMem.clear();
Ncv32u widthBytes = width * sizeof(T);
Ncv32u widthBytes = width_ * sizeof(T);
Ncv32u pitchBytes = alignUp(widthBytes, allocator.alignment());
if (_pitch != 0)
if (pitch_ != 0)
{
ncvAssertPrintReturn(_pitch >= pitchBytes &&
(_pitch & (allocator.alignment() - 1)) == 0,
ncvAssertPrintReturn(pitch_ >= pitchBytes &&
(pitch_ & (allocator.alignment() - 1)) == 0,
"NCVMatrixAlloc ctor:: incorrect pitch passed", );
pitchBytes = _pitch;
pitchBytes = pitch_;
}
Ncv32u requiredAllocSize = pitchBytes * height;
Ncv32u requiredAllocSize = pitchBytes * height_;
ncvStat = allocator.alloc(this->allocatedMem, requiredAllocSize);
ncvAssertPrintReturn(ncvStat == NCV_SUCCESS, "NCVMatrixAlloc ctor:: alloc failed", );
this->_ptr = (T *)this->allocatedMem.begin.ptr;
this->_width = width;
this->_height = height;
this->_width = width_;
this->_height = height_;
this->_pitch = pitchBytes;
this->_memtype = this->allocatedMem.begin.memtype;
}
@@ -916,34 +916,34 @@ class NCVMatrixReuse : public NCVMatrix<T>
public:
NCVMatrixReuse(const NCVMemSegment &memSegment, Ncv32u alignment, Ncv32u width, Ncv32u height, Ncv32u pitch=0, NcvBool bSkipPitchCheck=false)
NCVMatrixReuse(const NCVMemSegment &memSegment, Ncv32u alignment, Ncv32u width_, Ncv32u height_, Ncv32u pitch_=0, NcvBool bSkipPitchCheck=false)
{
this->bReused = false;
this->clear();
Ncv32u widthBytes = width * sizeof(T);
Ncv32u widthBytes = width_ * sizeof(T);
Ncv32u pitchBytes = alignUp(widthBytes, alignment);
if (pitch != 0)
if (pitch_ != 0)
{
if (!bSkipPitchCheck)
{
ncvAssertPrintReturn(pitch >= pitchBytes &&
(pitch & (alignment - 1)) == 0,
ncvAssertPrintReturn(pitch_ >= pitchBytes &&
(pitch_ & (alignment - 1)) == 0,
"NCVMatrixReuse ctor:: incorrect pitch passed", );
}
else
{
ncvAssertPrintReturn(pitch >= widthBytes, "NCVMatrixReuse ctor:: incorrect pitch passed", );
ncvAssertPrintReturn(pitch_ >= widthBytes, "NCVMatrixReuse ctor:: incorrect pitch passed", );
}
pitchBytes = pitch;
pitchBytes = pitch_;
}
ncvAssertPrintReturn(pitchBytes * height <= memSegment.size, \
ncvAssertPrintReturn(pitchBytes * height_ <= memSegment.size, \
"NCVMatrixReuse ctor:: memory binding failed due to size mismatch", );
this->_width = width;
this->_height = height;
this->_width = width_;
this->_height = height_;
this->_pitch = pitchBytes;
this->_ptr = (T *)memSegment.begin.ptr;
this->_memtype = memSegment.begin.memtype;
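Underneath the parameter renames, both matrix constructors derive the pitch the same way: the row's byte width rounded up to the allocator's alignment via alignUp. A minimal sketch of that rounding, assuming (as GPU pitch alignments are) a power-of-two alignment:

#include <cstdint>

// Round widthBytes up to the next multiple of a power-of-two alignment.
uint32_t alignUp(uint32_t widthBytes, uint32_t alignment)
{
    return (widthBytes + alignment - 1) & ~(alignment - 1);
}

// e.g. alignUp(300, 256) == 512: a 300-byte row occupies a 512-byte pitch,
// which is what the pitch_ >= pitchBytes assertion above checks against.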

View File

@@ -188,7 +188,7 @@ elseif(APPLE)
list(APPEND HIGHGUI_LIBRARIES "-framework Carbon" "-framework QuickTime" "-framework CoreFoundation" "-framework QuartzCore")
else()
list(APPEND highgui_srcs src/cap_qtkit.mm)
list(APPEND HIGHGUI_LIBRARIES "-framework QTKit" "-framework QuartzCore")
list(APPEND HIGHGUI_LIBRARIES "-framework QTKit" "-framework QuartzCore" "-framework AppKit")
endif()
endif()

View File

@@ -445,12 +445,12 @@ class videoDevice{
int nFramesForReconnect;
unsigned long nFramesRunning;
int connection;
int storeConn;
int myID;
long requestedFrameTime; //ie fps
char nDeviceName[255];
WCHAR wDeviceName[255];
unsigned char * pixels;
char * pBuffer;
@@ -643,7 +643,7 @@ public:
bufferSetup = false;
newFrame = false;
latestBufferLength = 0;
hEvent = CreateEvent(NULL, true, false, NULL);
}
@@ -655,7 +655,7 @@ public:
DeleteCriticalSection(&critSection);
CloseHandle(hEvent);
if(bufferSetup){
delete pixels;
delete[] pixels;
}
}
@@ -665,11 +665,11 @@ public:
if(bufferSetup){
return false;
}else{
numBytes = numBytesIn;
pixels = new unsigned char[numBytes];
bufferSetup = true;
newFrame = false;
latestBufferLength = 0;
}
return true;
}
@@ -796,12 +796,12 @@ void videoDevice::setSize(int w, int h){
}
else
{
width = w;
height = h;
videoSize = w*h*3;
sizeSet = true;
pixels = new unsigned char[videoSize];
pBuffer = new char[videoSize];
memset(pixels, 0 , videoSize);
sgCallback->setupBuffer(videoSize);
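The substantive fix in this file is delete pixels changing to delete[] pixels (the rest is whitespace): pixels is allocated with the array form of new, and scalar delete on it is undefined behavior. The pairing rule in miniature:

#include <cstddef>

void bufferLifetime(std::size_t numBytes)
{
    unsigned char* pixels = new unsigned char[numBytes]; // array new...
    // ... fill and consume the buffer ...
    delete[] pixels; // ...requires array delete; plain 'delete pixels'
                     // here would be undefined behavior
}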

View File

@@ -657,7 +657,7 @@ Applies a fixed-level threshold to each array element.
.. ocv:pyoldfunction:: cv.Threshold(src, dst, threshold, maxValue, thresholdType)-> None
:param src: Source array (single-channel, 8-bit of 32-bit floating point).
:param src: Source array (single-channel, 8-bit or 32-bit floating point).
:param dst: Destination array of the same size and type as ``src`` .

View File

@@ -2839,6 +2839,11 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
M0.convertTo(matM, matM.type());
#ifdef HAVE_TEGRA_OPTIMIZATION
if( tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
return;
#endif
if( !(flags & WARP_INVERSE_MAP) )
{
double D = M[0]*M[4] - M[1]*M[3];
@@ -2851,22 +2856,6 @@ void cv::warpAffine( InputArray _src, OutputArray _dst,
M[2] = b1; M[5] = b2;
}
#ifdef HAVE_TEGRA_OPTIMIZATION
if (borderType == BORDER_REPLICATE)
{
if( tegra::warpAffine(src, dst, M, interpolation, borderType, borderValue) )
return;
}
else
{
double warp_mat[6];
Mat warp_m(2, 3, CV_64F, warp_mat);
M0.convertTo(warp_m, warp_m.type());
if( tegra::warpAffine(src, dst, warp_mat, interpolation, borderType, borderValue) )
return;
}
#endif
int x, y, x1, y1, width = dst.cols, height = dst.rows;
AutoBuffer<int> _abdelta(width*2);
int* adelta = &_abdelta[0], *bdelta = adelta + width;
@@ -2995,14 +2984,14 @@ void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
M0.convertTo(matM, matM.type());
if( !(flags & WARP_INVERSE_MAP) )
invert(matM, matM);
#ifdef HAVE_TEGRA_OPTIMIZATION
if( tegra::warpPerspective(src, dst, M, interpolation, borderType, borderValue) )
if( tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
return;
#endif
if( !(flags & WARP_INVERSE_MAP) )
invert(matM, matM);
int x, y, x1, y1, width = dst.cols, height = dst.rows;
int bh0 = std::min(BLOCK_SZ/2, height);
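The reordering matters because the !(flags & WARP_INVERSE_MAP) block overwrites M in place with its inverse; hoisting the Tegra hook above it (and passing flags rather than interpolation, matching the warpPerspective hunk) hands the optimized path the caller's original matrix. The in-place 2x3 affine inversion the surrounding lines perform, as a standalone sketch:

// Invert the affine transform [M0 M1 | M2; M3 M4 | M5] in place,
// following the D = M[0]*M[4] - M[1]*M[3] block visible above.
void invertAffine2x3(double M[6])
{
    double D = M[0]*M[4] - M[1]*M[3];
    D = D != 0 ? 1./D : 0;               // degenerate transforms map to 0
    double A11 = M[4]*D, A22 = M[0]*D;   // inverse of the 2x2 linear part
    M[0] = A11; M[1] *= -D;
    M[3] *= -D; M[4] = A22;
    double b1 = -M[0]*M[2] - M[1]*M[5];  // translation: t' = -A_inv * t
    double b2 = -M[3]*M[2] - M[4]*M[5];
    M[2] = b1; M[5] = b2;
}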

View File

@@ -60,26 +60,10 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
}
#ifdef HAVE_TEGRA_OPTIMIZATION
switch( type )
{
case THRESH_BINARY:
if(tegra::thresh_8u_binary(_src, _dst, roi.width, roi.height, thresh, maxval)) return;
break;
case THRESH_BINARY_INV:
if(tegra::thresh_8u_binary_inv(_src, _dst, roi.width, roi.height, thresh, maxval)) return;
break;
case THRESH_TRUNC:
if(tegra::thresh_8u_trunc(_src, _dst, roi.width, roi.height, thresh)) return;
break;
case THRESH_TOZERO:
if(tegra::thresh_8u_tozero(_src, _dst, roi.width, roi.height, thresh)) return;
break;
case THRESH_TOZERO_INV:
if(tegra::thresh_8u_tozero_inv(_src, _dst, roi.width, roi.height, thresh)) return;
break;
}
if (tegra::thresh_8u(_src, _dst, roi.width, roi.height, thresh, maxval, type))
return;
#endif
switch( type )
{
case THRESH_BINARY:
@@ -124,7 +108,7 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
__m128i thresh_s = _mm_set1_epi8(thresh ^ 0x80);
__m128i maxval_ = _mm_set1_epi8(maxval);
j_scalar = roi.width & -8;
for( i = 0; i < roi.height; i++ )
{
const uchar* src = (const uchar*)(_src.data + _src.step*i);
@@ -240,7 +224,7 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
}
}
}
#endif
if( j_scalar < roi.width )
{
@@ -248,8 +232,8 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
{
const uchar* src = (const uchar*)(_src.data + _src.step*i);
uchar* dst = (uchar*)(_dst.data + _dst.step*i);
j = j_scalar;
#if CV_ENABLE_UNROLLED
for( ; j <= roi.width - 4; j += 4 )
{
uchar t0 = tab[src[j]];
@@ -264,7 +248,7 @@ thresh_8u( const Mat& _src, Mat& _dst, uchar thresh, uchar maxval, int type )
dst[j+2] = t0;
dst[j+3] = t1;
}
#endif
for( ; j < roi.width; j++ )
dst[j] = tab[src[j]];
}
@@ -282,7 +266,7 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
short* dst = (short*)_dst.data;
size_t src_step = _src.step/sizeof(src[0]);
size_t dst_step = _dst.step/sizeof(dst[0]);
#if CV_SSE2
volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif
@@ -293,6 +277,11 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
roi.height = 1;
}
#ifdef HAVE_TEGRA_OPTIMIZATION
if (tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type))
return;
#endif
switch( type )
{
case THRESH_BINARY:
@@ -344,8 +333,8 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
_mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
}
}
#endif
for( ; j < roi.width; j++ )
dst[j] = src[j] <= thresh ? maxval : 0;
}
@@ -370,8 +359,8 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
_mm_storeu_si128((__m128i*)(dst + j + 8), v1 );
}
}
#endif
for( ; j < roi.width; j++ )
dst[j] = std::min(src[j], thresh);
}
@@ -397,7 +386,7 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
}
}
#endif
for( ; j < roi.width; j++ )
{
short v = src[j];
@@ -438,7 +427,7 @@ thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type )
}
}
static void
thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
{
@@ -449,17 +438,22 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
float* dst = (float*)_dst.data;
size_t src_step = _src.step/sizeof(src[0]);
size_t dst_step = _dst.step/sizeof(dst[0]);
#if CV_SSE2
volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif
if( _src.isContinuous() && _dst.isContinuous() )
{
roi.width *= roi.height;
roi.height = 1;
}
#ifdef HAVE_TEGRA_OPTIMIZATION
if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
return;
#endif
switch( type )
{
case THRESH_BINARY:
@@ -484,12 +478,12 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
}
}
#endif
for( ; j < roi.width; j++ )
dst[j] = src[j] > thresh ? maxval : 0;
}
break;
case THRESH_BINARY_INV:
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
@@ -511,13 +505,13 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
_mm_storeu_ps( dst + j + 4, v1 );
}
}
#endif
for( ; j < roi.width; j++ )
dst[j] = src[j] <= thresh ? maxval : 0;
}
break;
case THRESH_TRUNC:
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
@@ -537,13 +531,13 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
_mm_storeu_ps( dst + j + 4, v1 );
}
}
#endif
for( ; j < roi.width; j++ )
dst[j] = std::min(src[j], thresh);
}
break;
case THRESH_TOZERO:
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
@@ -564,7 +558,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
}
}
#endif
for( ; j < roi.width; j++ )
{
float v = src[j];
@@ -572,7 +566,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
}
}
break;
case THRESH_TOZERO_INV:
for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
{
@@ -604,7 +598,7 @@ thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
return CV_Error( CV_StsBadArg, "" );
}
}
static double
getThreshVal_Otsu_8u( const Mat& _src )
@@ -620,8 +614,8 @@ getThreshVal_Otsu_8u( const Mat& _src )
for( i = 0; i < size.height; i++ )
{
const uchar* src = _src.data + _src.step*i;
j = 0;
#if CV_ENABLE_UNROLLED
for( ; j <= size.width - 4; j += 4 )
{
int v0 = src[j], v1 = src[j+1];
@@ -637,7 +631,7 @@ getThreshVal_Otsu_8u( const Mat& _src )
double mu = 0, scale = 1./(size.width*size.height);
for( i = 0; i < N; i++ )
mu += i*(double)h[i];
mu *= scale;
double mu1 = 0, q1 = 0;
double max_sigma = 0, max_val = 0;
@@ -719,7 +713,7 @@ private:
};
}
double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double maxval, int type )
{
Mat src = _src.getMat();
@@ -731,12 +725,12 @@ double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double m
CV_Assert( src.type() == CV_8UC1 );
thresh = getThreshVal_Otsu_8u(src);
}
_dst.create( src.size(), src.type() );
Mat dst = _dst.getMat();
int nStripes = 1;
#if defined HAVE_TBB && defined HAVE_TEGRA_OPTIMIZATION
#if defined HAVE_TBB && defined ANDROID
nStripes = 4;
#endif
@@ -765,7 +759,6 @@ double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double m
}
else
{
//thresh_8u( src, dst, (uchar)ithresh, (uchar)imaxval, type );
parallel_for(BlockedRange(0, nStripes),
ThresholdRunner(src, dst, nStripes, (uchar)ithresh, (uchar)imaxval, type));
}
@@ -778,7 +771,7 @@ double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double m
if( type == THRESH_TRUNC )
imaxval = ithresh;
imaxval = saturate_cast<short>(imaxval);
if( ithresh < SHRT_MIN || ithresh >= SHRT_MAX )
{
if( type == THRESH_BINARY || type == THRESH_BINARY_INV ||
@@ -795,14 +788,12 @@ double cv::threshold( InputArray _src, OutputArray _dst, double thresh, double m
}
else
{
//thresh_16s( src, dst, (short)ithresh, (short)imaxval, type );
parallel_for(BlockedRange(0, nStripes),
ThresholdRunner(src, dst, nStripes, (short)ithresh, (short)imaxval, type));
}
}
else if( src.depth() == CV_32F )
{
//thresh_32f( src, dst, (float)thresh, (float)maxval, type );
parallel_for(BlockedRange(0, nStripes),
ThresholdRunner(src, dst, nStripes, (float)thresh, (float)maxval, type));
}
@@ -829,7 +820,7 @@ void cv::adaptiveThreshold( InputArray _src, OutputArray _dst, double maxValue,
dst = Scalar(0);
return;
}
Mat mean;
if( src.data != dst.data )
@@ -846,7 +837,7 @@ void cv::adaptiveThreshold( InputArray _src, OutputArray _dst, double maxValue,
int i, j;
uchar imaxval = saturate_cast<uchar>(maxValue);
int idelta = type == THRESH_BINARY ? cvCeil(delta) : cvFloor(delta);
uchar tab[768];
if( type == CV_THRESH_BINARY )
for( i = 0; i < 768; i++ )
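Several of the hunks above land inside getThreshVal_Otsu_8u, which walks a 256-bin histogram for the threshold maximizing between-class variance (the mu, mu1, q1, max_sigma variables in the context lines). The same scan in a compact standalone form, an illustration rather than the exact library loop:

// Otsu: pick t maximizing w1*w2*(mu1 - mu2)^2 over a 256-bin histogram.
int otsuThreshold(const int h[256], int totalPixels)
{
    double sumAll = 0;
    for (int i = 0; i < 256; i++)
        sumAll += i * (double)h[i];

    double sumB = 0, wB = 0, maxSigma = 0;
    int best = 0;
    for (int t = 0; t < 256; t++)
    {
        wB += h[t];                        // pixels at or below t
        if (wB == 0) continue;
        double wF = totalPixels - wB;      // pixels above t
        if (wF == 0) break;
        sumB += t * (double)h[t];
        double mu1 = sumB / wB;
        double mu2 = (sumAll - sumB) / wF;
        double sigma = wB * wF * (mu1 - mu2) * (mu1 - mu2);
        if (sigma > maxSigma) { maxSigma = sigma; best = t; }
    }
    return best;
}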

View File

@@ -488,7 +488,7 @@ public:
bool balanced=false );
virtual float predict( const CvMat* sample, bool returnDFVal=false ) const;
virtual float predict( const CvMat* samples, CvMat* results ) const;
virtual float predict( const CvMat* samples, CV_OUT CvMat* results ) const;
#ifndef SWIG
CV_WRAP CvSVM( const cv::Mat& trainData, const cv::Mat& responses,
@@ -510,6 +510,7 @@ public:
CvParamGrid degreeGrid = CvSVM::get_default_grid(CvSVM::DEGREE),
bool balanced=false);
CV_WRAP virtual float predict( const cv::Mat& sample, bool returnDFVal=false ) const;
CV_WRAP_AS(predict_all) void predict( cv::InputArray samples, cv::OutputArray results ) const;
#endif
CV_WRAP virtual int get_support_vector_count() const;

View File

@@ -1250,7 +1250,7 @@ CvBoost::update_weights( CvBoostTree* tree )
if( have_subsample )
{
float* values = (float*)cur_buf_pos;
cur_buf_pos = (uchar*)(values + data->buf->step);
cur_buf_pos = (uchar*)(values + data->buf->cols);
uchar* missing = cur_buf_pos;
cur_buf_pos = missing + data->buf->step;
CvMat _sample, _mask;
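The buf->step to buf->cols change is a units fix: step is the row stride in bytes, cols counts elements, and the code is slicing a float scratch area out of the buffer, so advancing a float* by step skips sizeof(float) times too many elements. The distinction in miniature:

#include <cassert>
#include <cstddef>

void strideUnits()
{
    const int cols = 100;                               // elements per row
    const std::size_t stepBytes = cols * sizeof(float); // same row, in bytes
    // Pointer arithmetic on float* counts elements, not bytes:
    // values + cols       -> one row further (the fix)
    // values + stepBytes  -> sizeof(float) rows further (the old bug)
    assert(stepBytes / sizeof(float) == (std::size_t)cols);
}

Note the following line keeps step for the uchar* missing mask, where byte and element strides coincide.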

View File

@@ -2124,6 +2124,12 @@ float CvSVM::predict(const CvMat* samples, CV_OUT CvMat* results) const
return result;
}
void CvSVM::predict( cv::InputArray _samples, cv::OutputArray _results ) const
{
_results.create(_samples.size().height, 1, CV_32F);
CvMat samples = _samples.getMat(), results = _results.getMat();
predict(&samples, &results);
}
CvSVM::CvSVM( const Mat& _train_data, const Mat& _responses,
const Mat& _var_idx, const Mat& _sample_idx, CvSVMParams _params )
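The new overload (exposed to Python as predict_all via the CV_WRAP_AS in the header hunk) allocates one CV_32F result row per input sample and forwards to the existing CvMat* path. An assumed usage sketch, given an already trained CvSVM and one CV_32F feature row per sample:

#include <opencv2/ml/ml.hpp>

cv::Mat predictAll(const CvSVM& svm, const cv::Mat& samples)
{
    cv::Mat results;                // filled as samples.rows x 1, CV_32F
    svm.predict(samples, results);  // the batch overload added above
    return results;
}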

View File

@@ -60,7 +60,7 @@ add_custom_command(
DEPENDS ${opencv_hdrs})
add_library(${the_module} SHARED src2/cv2.cpp ${CMAKE_CURRENT_BINARY_DIR}/generated0.i ${cv2_generated_hdrs} src2/cv2.cv.hpp)
if(PYTHON_DEBUG_LIBRARIES)
if(PYTHON_DEBUG_LIBRARIES AND NOT PYTHON_LIBRARIES MATCHES "optimized.*debug")
target_link_libraries(${the_module} debug ${PYTHON_DEBUG_LIBRARIES} optimized ${PYTHON_LIBRARIES})
else()
target_link_libraries(${the_module} ${PYTHON_LIBRARIES})