Merge branch 'master' of https://github.com/Itseez/opencv

Added python binding for createButton
2016-06-20 16:24:23 -05:00
parent 9b959072a2 4597b94099
commit e8bfb48490
36 changed files with 1665 additions and 561 deletions
--- a/modules/calib3d/include/opencv2/calib3d.hpp
+++ b/modules/calib3d/include/opencv2/calib3d.hpp
@@ -767,6 +767,14 @@ k-th translation vector (see the next output parameter description) brings the c
 from the model coordinate space (in which object points are specified) to the world coordinate
 space, that is, a real position of the calibration pattern in the k-th pattern view (k=0.. *M* -1).
@param tvecs Output vector of translation vectors estimated for each pattern view.
+@param stdDeviationsIntrinsics Output vector of standard deviations estimated for intrinsic parameters.
+ Order of deviations values:
+\f$(f_x, f_y, c_x, c_y, k_1, k_2, p_1, p_2, k_3, k_4, k_5, k_6 , s_1, s_2, s_3,
+ s_4, \tau_x, \tau_y)\f$ If one of the parameters is not estimated, its deviation is equal to zero.
+@param stdDeviationsExtrinsics Output vector of standard deviations estimated for extrinsic parameters.
+ Order of deviations values: \f$(R_1, T_1, \dotsc , R_M, T_M)\f$ where M is the number of pattern views,
+ \f$R_i, T_i\f$ are concatenated 1x3 vectors.
+@param perViewErrors Output vector of the RMS re-projection error estimated for each pattern view.
@param flags Different flags that may be zero or a combination of the following values:
 -   **CV_CALIB_USE_INTRINSIC_GUESS** cameraMatrix contains valid initial values of
 fx, fy, cx, cy that are optimized further. Otherwise, (cx, cy) is initially set to the image
@@ -841,6 +849,24 @@ The function returns the final re-projection error.
@sa
   findChessboardCorners, solvePnP, initCameraMatrix2D, stereoCalibrate, undistort
 */
+CV_EXPORTS_AS(calibrateCameraExtended) double calibrateCamera( InputArrayOfArrays objectPoints,
+                                     InputArrayOfArrays imagePoints, Size imageSize,
+                                     InputOutputArray cameraMatrix, InputOutputArray distCoeffs,
+                                     OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs,
+                                     OutputArray stdDeviationsIntrinsics,
+                                     OutputArray stdDeviationsExtrinsics,
+                                     OutputArray perViewErrors,
+                                     int flags = 0, TermCriteria criteria = TermCriteria(
+                                        TermCriteria::COUNT + TermCriteria::EPS, 30, DBL_EPSILON) );
+
+/** @overload double calibrateCamera( InputArrayOfArrays objectPoints,
+                                     InputArrayOfArrays imagePoints, Size imageSize,
+                                     InputOutputArray cameraMatrix, InputOutputArray distCoeffs,
+                                     OutputArrayOfArrays rvecs, OutputArrayOfArrays tvecs,
+                                     int flags = 0,
+                                     TermCriteria criteria = TermCriteria(
+                                        TermCriteria::COUNT + TermCriteria::EPS, 30, DBL_EPSILON) )
+ */
 CV_EXPORTS_W double calibrateCamera( InputArrayOfArrays objectPoints,
                                     InputArrayOfArrays imagePoints, Size imageSize,
                                     InputOutputArray cameraMatrix, InputOutputArray distCoeffs,
--- a/modules/calib3d/include/opencv2/calib3d/calib3d_c.h
+++ b/modules/calib3d/include/opencv2/calib3d/calib3d_c.h
@@ -246,6 +246,7 @@ CVAPI(void) cvDrawChessboardCorners( CvArr* image, CvSize pattern_size,
 #define CV_CALIB_TILTED_MODEL  262144
 #define CV_CALIB_FIX_TAUX_TAUY  524288

+#define CV_CALIB_NINTRINSIC 18

 /* Finds intrinsic and extrinsic camera parameters
   from a few views of known calibration pattern */
--- a/modules/calib3d/src/calibration.cpp
+++ b/modules/calib3d/src/calibration.cpp
@@ -1181,7 +1181,6 @@ CV_IMPL void cvFindExtrinsicCameraParams2( const CvMat* objectPoints,
    cvConvert( &_t, tvec );
 }

-
 CV_IMPL void cvInitIntrinsicParams2D( const CvMat* objectPoints,
                         const CvMat* imagePoints, const CvMat* npoints,
                         CvSize imageSize, CvMat* cameraMatrix,
@@ -1270,15 +1269,37 @@ CV_IMPL void cvInitIntrinsicParams2D( const CvMat* objectPoints,
    cvConvert( &_a, cameraMatrix );
 }

+static void subMatrix(const cv::Mat& src, cv::Mat& dst, const std::vector<uchar>& cols,
+                      const std::vector<uchar>& rows) {
+    int nonzeros_cols = cv::countNonZero(cols);
+    cv::Mat tmp(src.rows, nonzeros_cols, CV_64FC1);

-/* finds intrinsic and extrinsic camera parameters
-   from a few views of known calibration pattern */
-CV_IMPL double cvCalibrateCamera2( const CvMat* objectPoints,
+    for (int i = 0, j = 0; i < (int)cols.size(); i++)
+    {
+        if (cols[i])
+        {
+            src.col(i).copyTo(tmp.col(j++));
+        }
+    }
+
+    int nonzeros_rows  = cv::countNonZero(rows);
+    dst.create(nonzeros_rows, nonzeros_cols, CV_64FC1);
+    for (int i = 0, j = 0; i < (int)rows.size(); i++)
+    {
+        if (rows[i])
+        {
+            tmp.row(i).copyTo(dst.row(j++));
+        }
+    }
+}
+
+static double cvCalibrateCamera2Internal( const CvMat* objectPoints,
                    const CvMat* imagePoints, const CvMat* npoints,
                    CvSize imageSize, CvMat* cameraMatrix, CvMat* distCoeffs,
-                    CvMat* rvecs, CvMat* tvecs, int flags, CvTermCriteria termCrit )
+                    CvMat* rvecs, CvMat* tvecs, CvMat* stdDevs,
+                    CvMat* perViewErrors, int flags, CvTermCriteria termCrit )
 {
-    const int NINTRINSIC = 18;
+    const int NINTRINSIC = CV_CALIB_NINTRINSIC;
    double reprojErr = 0;

    Matx33d A;
@@ -1338,6 +1359,20 @@ CV_IMPL double cvCalibrateCamera2( const CvMat* objectPoints,
                "1xn or nx1 array or 1-channel nx3 array, where n is the number of views" );
    }

+    if( stdDevs )
+    {
+        cn = CV_MAT_CN(stdDevs->type);
+        if( !CV_IS_MAT(stdDevs) ||
+            (CV_MAT_DEPTH(stdDevs->type) != CV_32F && CV_MAT_DEPTH(stdDevs->type) != CV_64F) ||
+            ((stdDevs->rows != (nimages*6 + NINTRINSIC) || stdDevs->cols*cn != 1) &&
+            (stdDevs->rows != 1 || stdDevs->cols != (nimages*6 + NINTRINSIC) || cn != 1)) )
+#define STR__(x) #x
+#define STR_(x) STR__(x)
+            CV_Error( CV_StsBadArg, "the output array of standard deviations vectors must be 1-channel "
+                "1x(n*6 + NINTRINSIC) or (n*6 + NINTRINSIC)x1 array, where n is the number of views,"
+                " NINTRINSIC = " STR_(CV_CALIB_NINTRINSIC));
+    }
+
    if( (CV_MAT_TYPE(cameraMatrix->type) != CV_32FC1 &&
        CV_MAT_TYPE(cameraMatrix->type) != CV_64FC1) ||
        cameraMatrix->rows != 3 || cameraMatrix->cols != 3 )
@@ -1367,6 +1402,7 @@ CV_IMPL double cvCalibrateCamera2( const CvMat* objectPoints,

    Mat matM( 1, total, CV_64FC3 );
    Mat _m( 1, total, CV_64FC2 );
+    Mat allErrors(1, total, CV_64FC2);

    if(CV_MAT_CN(objectPoints->type) == 3) {
        cvarrToMat(objectPoints).convertTo(matM, CV_64F);
@@ -1518,6 +1554,7 @@ CV_IMPL double cvCalibrateCamera2( const CvMat* objectPoints,
        double* _errNorm = 0;
        bool proceed = solver.updateAlt( _param, _JtJ, _JtErr, _errNorm );
        double *param = solver.param->data.db, *pparam = solver.prevParam->data.db;
+        bool calcJ = solver.state == CvLevMarq::CALC_J || (!proceed && stdDevs);

        if( flags & CALIB_FIX_ASPECT_RATIO )
        {
@@ -1528,8 +1565,10 @@ CV_IMPL double cvCalibrateCamera2( const CvMat* objectPoints,
        A(0, 0) = param[0]; A(1, 1) = param[1]; A(0, 2) = param[2]; A(1, 2) = param[3];
        std::copy(param + 4, param + 4 + 14, k);

-        if( !proceed )
+        if ( !proceed && !stdDevs && !perViewErrors )
            break;
+        else if ( !proceed && stdDevs )
+            cvZero(_JtJ);

        reprojErr = 0;

@@ -1543,6 +1582,7 @@ CV_IMPL double cvCalibrateCamera2( const CvMat* objectPoints,

            CvMat _Mi(matM.colRange(pos, pos + ni));
            CvMat _mi(_m.colRange(pos, pos + ni));
+            CvMat _me(allErrors.colRange(pos, pos + ni));

            _Je.resize(ni*2); _Ji.resize(ni*2); _err.resize(ni*2);
            CvMat _dpdr(_Je.colRange(0, 3));
@@ -1552,7 +1592,7 @@ CV_IMPL double cvCalibrateCamera2( const CvMat* objectPoints,
            CvMat _dpdk(_Ji.colRange(4, NINTRINSIC));
            CvMat _mp(_err.reshape(2, 1));

-            if( solver.state == CvLevMarq::CALC_J )
+            if( calcJ )
            {
                 cvProjectPoints2( &_Mi, &_ri, &_ti, &matA, &_k, &_mp, &_dpdr, &_dpdt,
                                  (flags & CALIB_FIX_FOCAL_LENGTH) ? 0 : &_dpdf,
@@ -1563,8 +1603,10 @@ CV_IMPL double cvCalibrateCamera2( const CvMat* objectPoints,
                cvProjectPoints2( &_Mi, &_ri, &_ti, &matA, &_k, &_mp );

            cvSub( &_mp, &_mi, &_mp );
+            if (perViewErrors || stdDevs)
+                cvCopy(&_mp, &_me);

-            if( solver.state == CvLevMarq::CALC_J )
+            if( calcJ )
            {
                Mat JtJ(cvarrToMat(_JtJ)), JtErr(cvarrToMat(_JtErr));

@@ -1581,15 +1623,52 @@ CV_IMPL double cvCalibrateCamera2( const CvMat* objectPoints,
        }
        if( _errNorm )
            *_errNorm = reprojErr;
+
+        if( !proceed )
+        {
+            if( stdDevs )
+            {
+                Mat mask = cvarrToMat(solver.mask);
+                int nparams_nz = countNonZero(mask);
+                Mat JtJinv, JtJN;
+                JtJN.create(nparams_nz, nparams_nz, CV_64F);
+                subMatrix(cvarrToMat(_JtJ), JtJN, mask, mask);
+                completeSymm(JtJN, false);
+                cv::invert(JtJN, JtJinv, DECOMP_SVD);
+                //sigma2 is the estimated variance of the noise
+                //see any paper on the variance of the least squares estimator for
+                //a detailed description of the variance estimation methods
+                double sigma2 = norm(allErrors, NORM_L2SQR) / (total - nparams_nz);
+                Mat stdDevsM = cvarrToMat(stdDevs);
+                int j = 0;
+                for ( int s = 0; s < nparams; s++ )
+                    if( mask.data[s] )
+                    {
+                        stdDevsM.at<double>(s) = std::sqrt(JtJinv.at<double>(j,j) * sigma2);
+                        j++;
+                    }
+                    else
+                        stdDevsM.at<double>(s) = 0.;
+            }
+            break;
+        }
    }

    // 4. store the results
    cvConvert( &matA, cameraMatrix );
    cvConvert( &_k, distCoeffs );

-    for( i = 0; i < nimages; i++ )
+    for( i = 0, pos = 0; i < nimages; i++ )
    {
        CvMat src, dst;
+        if( perViewErrors )
+        {
+            ni = npoints->data.i[i*npstep];
+            perViewErrors->data.db[i] = std::sqrt(cv::norm(allErrors.colRange(pos, pos + ni),
+                                                           NORM_L2SQR) / ni);
+            pos+=ni;
+        }
+
        if( rvecs )
        {
            src = cvMat( 3, 1, CV_64F, solver.param->data.db + NINTRINSIC + i*6 );
@@ -1622,6 +1701,17 @@ CV_IMPL double cvCalibrateCamera2( const CvMat* objectPoints,
 }


+/* finds intrinsic and extrinsic camera parameters
+   from a few views of known calibration pattern */
+CV_IMPL double cvCalibrateCamera2( const CvMat* objectPoints,
+                    const CvMat* imagePoints, const CvMat* npoints,
+                    CvSize imageSize, CvMat* cameraMatrix, CvMat* distCoeffs,
+                    CvMat* rvecs, CvMat* tvecs, int flags, CvTermCriteria termCrit )
+{
+    return cvCalibrateCamera2Internal(objectPoints, imagePoints, npoints, imageSize, cameraMatrix,
+                                      distCoeffs, rvecs, tvecs, NULL, NULL, flags, termCrit);
+}
+
 void cvCalibrationMatrixValues( const CvMat *calibMatr, CvSize imgSize,
    double apertureWidth, double apertureHeight, double *fovx, double *fovy,
    double *focalLength, CvPoint2D64f *principalPoint, double *pasp )
@@ -1772,7 +1862,7 @@ double cvStereoCalibrate( const CvMat* _objectPoints, const CvMat* _imagePoints1
        if( !(flags & (CV_CALIB_FIX_INTRINSIC|CV_CALIB_USE_INTRINSIC_GUESS)))
        {
            cvCalibrateCamera2( objectPoints, imagePoints[k],
-                npoints, imageSize, &K[k], &Dist[k], 0, 0, flags );
+                npoints, imageSize, &K[k], &Dist[k], NULL, NULL, flags );
        }
    }

@@ -3091,7 +3181,6 @@ static void collectCalibrationData( InputArrayOfArrays objectPoints,
    }
 }

-
 static Mat prepareCameraMatrix(Mat& cameraMatrix0, int rtype)
 {
    Mat cameraMatrix = Mat::eye(3, 3, rtype);
@@ -3287,10 +3376,23 @@ cv::Mat cv::initCameraMatrix2D( InputArrayOfArrays objectPoints,
 }


+
 double cv::calibrateCamera( InputArrayOfArrays _objectPoints,
                            InputArrayOfArrays _imagePoints,
                            Size imageSize, InputOutputArray _cameraMatrix, InputOutputArray _distCoeffs,
                            OutputArrayOfArrays _rvecs, OutputArrayOfArrays _tvecs, int flags, TermCriteria criteria )
+{
+    return calibrateCamera(_objectPoints, _imagePoints, imageSize, _cameraMatrix, _distCoeffs,
+                                         _rvecs, _tvecs, noArray(), noArray(), noArray(), flags, criteria);
+}
+
+double cv::calibrateCamera(InputArrayOfArrays _objectPoints,
+                            InputArrayOfArrays _imagePoints,
+                            Size imageSize, InputOutputArray _cameraMatrix, InputOutputArray _distCoeffs,
+                            OutputArrayOfArrays _rvecs, OutputArrayOfArrays _tvecs,
+                            OutputArray stdDeviationsIntrinsics,
+                            OutputArray stdDeviationsExtrinsics,
+                            OutputArray _perViewErrors, int flags, TermCriteria criteria )
 {
    int rtype = CV_64F;
    Mat cameraMatrix = _cameraMatrix.getMat();
@@ -3304,14 +3406,17 @@ double cv::calibrateCamera( InputArrayOfArrays _objectPoints,

    int nimages = int(_objectPoints.total());
    CV_Assert( nimages > 0 );
-    Mat objPt, imgPt, npoints, rvecM, tvecM;
+    Mat objPt, imgPt, npoints, rvecM, tvecM, stdDeviationsM, errorsM;

-    bool rvecs_needed = _rvecs.needed(), tvecs_needed = _tvecs.needed();
+    bool rvecs_needed = _rvecs.needed(), tvecs_needed = _tvecs.needed(),
+            stddev_needed = stdDeviationsIntrinsics.needed(), errors_needed = _perViewErrors.needed(),
+            stddev_ext_needed = stdDeviationsExtrinsics.needed();

    bool rvecs_mat_vec = _rvecs.isMatVector();
    bool tvecs_mat_vec = _tvecs.isMatVector();

-    if( rvecs_needed ) {
+    if( rvecs_needed )
+    {
        _rvecs.create(nimages, 1, CV_64FC3);

        if(rvecs_mat_vec)
@@ -3320,7 +3425,8 @@ double cv::calibrateCamera( InputArrayOfArrays _objectPoints,
            rvecM = _rvecs.getMat();
    }

-    if( tvecs_needed ) {
+    if( tvecs_needed )
+    {
        _tvecs.create(nimages, 1, CV_64FC3);

        if(tvecs_mat_vec)
@@ -3329,16 +3435,48 @@ double cv::calibrateCamera( InputArrayOfArrays _objectPoints,
            tvecM = _tvecs.getMat();
    }

+    if( stddev_needed || stddev_ext_needed )
+    {
+        stdDeviationsM.create(nimages*6 + CV_CALIB_NINTRINSIC, 1, CV_64F);
+    }
+
+    if( errors_needed )
+    {
+        _perViewErrors.create(nimages, 1, CV_64F);
+        errorsM = _perViewErrors.getMat();
+    }
+
    collectCalibrationData( _objectPoints, _imagePoints, noArray(),
                            objPt, imgPt, 0, npoints );
    CvMat c_objPt = objPt, c_imgPt = imgPt, c_npoints = npoints;
    CvMat c_cameraMatrix = cameraMatrix, c_distCoeffs = distCoeffs;
-    CvMat c_rvecM = rvecM, c_tvecM = tvecM;
+    CvMat c_rvecM = rvecM, c_tvecM = tvecM, c_stdDev = stdDeviationsM, c_errors = errorsM;

-    double reprojErr = cvCalibrateCamera2(&c_objPt, &c_imgPt, &c_npoints, imageSize,
+    // Intrinsic and extrinsic deviations share one buffer that only the solver fills, so it
+    // must be passed whenever either output is requested (not just the intrinsic one).
+    double reprojErr = cvCalibrateCamera2Internal(&c_objPt, &c_imgPt, &c_npoints, imageSize,
                                          &c_cameraMatrix, &c_distCoeffs,
                                          rvecs_needed ? &c_rvecM : NULL,
-                                          tvecs_needed ? &c_tvecM : NULL, flags, criteria );
+                                          tvecs_needed ? &c_tvecM : NULL,
+                                          (stddev_needed || stddev_ext_needed) ? &c_stdDev : NULL,
+                                          errors_needed ? &c_errors : NULL, flags, criteria );
+
+    if( stddev_needed )
+    {
+        stdDeviationsIntrinsics.create(CV_CALIB_NINTRINSIC, 1, CV_64F);
+        Mat stdDeviationsIntrinsicsMat = stdDeviationsIntrinsics.getMat();
+        std::memcpy(stdDeviationsIntrinsicsMat.ptr(), stdDeviationsM.ptr(),
+                    CV_CALIB_NINTRINSIC*sizeof(double));
+    }
+
+    if ( stddev_ext_needed )
+    {
+        stdDeviationsExtrinsics.create(nimages*6, 1, CV_64F);
+        Mat stdDeviationsExtrinsicsMat = stdDeviationsExtrinsics.getMat();
+        std::memcpy(stdDeviationsExtrinsicsMat.ptr(),
+                    stdDeviationsM.ptr() + CV_CALIB_NINTRINSIC*sizeof(double),
+                    nimages*6*sizeof(double));
+    }

    // overly complicated and inefficient rvec/ tvec handling to support vector<Mat>
    for(int i = 0; i < nimages; i++ )
--- a/modules/calib3d/src/compat_ptsetreg.cpp
+++ b/modules/calib3d/src/compat_ptsetreg.cpp
@@ -241,6 +241,8 @@ bool CvLevMarq::updateAlt( const CvMat*& _param, CvMat*& _JtJ, CvMat*& _JtErr, d
        cvNorm(param, prevParam, CV_RELATIVE_L2) < criteria.epsilon )
    {
        _param = param;
+        _JtJ = JtJ;
+        _JtErr = JtErr;
        state = DONE;
        return false;
    }
--- a/modules/calib3d/test/test_cameracalibration.cpp
+++ b/modules/calib3d/test/test_cameracalibration.cpp
@@ -259,7 +259,7 @@ protected:
    virtual void calibrate( int imageCount, int* pointCounts,
        CvSize imageSize, CvPoint2D64f* imagePoints, CvPoint3D64f* objectPoints,
        double* distortionCoeffs, double* cameraMatrix, double* translationVectors,
-        double* rotationMatrices, int flags ) = 0;
+        double* rotationMatrices, double *stdDevs, double* perViewErrors, int flags ) = 0;
    virtual void project( int pointCount, CvPoint3D64f* objectPoints,
        double* rotationMatrix, double*  translationVector,
        double* cameraMatrix, double* distortion, CvPoint2D64f* imagePoints ) = 0;
@@ -303,9 +303,13 @@ void CV_CameraCalibrationTest::run( int start_from )

    double*       transVects;
    double*       rotMatrs;
+    double*       stdDevs;
+    double*       perViewErrors;

    double*       goodTransVects;
    double*       goodRotMatrs;
+    double*       goodPerViewErrors;
+    double*       goodStdDevs;

    double          cameraMatrix[3*3];
    double          distortion[5]={0,0,0,0,0};
@@ -424,9 +428,13 @@ void CV_CameraCalibrationTest::run( int start_from )
        /* Allocate memory for translate vectors and rotmatrixs*/
        transVects     = (double*)cvAlloc(3 * 1 * numImages * sizeof(double));
        rotMatrs       = (double*)cvAlloc(3 * 3 * numImages * sizeof(double));
+        stdDevs        = (double*)cvAlloc((CV_CALIB_NINTRINSIC + 6*numImages) * sizeof(double));
+        perViewErrors  = (double*)cvAlloc(numImages * sizeof(double));

        goodTransVects = (double*)cvAlloc(3 * 1 * numImages * sizeof(double));
        goodRotMatrs   = (double*)cvAlloc(3 * 3 * numImages * sizeof(double));
+        goodPerViewErrors  = (double*)cvAlloc(numImages * sizeof(double));
+        goodStdDevs = (double*)cvAlloc((CV_CALIB_NINTRINSIC + 6*numImages) * sizeof(double));

        /* Read object points */
        i = 0;/* shift for current point */
@@ -501,6 +509,13 @@ void CV_CameraCalibrationTest::run( int start_from )
            }
        }

+        /* Read good stdDeviations */
+        for (i = 0; i < CV_CALIB_NINTRINSIC + numImages*6; i++)
+        {
+            values_read = fscanf(file, "%lf", goodStdDevs + i);
+            CV_Assert(values_read == 1);
+        }
+
        calibFlags = 0
                     // + CV_CALIB_FIX_PRINCIPAL_POINT
                     // + CV_CALIB_ZERO_TANGENT_DIST
@@ -526,6 +541,8 @@ void CV_CameraCalibrationTest::run( int start_from )
                    cameraMatrix,
                    transVects,
                    rotMatrs,
+                    stdDevs,
+                    perViewErrors,
                    calibFlags );

        /* ---- Reproject points to the image ---- */
@@ -553,6 +570,8 @@ void CV_CameraCalibrationTest::run( int start_from )
        meanDy = 0;
        for( currImage = 0; currImage < numImages; currImage++ )
        {
+            double imageMeanDx = 0;
+            double imageMeanDy = 0;
            for( currPoint = 0; currPoint < etalonSize.width * etalonSize.height; currPoint++ )
            {
                rx = reprojectPoints[i].x;
@@ -563,6 +582,9 @@ void CV_CameraCalibrationTest::run( int start_from )
                meanDx += dx;
                meanDy += dy;

+                imageMeanDx += dx*dx;
+                imageMeanDy += dy*dy;
+
                dx = fabs(dx);
                dy = fabs(dy);

@@ -573,6 +595,13 @@ void CV_CameraCalibrationTest::run( int start_from )
                    maxDy = dy;
                i++;
            }
+            goodPerViewErrors[currImage] = sqrt( (imageMeanDx + imageMeanDy) /
+                                           (etalonSize.width * etalonSize.height));
+
+            //only for the C version of the test (it does not provide evaluation of perViewErrors
+            //and returns zeros)
+            if(perViewErrors[currImage] == 0.0)
+                perViewErrors[currImage] = goodPerViewErrors[currImage];
        }

        meanDx /= numImages * etalonSize.width * etalonSize.height;
@@ -613,6 +642,23 @@ void CV_CameraCalibrationTest::run( int start_from )
        if( code < 0 )
            goto _exit_;

+        /* ----- Compare per view re-projection errors ----- */
+        code = compare(perViewErrors,goodPerViewErrors, numImages,0.1,"per view errors vector");
+        if( code < 0 )
+            goto _exit_;
+
+        /* ----- Compare standard deviations of parameters ----- */
+        //only for the C version of the test (it does not provide evaluation of stdDevs
+        //and returns zeros)
+        for ( i = 0; i < CV_CALIB_NINTRINSIC + 6*numImages; i++)
+        {
+            if(stdDevs[i] == 0.0)
+                stdDevs[i] = goodStdDevs[i];
+        }
+        code = compare(stdDevs,goodStdDevs, CV_CALIB_NINTRINSIC + 6*numImages,.5,"stdDevs vector");
+        if( code < 0 )
+            goto _exit_;
+
        if( maxDx > 1.0 )
        {
            ts->printf( cvtest::TS::LOG,
@@ -636,8 +682,12 @@ void CV_CameraCalibrationTest::run( int start_from )

        cvFree(&transVects);
        cvFree(&rotMatrs);
+        cvFree(&stdDevs);
+        cvFree(&perViewErrors);
        cvFree(&goodTransVects);
        cvFree(&goodRotMatrs);
+        cvFree(&goodPerViewErrors);
+        cvFree(&goodStdDevs);

        fclose(file);
        file = 0;
@@ -676,20 +726,28 @@ protected:
    virtual void calibrate( int imageCount, int* pointCounts,
        CvSize imageSize, CvPoint2D64f* imagePoints, CvPoint3D64f* objectPoints,
        double* distortionCoeffs, double* cameraMatrix, double* translationVectors,
-        double* rotationMatrices, int flags );
+        double* rotationMatrices, double *stdDevs, double* perViewErrors, int flags );
    virtual void project( int pointCount, CvPoint3D64f* objectPoints,
        double* rotationMatrix, double*  translationVector,
        double* cameraMatrix, double* distortion, CvPoint2D64f* imagePoints );
 };

-void CV_CameraCalibrationTest_C::calibrate( int imageCount, int* pointCounts,
+void CV_CameraCalibrationTest_C::calibrate(int imageCount, int* pointCounts,
        CvSize imageSize, CvPoint2D64f* imagePoints, CvPoint3D64f* objectPoints,
        double* distortionCoeffs, double* cameraMatrix, double* translationVectors,
-        double* rotationMatrices, int flags )
+        double* rotationMatrices, double *stdDevs, double *perViewErrors, int flags )
 {
    int i, total = 0;
    for( i = 0; i < imageCount; i++ )
+    {
+        perViewErrors[i] = 0.0;
        total += pointCounts[i];
+    }
+
+    for( i = 0; i < CV_CALIB_NINTRINSIC + imageCount*6; i++)
+    {
+        stdDevs[i] = 0.0;
+    }

    CvMat _objectPoints = cvMat(1, total, CV_64FC3, objectPoints);
    CvMat _imagePoints = cvMat(1, total, CV_64FC2, imagePoints);
@@ -700,8 +758,7 @@ void CV_CameraCalibrationTest_C::calibrate( int imageCount, int* pointCounts,
    CvMat _translationVectors = cvMat(imageCount, 3, CV_64F, translationVectors);

    cvCalibrateCamera2(&_objectPoints, &_imagePoints, &_pointCounts, imageSize,
-                       &_cameraMatrix, &_distCoeffs, &_rotationMatrices, &_translationVectors,
-                       flags);
+                       &_cameraMatrix, &_distCoeffs, &_rotationMatrices, &_translationVectors, flags);
 }

 void CV_CameraCalibrationTest_C::project( int pointCount, CvPoint3D64f* objectPoints,
@@ -728,22 +785,24 @@ protected:
    virtual void calibrate( int imageCount, int* pointCounts,
        CvSize imageSize, CvPoint2D64f* imagePoints, CvPoint3D64f* objectPoints,
        double* distortionCoeffs, double* cameraMatrix, double* translationVectors,
-        double* rotationMatrices, int flags );
+        double* rotationMatrices, double *stdDevs, double* perViewErrors,  int flags );
    virtual void project( int pointCount, CvPoint3D64f* objectPoints,
        double* rotationMatrix, double*  translationVector,
        double* cameraMatrix, double* distortion, CvPoint2D64f* imagePoints );
 };

-void CV_CameraCalibrationTest_CPP::calibrate( int imageCount, int* pointCounts,
+void CV_CameraCalibrationTest_CPP::calibrate(int imageCount, int* pointCounts,
        CvSize _imageSize, CvPoint2D64f* _imagePoints, CvPoint3D64f* _objectPoints,
        double* _distortionCoeffs, double* _cameraMatrix, double* translationVectors,
-        double* rotationMatrices, int flags )
+        double* rotationMatrices, double *stdDevs, double *perViewErrors, int flags )
 {
    vector<vector<Point3f> > objectPoints( imageCount );
    vector<vector<Point2f> > imagePoints( imageCount );
    Size imageSize = _imageSize;
    Mat cameraMatrix, distCoeffs(1,4,CV_64F,Scalar::all(0));
    vector<Mat> rvecs, tvecs;
+    Mat stdDevsMatInt, stdDevsMatExt;
+    Mat perViewErrorsMat;

    CvPoint3D64f* op = _objectPoints;
    CvPoint2D64f* ip = _imagePoints;
@@ -770,8 +829,23 @@ void CV_CameraCalibrationTest_CPP::calibrate( int imageCount, int* pointCounts,
                     distCoeffs,
                     rvecs,
                     tvecs,
+                     stdDevsMatInt,
+                     stdDevsMatExt,
+                     perViewErrorsMat,
                     flags );

+    assert( stdDevsMatInt.type() == CV_64F );
+    assert( stdDevsMatInt.total() == static_cast<size_t>(CV_CALIB_NINTRINSIC) );
+    memcpy( stdDevs, stdDevsMatInt.ptr(), CV_CALIB_NINTRINSIC*sizeof(double) );
+
+    assert( stdDevsMatExt.type() == CV_64F );
+    assert( stdDevsMatExt.total() == static_cast<size_t>(6*imageCount) );
+    memcpy( stdDevs + CV_CALIB_NINTRINSIC, stdDevsMatExt.ptr(), 6*imageCount*sizeof(double) );
+
+    assert( perViewErrorsMat.type() == CV_64F);
+    assert( perViewErrorsMat.total() == static_cast<size_t>(imageCount) );
+    memcpy( perViewErrors, perViewErrorsMat.ptr(), imageCount*sizeof(double) );
+
    assert( cameraMatrix.type() == CV_64FC1 );
    memcpy( _cameraMatrix, cameraMatrix.ptr(), 9*sizeof(double) );

--- a/modules/core/CMakeLists.txt
+++ b/modules/core/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(the_description "The Core Functionality")
 ocv_add_module(core
-               PRIVATE_REQUIRED ${ZLIB_LIBRARIES} "${OPENCL_LIBRARIES}" "${VA_LIBRARIES}"
+               PRIVATE_REQUIRED ${ZLIB_LIBRARIES} "${OPENCL_LIBRARIES}" "${VA_LIBRARIES}" "${OPENCV_HAL_LINKER_LIBS}"
               OPTIONAL opencv_cudev
               WRAP java python)

--- a/modules/core/include/opencv2/core/hal/hal.hpp
+++ b/modules/core/include/opencv2/core/hal/hal.hpp
@@ -49,17 +49,6 @@
 #include "opencv2/core/cvstd.hpp"
 #include "opencv2/core/hal/interface.h"

-//! @cond IGNORED
-#define CALL_HAL(name, fun, ...) \
-    int res = fun(__VA_ARGS__); \
-    if (res == CV_HAL_ERROR_OK) \
-        return; \
-    else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \
-        CV_Error_(cv::Error::StsInternal, \
-            ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res));
-//! @endcond
-
-
 namespace cv { namespace hal {

 //! @addtogroup core_hal_functions
@@ -75,6 +64,21 @@ CV_EXPORTS int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int
 CV_EXPORTS int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
 CV_EXPORTS bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n);
 CV_EXPORTS bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n);
+CV_EXPORTS void SVD32f(float* At, size_t astep, float* W, float* U, size_t ustep, float* Vt, size_t vstep, int m, int n, int flags);
+CV_EXPORTS void SVD64f(double* At, size_t astep, double* W, double* U, size_t ustep, double* Vt, size_t vstep, int m, int n, int flags);
+
+CV_EXPORTS void gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                        float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);
+CV_EXPORTS void gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                        double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);
+CV_EXPORTS void gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                        float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);
+CV_EXPORTS void gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                        double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags);

 CV_EXPORTS int normL1_(const uchar* a, const uchar* b, int n);
 CV_EXPORTS float normL1_(const float* a, const float* b, int n);
--- a/modules/core/include/opencv2/core/hal/interface.h
+++ b/modules/core/include/opencv2/core/hal/interface.h
@@ -158,6 +158,21 @@ typedef signed char schar;
 #define CV_HAL_DFT_IS_INPLACE 1024
 //! @}

+//! @name SVD flags
+//! @{
+#define CV_HAL_SVD_NO_UV    1
+#define CV_HAL_SVD_SHORT_UV 2
+#define CV_HAL_SVD_MODIFY_A 4
+#define CV_HAL_SVD_FULL_UV  8
+//! @}
+
+//! @name Gemm flags
+//! @{
+#define CV_HAL_GEMM_1_T 1
+#define CV_HAL_GEMM_2_T 2
+#define CV_HAL_GEMM_3_T 4
+//! @}
+
 //! @}

 #endif
--- a/modules/core/include/opencv2/core/mat.hpp
+++ b/modules/core/include/opencv2/core/mat.hpp
@@ -2880,9 +2880,9 @@ public:
    //! copy operator
    MatConstIterator_& operator = (const MatConstIterator_& it);
    //! returns the current matrix element
-    _Tp operator *() const;
+    const _Tp& operator *() const;
    //! returns the i-th matrix element, relative to the current
-    _Tp operator [](ptrdiff_t i) const;
+    const _Tp& operator [](ptrdiff_t i) const;

    //! shifts the iterator forward by the specified number of elements
    MatConstIterator_& operator += (ptrdiff_t ofs);
--- a/modules/core/include/opencv2/core/mat.inl.hpp
+++ b/modules/core/include/opencv2/core/mat.inl.hpp
@@ -2550,7 +2550,7 @@ MatConstIterator_<_Tp>& MatConstIterator_<_Tp>::operator = (const MatConstIterat
 }

 template<typename _Tp> inline
-_Tp MatConstIterator_<_Tp>::operator *() const
+const _Tp& MatConstIterator_<_Tp>::operator *() const
 {
    return *(_Tp*)(this->ptr);
 }
@@ -2656,7 +2656,7 @@ MatConstIterator_<_Tp> operator - (const MatConstIterator_<_Tp>& a, ptrdiff_t of
 }

 template<typename _Tp> inline
-_Tp MatConstIterator_<_Tp>::operator [](ptrdiff_t i) const
+const _Tp& MatConstIterator_<_Tp>::operator [](ptrdiff_t i) const
 {
    return *(_Tp*)MatConstIterator::operator [](i);
 }
--- a/modules/core/include/opencv2/core/matx.hpp
+++ b/modules/core/include/opencv2/core/matx.hpp
@@ -438,7 +438,7 @@ template<typename _Tp, int m> struct Matx_DetOp
            return p;
        for( int i = 0; i < m; i++ )
            p *= temp(i, i);
-        return 1./p;
+        return p;
    }
 };

--- a/modules/core/src/hal_internal.cpp
+++ b/modules/core/src/hal_internal.cpp
@@ -0,0 +1,485 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#include "hal_internal.hpp"
+
+#ifdef HAVE_LAPACK
+
+#include <cmath>
+#include <lapacke.h>
+#include <cblas.h>
+#include <algorithm>
+#include <typeinfo>
+#include <limits>
+#include <complex>
+
+#define HAL_GEMM_SMALL_COMPLEX_MATRIX_THRESH 100
+#define HAL_GEMM_SMALL_MATRIX_THRESH 100
+#define HAL_SVD_SMALL_MATRIX_THRESH 25
+#define HAL_LU_SMALL_MATRIX_THRESH 100
+#define HAL_CHOLESKY_SMALL_MATRIX_THRESH 100
+
+//LAPACK stores matrices in column-major order, so transposing is needed everywhere.
+template <typename fptype> static inline void
+transpose_square_inplace(fptype *src, size_t src_ld, size_t m) //in-place transpose of an m x m matrix; src_ld is the row stride in elements
+{
+    for(size_t i = 0; i + 1 < m; i++) //"i + 1 < m" instead of "i < m - 1": m is unsigned, so m - 1 wraps around when m == 0
+        for(size_t j = i + 1; j < m; j++)
+            std::swap(src[j*src_ld + i], src[i*src_ld + j]);
+}
+
+template <typename fptype> static inline void //dst receives the transpose of the m x n matrix src; *_ld are row strides in elements
+transpose(const fptype *src, size_t src_ld, fptype* dst, size_t dst_ld, size_t m, size_t n)
+{
+    for(size_t row = 0; row != m; ++row)
+        for(size_t col = 0; col != n; ++col)
+            *(dst + col*dst_ld + row) = *(src + row*src_ld + col);
+}
+
+template <typename fptype> static inline void //element-wise copy of an m x n matrix; *_ld are row strides in elements
+copy_matrix(const fptype *src, size_t src_ld, fptype* dst, size_t dst_ld, size_t m, size_t n)
+{
+    for(size_t row = 0; row != m; ++row)
+        for(size_t col = 0; col != n; ++col)
+            (dst + row*dst_ld)[col] = (src + row*src_ld)[col];
+}
+
+template <typename fptype> static inline void //stores value into every element of an m x n matrix; dst_ld is the row stride in elements
+set_value(fptype *dst, size_t dst_ld, fptype value, size_t m, size_t n)
+{
+    for(size_t row = 0; row != m; ++row)
+        for(size_t col = 0; col != n; ++col)
+            (dst + row*dst_ld)[col] = value;
+}
+
+template <typename fptype> static inline int //LU decomposition / linear solve through LAPACK ?gesv and ?getrf
+lapack_LU(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, int* info)
+{
+    int lda = a_step / sizeof(fptype), sign = 0; //a_step is in bytes, lda in elements
+    int* piv = new int[m]; //pivot indices written by the LAPACK routine
+
+    transpose_square_inplace(a, lda, m);
+
+    if(b) //a right-hand side is given: solve the system
+    {
+        if(n == 1 && b_step == sizeof(fptype)) //single right-hand side with adjacent elements: handed to LAPACK in place
+        {
+            if(typeid(fptype) == typeid(float))
+                sgesv_(&m, &n, (float*)a, &lda, piv, (float*)b, &m, info);
+            else if(typeid(fptype) == typeid(double))
+                dgesv_(&m, &n, (double*)a, &lda, piv, (double*)b, &m, info);
+        }
+        else
+        {
+            int ldb = b_step / sizeof(fptype);
+            fptype* tmpB = new fptype[m*n]; //temporary that holds b transposed, with leading dimension m
+
+            transpose(b, ldb, tmpB, m, m, n);
+
+            if(typeid(fptype) == typeid(float))
+                sgesv_(&m, &n, (float*)a, &lda, piv, (float*)tmpB, &m, info);
+            else if(typeid(fptype) == typeid(double))
+                dgesv_(&m, &n, (double*)a, &lda, piv, (double*)tmpB, &m, info);
+
+            transpose(tmpB, m, b, ldb, n, m); //write the result back into b in its original layout
+            delete[] tmpB;
+        }
+    }
+    else //no right-hand side: factorization only
+    {
+        if(typeid(fptype) == typeid(float))
+            sgetrf_(&m, &m, (float*)a, &lda, piv, info);
+        else if(typeid(fptype) == typeid(double))
+            dgetrf_(&m, &m, (double*)a, &lda, piv, info);
+    }
+
+    if(*info == 0)
+    {
+        for(int i = 0; i < m; i++)
+            sign ^= piv[i] != i + 1; //flip the parity for every pivot entry that differs from its own 1-based position
+        *info = sign ? -1 : 1; //odd parity is reported as -1, even parity as 1
+    }
+    else
+        *info = 0; //in opencv LU function zero means error
+
+    delete[] piv;
+    return CV_HAL_ERROR_OK; //always OK; a failed decomposition is signalled through *info
+}
+
+template <typename fptype> static inline int //Cholesky decomposition / linear solve through LAPACK ?posv and ?potrf
+lapack_Cholesky(fptype* a, size_t a_step, int m, fptype* b, size_t b_step, int n, bool* info)
+{
+    int lapackStatus = 0; //initialized so that the check at the end never reads an indeterminate value
+    int lda = (int)(a_step / sizeof(fptype)); //a_step is in bytes, lda in elements
+    char L[] = {'L', '\0'};
+
+    if(b) //a right-hand side is given: solve the system
+    {
+        if(n == 1 && b_step == sizeof(fptype)) //single right-hand side with adjacent elements: handed to LAPACK in place
+        {
+            if(typeid(fptype) == typeid(float))
+                sposv_(L, &m, &n, (float*)a, &lda, (float*)b, &m, &lapackStatus);
+            else if(typeid(fptype) == typeid(double))
+                dposv_(L, &m, &n, (double*)a, &lda, (double*)b, &m, &lapackStatus);
+        }
+        else
+        {
+            int ldb = (int)(b_step / sizeof(fptype));
+            fptype* tmpB = new fptype[m*n]; //temporary that holds b transposed, with leading dimension m
+            transpose(b, ldb, tmpB, m, m, n);
+
+            if(typeid(fptype) == typeid(float))
+                sposv_(L, &m, &n, (float*)a, &lda, (float*)tmpB, &m, &lapackStatus);
+            else if(typeid(fptype) == typeid(double))
+                dposv_(L, &m, &n, (double*)a, &lda, (double*)tmpB, &m, &lapackStatus);
+
+            transpose(tmpB, m, b, ldb, n, m); //write the result back into b in its original layout
+            delete[] tmpB;
+        }
+    }
+    else //no right-hand side: factorization only
+    {
+        if(typeid(fptype) == typeid(float))
+            spotrf_(L, &m, (float*)a, &lda, &lapackStatus);
+        else if(typeid(fptype) == typeid(double))
+            dpotrf_(L, &m, (double*)a, &lda, &lapackStatus);
+    }
+
+    if(lapackStatus == 0) *info = true;
+    else *info = false; //in opencv Cholesky function false means error
+
+    return CV_HAL_ERROR_OK; //always OK; a failed decomposition is signalled through *info
+}
+
+template <typename fptype> static inline int //SVD through LAPACK ?gesdd; *info receives the LAPACK status
+lapack_SVD(fptype* a, size_t a_step, fptype *w, fptype* u, size_t u_step, fptype* vt, size_t v_step, int m, int n, int flags, int* info)
+{
+    int lda = (int)(a_step / sizeof(fptype)); //steps are in bytes, leading dimensions in elements
+    int ldv = (int)(v_step / sizeof(fptype));
+    int ldu = (int)(u_step / sizeof(fptype));
+    int lwork = -1; //-1 turns the first ?gesdd call into a workspace-size query
+    fptype work1 = 0;
+
+    //A already transposed and m>=n
+    char mode[] = { ' ', '\0'};
+    if(flags & CV_HAL_SVD_NO_UV)
+    {
+        ldv = 1;
+        mode[0] = 'N';
+    }
+    else if((flags & CV_HAL_SVD_SHORT_UV) && (flags & CV_HAL_SVD_MODIFY_A)) //short SVD, U stored in a
+        mode[0] = 'O';
+    else if((flags & CV_HAL_SVD_SHORT_UV) && !(flags & CV_HAL_SVD_MODIFY_A)) //short SVD, U stored in u if m>=n
+        mode[0] = 'S';
+    else if(flags & CV_HAL_SVD_FULL_UV) //full SVD, U stored in u or in a
+        mode[0] = 'A';
+    else //none of the known flags: ' ' is not a job ?gesdd accepts, so decline before anything is allocated or modified (*info stays untouched)
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+
+    int* iworkBuf = new int[8*std::min(m, n)];
+    if((flags & CV_HAL_SVD_MODIFY_A) && (flags & CV_HAL_SVD_FULL_UV)) //U stored in a
+    {
+        u = new fptype[m*m]; //temporary U, copied into a and released below
+        ldu = m;
+    }
+
+    if(typeid(fptype) == typeid(float)) //first call: workspace query, the size comes back in work1
+        sgesdd_(mode, &m, &n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)&work1, &lwork, iworkBuf, info);
+    else if(typeid(fptype) == typeid(double))
+        dgesdd_(mode, &m, &n, (double*)a, &lda, (double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)&work1, &lwork, iworkBuf, info);
+    lwork = (int)round(work1); //optimal buffer size
+    fptype* buffer = new fptype[lwork + 1];
+    if(typeid(fptype) == typeid(float)) //second call: the decomposition itself
+        sgesdd_(mode, &m, &n, (float*)a, &lda, (float*)w, (float*)u, &ldu, (float*)vt, &ldv, (float*)buffer, &lwork, iworkBuf, info);
+    else if(typeid(fptype) == typeid(double))
+        dgesdd_(mode, &m, &n, (double*)a, &lda, (double*)w, (double*)u, &ldu, (double*)vt, &ldv, (double*)buffer, &lwork, iworkBuf, info);
+
+    if(!(flags & CV_HAL_SVD_NO_UV))
+        transpose_square_inplace(vt, ldv, n);
+
+    if((flags & CV_HAL_SVD_MODIFY_A) && (flags & CV_HAL_SVD_FULL_UV))
+    {
+        for(int i = 0; i < m; i++)
+            for(int j = 0; j < m; j++)
+                a[i*lda + j] = u[i*m + j];
+        delete[] u;
+    }
+
+    delete[] iworkBuf;
+    delete[] buffer;
+    return CV_HAL_ERROR_OK; //NOTE(review): a non-zero *info from ?gesdd is not turned into an error here
+}
+
+//Row-major general matrix multiplication through cblas_sgemm / cblas_dgemm:
+//dst = alpha*op(src1)*op(src2) + beta*op(src3), where op() transposes every input whose
+//CV_HAL_GEMM_x_T bit is set in flags.
+//Only fptype = float and fptype = double reach a BLAS call.
+//  src1      - first factor, stored as an a_m x a_n matrix
+//  src1_step - row stride of src1 in bytes
+//  src2      - second factor
+//  src2_step - row stride of src2 in bytes
+//  alpha     - multiplier of the product
+//  src3      - addend; when it is a separate buffer it is first copied (or transposed) into dst
+//  src3_step - row stride of src3 in bytes
+//  beta      - multiplier of the addend
+//  dst       - result matrix with d_n columns
+//  dst_step  - row stride of dst in bytes
+//  a_m, a_n  - rows and columns of src1 as stored
+//  d_n       - number of columns of dst
+//  flags     - combination of CV_HAL_GEMM_1_T, CV_HAL_GEMM_2_T and CV_HAL_GEMM_3_T
+//dst is the only buffer that is written.
+//Returns CV_HAL_ERROR_NOT_IMPLEMENTED when src3 is dst itself and CV_HAL_GEMM_3_T is set,
+//CV_HAL_ERROR_OK otherwise.
+template <typename fptype> static inline int
+lapack_gemm(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha,
+            const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags)
+{
+    //byte strides -> leading dimensions in elements
+    int ldsrc1 = src1_step / sizeof(fptype);
+    int ldsrc2 = src2_step / sizeof(fptype);
+    int ldsrc3 = src3_step / sizeof(fptype);
+    int lddst = dst_step / sizeof(fptype);
+
+    const bool transposeA = (flags & CV_HAL_GEMM_1_T) != 0;
+    const bool transposeB = (flags & CV_HAL_GEMM_2_T) != 0;
+    const bool transposeC = (flags & CV_HAL_GEMM_3_T) != 0;
+
+    //dst has as many rows as op(src1)
+    const int d_m = transposeA ? a_n : a_m;
+
+    //shape of src3 as stored: d_m x d_n, or d_n x d_m when it is used transposed
+    int c_m = d_m, c_n = d_n;
+    if(transposeC)
+        std::swap(c_m, c_n);
+
+    CBLAS_TRANSPOSE transA = CblasNoTrans, transB = CblasNoTrans;
+    if(transposeB)
+        transB = CblasTrans;
+    if(transposeA)
+    {
+        transA = CblasTrans;
+        std::swap(a_n, a_m); //from here on a_m x a_n is the shape of op(src1)
+    }
+
+    //cblas accumulates into dst, so dst has to hold op(src3) before the call
+    if(src3 != dst && beta != 0.0 && src3_step != 0)
+    {
+        if(transposeC)
+            transpose(src3, ldsrc3, dst, lddst, c_m, c_n);
+        else
+            copy_matrix(src3, ldsrc3, dst, lddst, c_m, c_n);
+    }
+    else if(src3 == dst && transposeC) //actually transposing C in this case done by openCV
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    else if(src3_step == 0 && beta != 0.0)
+    {
+        //zero stride together with a non-zero beta: dst starts from zeros
+        set_value(dst, lddst, (fptype)0.0, d_m, d_n);
+    }
+
+    //M = rows of op(src1), N = columns of dst, K = columns of op(src1)
+    if(typeid(fptype) == typeid(float))
+        cblas_sgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (float)alpha, (float*)src1, ldsrc1, (float*)src2, ldsrc2, (float)beta, (float*)dst, lddst);
+    else if(typeid(fptype) == typeid(double))
+        cblas_dgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, (double)alpha, (double*)src1, ldsrc1, (double*)src2, ldsrc2, (double)beta, (double*)dst, lddst);
+
+    return CV_HAL_ERROR_OK;
+}
+
+
+//Row-major complex general matrix multiplication through cblas_cgemm / cblas_zgemm:
+//dst = alpha*op(src1)*op(src2) + beta*op(src3), where op() transposes every input whose
+//CV_HAL_GEMM_x_T bit is set in flags.
+//The fptype buffers are treated as arrays of std::complex<fptype>; only float and double reach a BLAS call.
+//  src1      - first factor, stored as an a_m x a_n matrix
+//  src1_step - row stride of src1 in bytes
+//  src2      - second factor
+//  src2_step - row stride of src2 in bytes
+//  alpha     - real multiplier of the product (passed on with a zero imaginary part)
+//  src3      - addend; when it is a separate buffer it is first copied (or transposed) into dst
+//  src3_step - row stride of src3 in bytes
+//  beta      - real multiplier of the addend (passed on with a zero imaginary part)
+//  dst       - result matrix with d_n columns
+//  dst_step  - row stride of dst in bytes
+//  a_m, a_n  - rows and columns of src1 as stored
+//  d_n       - number of columns of dst
+//  flags     - combination of CV_HAL_GEMM_1_T, CV_HAL_GEMM_2_T and CV_HAL_GEMM_3_T
+//dst is the only buffer that is written.
+//Returns CV_HAL_ERROR_NOT_IMPLEMENTED when src3 is dst itself and CV_HAL_GEMM_3_T is set,
+//CV_HAL_ERROR_OK otherwise.
+template <typename fptype> static inline int
+lapack_gemm_c(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha,
+            const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int a_m, int a_n, int d_n, int flags)
+{
+    //byte strides -> leading dimensions in complex elements
+    int ldsrc1 = src1_step / sizeof(std::complex<fptype>);
+    int ldsrc2 = src2_step / sizeof(std::complex<fptype>);
+    int ldsrc3 = src3_step / sizeof(std::complex<fptype>);
+    int lddst = dst_step / sizeof(std::complex<fptype>);
+    std::complex<fptype> cAlpha(alpha, 0.0);
+    std::complex<fptype> cBeta(beta, 0.0);
+
+    const bool transposeA = (flags & CV_HAL_GEMM_1_T) != 0;
+    const bool transposeB = (flags & CV_HAL_GEMM_2_T) != 0;
+    const bool transposeC = (flags & CV_HAL_GEMM_3_T) != 0;
+
+    //dst has as many rows as op(src1)
+    const int d_m = transposeA ? a_n : a_m;
+
+    //shape of src3 as stored: d_m x d_n, or d_n x d_m when it is used transposed
+    int c_m = d_m, c_n = d_n;
+    if(transposeC)
+        std::swap(c_m, c_n);
+
+    CBLAS_TRANSPOSE transA = CblasNoTrans, transB = CblasNoTrans;
+    if(transposeB)
+        transB = CblasTrans;
+    if(transposeA)
+    {
+        transA = CblasTrans;
+        std::swap(a_n, a_m); //from here on a_m x a_n is the shape of op(src1)
+    }
+
+    //cblas accumulates into dst, so dst has to hold op(src3) before the call
+    if(src3 != dst && beta != 0.0 && src3_step != 0)
+    {
+        if(transposeC)
+            transpose((std::complex<fptype>*)src3, ldsrc3, (std::complex<fptype>*)dst, lddst, c_m, c_n);
+        else
+            copy_matrix((std::complex<fptype>*)src3, ldsrc3, (std::complex<fptype>*)dst, lddst, c_m, c_n);
+    }
+    else if(src3 == dst && transposeC) //actually transposing C in this case done by openCV
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    else if(src3_step == 0 && beta != 0.0)
+    {
+        //zero stride together with a non-zero beta: dst starts from zeros
+        set_value((std::complex<fptype>*)dst, lddst, std::complex<fptype>(0.0, 0.0), d_m, d_n);
+    }
+
+    //M = rows of op(src1), N = columns of dst, K = columns of op(src1)
+    if(typeid(fptype) == typeid(float))
+        cblas_cgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, &cAlpha, (void*)src1, ldsrc1, (void*)src2, ldsrc2, &cBeta, (void*)dst, lddst);
+    else if(typeid(fptype) == typeid(double))
+        cblas_zgemm(CblasRowMajor, transA, transB, a_m, d_n, a_n, &cAlpha, (void*)src1, ldsrc1, (void*)src2, ldsrc2, &cBeta, (void*)dst, lddst);
+
+    return CV_HAL_ERROR_OK;
+}
+int lapack_LU32f(float* a, size_t a_step, int m, float* b, size_t b_step, int n, int* info)
+{
+    //calls with m below HAL_LU_SMALL_MATRIX_THRESH are declined
+    return (m < HAL_LU_SMALL_MATRIX_THRESH) ? CV_HAL_ERROR_NOT_IMPLEMENTED
+         : lapack_LU(a, a_step, m, b, b_step, n, info);
+}
+
+int lapack_LU64f(double* a, size_t a_step, int m, double* b, size_t b_step, int n, int* info)
+{
+    //calls with m below HAL_LU_SMALL_MATRIX_THRESH are declined
+    return (m < HAL_LU_SMALL_MATRIX_THRESH) ? CV_HAL_ERROR_NOT_IMPLEMENTED
+         : lapack_LU(a, a_step, m, b, b_step, n, info);
+}
+
+int lapack_Cholesky32f(float* a, size_t a_step, int m, float* b, size_t b_step, int n, bool *info)
+{
+    //calls with m below HAL_CHOLESKY_SMALL_MATRIX_THRESH are declined
+    return (m < HAL_CHOLESKY_SMALL_MATRIX_THRESH) ? CV_HAL_ERROR_NOT_IMPLEMENTED
+         : lapack_Cholesky(a, a_step, m, b, b_step, n, info);
+}
+
+int lapack_Cholesky64f(double* a, size_t a_step, int m, double* b, size_t b_step, int n, bool *info)
+{
+    //calls with m below HAL_CHOLESKY_SMALL_MATRIX_THRESH are declined
+    return (m < HAL_CHOLESKY_SMALL_MATRIX_THRESH) ? CV_HAL_ERROR_NOT_IMPLEMENTED
+         : lapack_Cholesky(a, a_step, m, b, b_step, n, info);
+}
+
+int lapack_SVD32f(float* a, size_t a_step, float *w, float* u, size_t u_step, float* vt, size_t v_step, int m, int n, int flags)
+{
+    //calls with m below HAL_SVD_SMALL_MATRIX_THRESH are declined
+    if(!(m >= HAL_SVD_SMALL_MATRIX_THRESH))
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    int lapackInfo; //receives the LAPACK status; it is not reported any further
+    return lapack_SVD(a, a_step, w, u, u_step, vt, v_step, m, n, flags, &lapackInfo);
+}
+
+int lapack_SVD64f(double* a, size_t a_step, double *w, double* u, size_t u_step, double* vt, size_t v_step, int m, int n, int flags)
+{
+    //calls with m below HAL_SVD_SMALL_MATRIX_THRESH are declined
+    if(!(m >= HAL_SVD_SMALL_MATRIX_THRESH))
+        return CV_HAL_ERROR_NOT_IMPLEMENTED;
+    int lapackInfo; //receives the LAPACK status; it is not reported any further
+    return lapack_SVD(a, a_step, w, u, u_step, vt, v_step, m, n, flags, &lapackInfo);
+}
+
+int lapack_gemm32f(const float *src1, size_t src1_step, const float *src2, size_t src2_step, float alpha,
+                   const float *src3, size_t src3_step, float beta, float *dst, size_t dst_step, int m, int n, int k, int flags)
+{
+    //calls with m below HAL_GEMM_SMALL_MATRIX_THRESH are declined
+    return (m < HAL_GEMM_SMALL_MATRIX_THRESH) ? CV_HAL_ERROR_NOT_IMPLEMENTED
+         : lapack_gemm(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m, n, k, flags);
+}
+
+int lapack_gemm64f(const double *src1, size_t src1_step, const double *src2, size_t src2_step, double alpha,
+                   const double *src3, size_t src3_step, double beta, double *dst, size_t dst_step, int m, int n, int k, int flags)
+{
+    //calls with m below HAL_GEMM_SMALL_MATRIX_THRESH are declined
+    return (m < HAL_GEMM_SMALL_MATRIX_THRESH) ? CV_HAL_ERROR_NOT_IMPLEMENTED
+         : lapack_gemm(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m, n, k, flags);
+}
+
+int lapack_gemm32fc(const float *src1, size_t src1_step, const float *src2, size_t src2_step, float alpha,
+                   const float *src3, size_t src3_step, float beta, float *dst, size_t dst_step, int m, int n, int k, int flags)
+{
+    //calls with m below HAL_GEMM_SMALL_COMPLEX_MATRIX_THRESH are declined
+    return (m < HAL_GEMM_SMALL_COMPLEX_MATRIX_THRESH) ? CV_HAL_ERROR_NOT_IMPLEMENTED
+         : lapack_gemm_c(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m, n, k, flags);
+}
+int lapack_gemm64fc(const double *src1, size_t src1_step, const double *src2, size_t src2_step, double alpha,
+                   const double *src3, size_t src3_step, double beta, double *dst, size_t dst_step, int m, int n, int k, int flags)
+{
+    //calls with m below HAL_GEMM_SMALL_COMPLEX_MATRIX_THRESH are declined
+    return (m < HAL_GEMM_SMALL_COMPLEX_MATRIX_THRESH) ? CV_HAL_ERROR_NOT_IMPLEMENTED
+         : lapack_gemm_c(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m, n, k, flags);
+}
+
+#endif //HAVE_LAPACK
--- a/modules/core/src/hal_internal.hpp
+++ b/modules/core/src/hal_internal.hpp
@@ -0,0 +1,96 @@
+/*M///////////////////////////////////////////////////////////////////////////////////////
+//
+//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+//
+//  By downloading, copying, installing or using the software you agree to this license.
+//  If you do not agree to this license, do not download, install,
+//  copy or use the software.
+//
+//
+//                          License Agreement
+//                For Open Source Computer Vision Library
+//
+// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
+// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
+// Copyright (C) 2015, Itseez Inc., all rights reserved.
+// Third party copyrights are property of their respective owners.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+//
+//   * Redistribution's of source code must retain the above copyright notice,
+//     this list of conditions and the following disclaimer.
+//
+//   * Redistribution's in binary form must reproduce the above copyright notice,
+//     this list of conditions and the following disclaimer in the documentation
+//     and/or other materials provided with the distribution.
+//
+//   * The name of the copyright holders may not be used to endorse or promote products
+//     derived from this software without specific prior written permission.
+//
+// This software is provided by the copyright holders and contributors "as is" and
+// any express or implied warranties, including, but not limited to, the implied
+// warranties of merchantability and fitness for a particular purpose are disclaimed.
+// In no event shall the Intel Corporation or contributors be liable for any direct,
+// indirect, incidental, special, exemplary, or consequential damages
+// (including, but not limited to, procurement of substitute goods or services;
+// loss of use, data, or profits; or business interruption) however caused
+// and on any theory of liability, whether in contract, strict liability,
+// or tort (including negligence or otherwise) arising in any way out of
+// the use of this software, even if advised of the possibility of such damage.
+//
+//M*/
+
+#ifndef OPENCV_CORE_HAL_INTERNAL_HPP
+#define OPENCV_CORE_HAL_INTERNAL_HPP
+
+#include "precomp.hpp"
+
+#ifdef HAVE_LAPACK
+
+int lapack_LU32f(float* a, size_t a_step, int m, float* b, size_t b_step, int n, int* info);
+int lapack_LU64f(double* a, size_t a_step, int m, double* b, size_t b_step, int n, int* info);
+int lapack_Cholesky32f(float* a, size_t a_step, int m, float* b, size_t b_step, int n, bool* info);
+int lapack_Cholesky64f(double* a, size_t a_step, int m, double* b, size_t b_step, int n, bool* info);
+int lapack_SVD32f(float* a, size_t a_step, float* w, float* u, size_t u_step, float* vt, size_t v_step, int m, int n, int flags);
+int lapack_SVD64f(double* a, size_t a_step, double* w, double* u, size_t u_step, double* vt, size_t v_step, int m, int n, int flags);
+int lapack_gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                   float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                   int m, int n, int k, int flags);
+int lapack_gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                   double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                   int m, int n, int k, int flags);
+int lapack_gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                   float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                   int m, int n, int k, int flags);
+int lapack_gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                   double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                   int m, int n, int k, int flags);
+
+#undef cv_hal_LU32f
+#define cv_hal_LU32f lapack_LU32f
+#undef cv_hal_LU64f
+#define cv_hal_LU64f lapack_LU64f
+
+#undef cv_hal_Cholesky32f
+#define cv_hal_Cholesky32f lapack_Cholesky32f
+#undef cv_hal_Cholesky64f
+#define cv_hal_Cholesky64f lapack_Cholesky64f
+
+#undef cv_hal_SVD32f
+#define cv_hal_SVD32f lapack_SVD32f
+#undef cv_hal_SVD64f
+#define cv_hal_SVD64f lapack_SVD64f
+
+#undef cv_hal_gemm32f
+#define cv_hal_gemm32f lapack_gemm32f
+#undef cv_hal_gemm64f
+#define cv_hal_gemm64f lapack_gemm64f
+#undef cv_hal_gemm32fc
+#define cv_hal_gemm32fc lapack_gemm32fc
+#undef cv_hal_gemm64fc
+#define cv_hal_gemm64fc lapack_gemm64fc
+
+#endif //HAVE_LAPACK
+#endif //OPENCV_CORE_HAL_INTERNAL_HPP
--- a/modules/core/src/hal_replacement.hpp
+++ b/modules/core/src/hal_replacement.hpp
@@ -472,14 +472,148 @@ inline int hal_ni_dctFree2D(cvhalDFT *context) { return CV_HAL_ERROR_NOT_IMPLEME
 #define cv_hal_dctFree2D hal_ni_dctFree2D
 //! @endcond

+
+/**
+Performs \f$LU\f$ decomposition of square matrix \f$A=P*L*U\f$ (where \f$P\f$ is permutation matrix) and solves matrix equation \f$A*X=B\f$.
+Function returns the \f$sign\f$ of permutation \f$P\f$ via parameter info.
+@param src1 pointer to input matrix \f$A\f$ stored in row major order. On return src1 contains at least the \f$U\f$ part of the \f$LU\f$
+decomposition, which is sufficient for determinant calculation: \f$det(A)=sign*\prod_{j=1}^{M}a_{jj}\f$.
+@param src1_step number of bytes each matrix \f$A\f$ row occupies.
+@param m size of square matrix \f$A\f$.
+@param src2 pointer to \f$M\times N\f$ matrix \f$B\f$ which is the right-hand side of system \f$A*X=B\f$. \f$B\f$ stored in row major order.
+If src2 is null pointer only \f$LU\f$ decomposition will be performed. After finish of work src2 contains solution \f$X\f$ of system \f$A*X=B\f$.
+@param src2_step number of bytes each matrix \f$B\f$ row occupies.
+@param n number of right-hand vectors in \f$M\times N\f$ matrix \f$B\f$.
+@param info indicates success of decomposition. If *info is zero, the decomposition failed; otherwise *info is equal to \f$sign\f$.
+ */
+//! @addtogroup core_hal_interface_decomp_lu LU matrix decomposition
+//! @{
+inline int hal_ni_LU32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, int* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // placeholder: does no work and reports "not implemented"
+inline int hal_ni_LU64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, int* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // placeholder: does no work and reports "not implemented"
 //! @}

+/**
+Performs Cholesky decomposition of matrix \f$A = L*L^T\f$ and solves matrix equation \f$A*X=B\f$.
+@param src1 pointer to input matrix \f$A\f$ stored in row major order. After finish of work src1 contains lower triangular matrix \f$L\f$.
+@param src1_step number of bytes each matrix \f$A\f$ row occupies.
+@param m size of square matrix \f$A\f$.
+@param src2 pointer to \f$M\times N\f$ matrix \f$B\f$ which is the right-hand side of system \f$A*X=B\f$. B stored in row major order.
+If src2 is null pointer only Cholesky decomposition will be performed. After finish of work src2 contains solution \f$X\f$ of system \f$A*X=B\f$.
+@param src2_step number of bytes each matrix \f$B\f$ row occupies.
+@param n number of right-hand vectors in \f$M\times N\f$ matrix \f$B\f$.
+@param info indicates success of decomposition. If *info is false decomposition failed.
+ */
+
+//! @addtogroup core_hal_interface_decomp_cholesky Cholesky matrix decomposition
+//! @{
+inline int hal_ni_Cholesky32f(float* src1, size_t src1_step, int m, float* src2, size_t src2_step, int n, bool* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // placeholder: does no work and reports "not implemented"
+inline int hal_ni_Cholesky64f(double* src1, size_t src1_step, int m, double* src2, size_t src2_step, int n, bool* info) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // placeholder: does no work and reports "not implemented"
+//! @}
+
+/**
+Performs singular value decomposition of \f$M\times N\f$(\f$M>N\f$) matrix \f$A = U*\Sigma*V^T\f$.
+@param src pointer to input \f$M\times N\f$ matrix \f$A\f$ stored in column major order.
+On return src is either filled with the rows of \f$U\f$ or left unmodified, depending on the flag CV_HAL_SVD_MODIFY_A.
+@param src_step number of bytes each matrix \f$A\f$ column occupies.
+@param w pointer to array for singular values of matrix \f$A\f$ (i. e. first \f$N\f$ diagonal elements of matrix \f$\Sigma\f$).
+@param u pointer to output \f$M\times N\f$ or \f$M\times M\f$ matrix \f$U\f$ (size depends on flags). The pointer must be valid if the flag CV_HAL_SVD_MODIFY_A is not used.
+@param u_step number of bytes each matrix \f$U\f$ row occupies.
+@param vt pointer to array for \f$N\times N\f$ matrix \f$V^T\f$.
+@param vt_step number of bytes each matrix \f$V^T\f$ row occupies.
+@param m number of rows in matrix \f$A\f$.
+@param n number of columns in matrix \f$A\f$.
+@param flags algorithm options (combination of CV_HAL_SVD_FULL_UV, ...).
+ */
+//! @addtogroup core_hal_interface_decomp_svd Singular value matrix decomposition
+//! @{
+inline int hal_ni_SVD32f(float* src, size_t src_step, float* w, float* u, size_t u_step, float* vt, size_t vt_step, int m, int n, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // placeholder: does no work and reports "not implemented"
+inline int hal_ni_SVD64f(double* src, size_t src_step, double* w, double* u, size_t u_step, double* vt, size_t vt_step, int m, int n, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // placeholder: does no work and reports "not implemented"
+//! @}
+
+
+
+//! @cond IGNORED
+#define cv_hal_LU32f hal_ni_LU32f
+#define cv_hal_LU64f hal_ni_LU64f
+#define cv_hal_Cholesky32f hal_ni_Cholesky32f
+#define cv_hal_Cholesky64f hal_ni_Cholesky64f
+#define cv_hal_SVD32f hal_ni_SVD32f
+#define cv_hal_SVD64f hal_ni_SVD64f
+//! @endcond
+
+
+/**
+The function performs generalized matrix multiplication similar to the gemm functions in BLAS level 3:
+\f$D = \alpha*AB+\beta*C\f$
+
+@param src1 pointer to input \f$M\times N\f$ matrix \f$A\f$ or \f$A^T\f$ stored in row major order.
+@param src1_step number of bytes each matrix \f$A\f$ or \f$A^T\f$ row occupies.
+@param src2 pointer to input \f$N\times K\f$ matrix \f$B\f$ or \f$B^T\f$ stored in row major order.
+@param src2_step number of bytes each matrix \f$B\f$ or \f$B^T\f$ row occupies.
+@param alpha \f$\alpha\f$ multiplier before \f$AB\f$
+@param src3 pointer to input \f$M\times K\f$ matrix \f$C\f$ or \f$C^T\f$ stored in row major order.
+@param src3_step number of bytes each matrix \f$C\f$ or \f$C^T\f$ row occupies.
+@param beta \f$\beta\f$ multiplier before \f$C\f$
+@param dst pointer to output \f$M\times K\f$ matrix \f$D\f$ stored in row major order.
+@param dst_step number of bytes each matrix \f$D\f$ row occupies.
+@param m number of rows in matrix \f$A\f$ or \f$A^T\f$, equals to number of rows in matrix \f$D\f$
+@param n number of columns in matrix \f$A\f$ or \f$A^T\f$
+@param k number of columns in matrix \f$B\f$ or \f$B^T\f$, equals to number of columns in matrix \f$D\f$
+@param flags algorithm options (combination of CV_HAL_GEMM_1_T, ...).
+ */
+
+//! @addtogroup core_hal_interface_matrix_multiplication Matrix multiplication
+//! @{
+inline int hal_ni_gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                          float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                          int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // placeholder: does no work and reports "not implemented"
+inline int hal_ni_gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                          double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                          int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // placeholder: does no work and reports "not implemented"
+inline int hal_ni_gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                          float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                          int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // placeholder: does no work and reports "not implemented"
+inline int hal_ni_gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                          double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                          int m, int n, int k, int flags) { return CV_HAL_ERROR_NOT_IMPLEMENTED; } // placeholder: does no work and reports "not implemented"
+//! @}
+
+//! @cond IGNORED
+#define cv_hal_gemm32f hal_ni_gemm32f
+#define cv_hal_gemm64f hal_ni_gemm64f
+#define cv_hal_gemm32fc hal_ni_gemm32fc
+#define cv_hal_gemm64fc hal_ni_gemm64fc
+//! @endcond
+
+//! @}
+
+
 #if defined __GNUC__
 #  pragma GCC diagnostic pop
 #elif defined _MSC_VER
 #  pragma warning( pop )
 #endif

+#include "hal_internal.hpp"
 #include "custom_hal.hpp"

+//! @cond IGNORED
+#define CALL_HAL_RET(name, fun, retval, ...) \
+    int res = fun(__VA_ARGS__, &retval); \
+    if (res == CV_HAL_ERROR_OK) \
+        return retval; \
+    else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \
+        CV_Error_(cv::Error::StsInternal, \
+            ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); // OK: the enclosing function returns retval; NOT_IMPLEMENTED: execution continues after the macro; anything else: an error is raised. Declares a local named res.
+
+
+#define CALL_HAL(name, fun, ...) \
+    int res = fun(__VA_ARGS__); \
+    if (res == CV_HAL_ERROR_OK) \
+        return; \
+    else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) \
+        CV_Error_(cv::Error::StsInternal, \
+            ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); // OK: the enclosing function returns; NOT_IMPLEMENTED: execution continues after the macro; anything else: an error is raised. Declares a local named res.
+//! @endcond
+
 #endif
--- a/modules/core/src/lapack.cpp
+++ b/modules/core/src/lapack.cpp
@@ -570,11 +570,44 @@ JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,

 static void JacobiSVD(float* At, size_t astep, float* W, float* Vt, size_t vstep, int m, int n, int n1=-1)
 {
-    JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? n : n1, FLT_MIN, FLT_EPSILON*2);
+    hal::SVD32f(At, astep, W, NULL, astep, Vt, vstep, m, n, n1); // no separate U buffer (NULL); astep is passed in the ustep slot
 }

 static void JacobiSVD(double* At, size_t astep, double* W, double* Vt, size_t vstep, int m, int n, int n1=-1)
 {
+    hal::SVD64f(At, astep, W, NULL, astep, Vt, vstep, m, n, n1); // no separate U buffer (NULL); astep is passed in the ustep slot
+}
+
+template <typename fptype> static inline int // builds the flags argument handed to cv_hal_SVD32f / cv_hal_SVD64f
+decodeSVDParameters(const fptype* U, const fptype* Vt, int m, int n, int n1)
+{
+    int halSVDFlag = 0; // stays 0 when Vt is given but n1 is positive and equals neither n nor m
+    if(Vt == NULL)
+        halSVDFlag = CV_HAL_SVD_NO_UV; // caller supplied no Vt output buffer
+    else if(n1 <= 0 || n1 == n)
+    {
+        halSVDFlag = CV_HAL_SVD_SHORT_UV;
+        if(U == NULL)
+            halSVDFlag |= CV_HAL_SVD_MODIFY_A; // no separate U buffer; presumably the HAL then overwrites At -- TODO confirm against HAL docs
+    }
+    else if(n1 == m)
+    {
+        halSVDFlag = CV_HAL_SVD_FULL_UV;
+        if(U == NULL)
+            halSVDFlag |= CV_HAL_SVD_MODIFY_A; // same rule as in the SHORT_UV branch
+    }
+    return halSVDFlag;
+}
+
+void hal::SVD32f(float* At, size_t astep, float* W, float* U, size_t ustep, float* Vt, size_t vstep, int m, int n, int n1) // float SVD entry point: HAL override first, built-in Jacobi code otherwise
+{
+    CALL_HAL(SVD32f, cv_hal_SVD32f, At, astep, W, U, ustep, Vt, vstep, m, n, decodeSVDParameters(U, Vt, m, n, n1)) // returns from this function if the HAL call succeeds
+    JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? n : n1, FLT_MIN, FLT_EPSILON*2); // built-in path; U and ustep are not passed to it
+}
+
+void hal::SVD64f(double* At, size_t astep, double* W, double* U, size_t ustep, double* Vt, size_t vstep, int m, int n, int n1) // double SVD entry point: HAL override first, built-in Jacobi code otherwise
+{
+    CALL_HAL(SVD64f, cv_hal_SVD64f, At, astep, W, U, ustep, Vt, vstep, m, n, decodeSVDParameters(U, Vt, m, n, n1)) // returns from this function if the HAL call succeeds
    JacobiSVDImpl_(At, astep, W, Vt, vstep, m, n, !Vt ? 0 : n1 < 0 ? n : n1, DBL_MIN, DBL_EPSILON*10);
 }

@@ -745,7 +778,6 @@ double cv::determinant( InputArray _mat )
            {
                for( int i = 0; i < rows; i++ )
                    result *= a.at<float>(i,i);
-                result = 1./result;
            }
        }
    }
@@ -769,7 +801,6 @@ double cv::determinant( InputArray _mat )
            {
                for( int i = 0; i < rows; i++ )
                    result *= a.at<double>(i,i);
-                result = 1./result;
            }
        }
    }
--- a/modules/core/src/matmul.cpp
+++ b/modules/core/src/matmul.cpp
@@ -864,73 +864,39 @@ static bool ocl_gemm( InputArray matA, InputArray matB, double alpha,
    return k.run(2, globalsize, block_size!=1 ? localsize : NULL, false);
 }
 #endif
-}

-void cv::gemm( InputArray matA, InputArray matB, double alpha,
-           InputArray matC, double beta, OutputArray _matD, int flags )
+static void gemmImpl( Mat A, Mat B, double alpha,
+           Mat C, double beta, Mat D, int flags )
 {
-#ifdef HAVE_CLAMDBLAS
-    CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() &&
-        matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrect for small sizes
-        ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags))
-#endif
-
-#ifdef HAVE_OPENCL
-    CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2,
-               ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags))
-#endif
-
    const int block_lin_size = 128;
    const int block_size = block_lin_size * block_lin_size;

    static double zero[] = {0,0,0,0};
    static float zerof[] = {0,0,0,0};

-    Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0 ? matC.getMat() : Mat();
    Size a_size = A.size(), d_size;
    int i, len = 0, type = A.type();

-    CV_Assert( type == B.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
-
    switch( flags & (GEMM_1_T|GEMM_2_T) )
    {
    case 0:
        d_size = Size( B.cols, a_size.height );
        len = B.rows;
-        CV_Assert( a_size.width == len );
        break;
    case 1:
        d_size = Size( B.cols, a_size.width );
        len = B.rows;
-        CV_Assert( a_size.height == len );
        break;
    case 2:
        d_size = Size( B.rows, a_size.height );
        len = B.cols;
-        CV_Assert( a_size.width == len );
        break;
    case 3:
        d_size = Size( B.rows, a_size.width );
        len = B.cols;
-        CV_Assert( a_size.height == len );
        break;
    }

-    if( !C.empty() )
-    {
-        CV_Assert( C.type() == type &&
-            (((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) ||
-             ((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height)));
-    }
-
-    _matD.create( d_size.height, d_size.width, type );
-    Mat D = _matD.getMat();
-    if( (flags & GEMM_3_T) != 0 && C.data == D.data )
-    {
-        transpose( C, C );
-        flags &= ~GEMM_3_T;
-    }
-
    if( flags == 0 && 2 <= len && len <= 4 && (len == d_size.width || len == d_size.height) )
    {
        if( type == CV_32F )
@@ -1194,8 +1160,7 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha,
    GEMMSingleMulFunc singleMulFunc;
    GEMMBlockMulFunc blockMulFunc;
    GEMMStoreFunc storeFunc;
-    Mat *matD = &D, tmat;
-    size_t tmat_size = 0;
+    Mat *matD = &D;
    const uchar* Cdata = C.data;
    size_t Cstep = C.data ? (size_t)C.step : 0;
    AutoBuffer<uchar> buf;
@@ -1226,13 +1191,6 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha,
        storeFunc = (GEMMStoreFunc)GEMMStore_64fc;
    }

-    if( D.data == A.data || D.data == B.data )
-    {
-        tmat_size = (size_t)d_size.width*d_size.height*CV_ELEM_SIZE(type);
-        // Allocate tmat later, once the size of buf is known
-        matD = &tmat;
-    }
-
    if( (d_size.width == 1 || len == 1) && !(flags & GEMM_2_T) && B.isContinuous() )
    {
        b_step = d_size.width == 1 ? 0 : CV_ELEM_SIZE(type);
@@ -1306,10 +1264,6 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha,
        (d_size.width <= block_lin_size &&
        d_size.height <= block_lin_size && len <= block_lin_size) )
    {
-        if( tmat_size > 0 ) {
-            buf.allocate(tmat_size);
-            tmat = Mat(d_size.height, d_size.width, type, (uchar*)buf );
-        }
        singleMulFunc( A.ptr(), A.step, B.ptr(), b_step, Cdata, Cstep,
                       matD->ptr(), matD->step, a_size, d_size, alpha, beta, flags );
    }
@@ -1369,14 +1323,12 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha,
            flags &= ~GEMM_1_T;
        }

-        buf.allocate(d_buf_size + b_buf_size + a_buf_size + tmat_size);
+        buf.allocate(d_buf_size + b_buf_size + a_buf_size);
        d_buf = (uchar*)buf;
        b_buf = d_buf + d_buf_size;

        if( is_a_t )
            a_buf = b_buf + b_buf_size;
-        if( tmat_size > 0 )
-            tmat = Mat(d_size.height, d_size.width, type, b_buf + b_buf_size + a_buf_size );

        for( i = 0; i < d_size.height; i += di )
        {
@@ -1455,12 +1407,200 @@ void cv::gemm( InputArray matA, InputArray matB, double alpha,
            }
        }
    }
-
-    if( matD != &D )
-        matD->copyTo(D);
    }
 }

+template <typename fptype>inline static void // wraps raw pointer/stride operands in Mat objects and forwards them to gemmImpl
+callGemmImpl(const fptype *src1, size_t src1_step, const fptype *src2, size_t src2_step, fptype alpha,
+          const fptype *src3, size_t src3_step, fptype beta, fptype *dst, size_t dst_step, int m_a, int n_a, int n_d, int flags, int type)
+{
+    CV_StaticAssert(GEMM_1_T == CV_HAL_GEMM_1_T, "Incompatible GEMM_1_T flag in HAL");
+    CV_StaticAssert(GEMM_2_T == CV_HAL_GEMM_2_T, "Incompatible GEMM_2_T flag in HAL");
+    CV_StaticAssert(GEMM_3_T == CV_HAL_GEMM_3_T, "Incompatible GEMM_3_T flag in HAL");
+
+    int b_m, b_n, c_m, c_n, m_d; // rows/cols of B, rows/cols of C, rows of D (A is m_a x n_a, D has n_d columns)
+
+    if(flags & GEMM_2_T)
+    {
+        b_m = n_d; // with GEMM_2_T, B's row count equals D's column count
+        if(flags & GEMM_1_T )
+        {
+            b_n = m_a; // A is used transposed: the shared dimension is A's row count
+            m_d = n_a;
+        }
+        else
+        {
+            b_n = n_a;
+            m_d = m_a;
+        }
+    }
+    else
+    {
+        b_n = n_d; // without GEMM_2_T, B's column count equals D's column count
+        if(flags & GEMM_1_T )
+        {
+            b_m = m_a;
+            m_d = n_a;
+        }
+        else
+        {
+            m_d = m_a;
+            b_m = n_a;
+        }
+    }
+
+    if(flags & GEMM_3_T)
+    {
+        c_m = n_d; // C is supplied with D's dimensions swapped
+        c_n = m_d;
+    }
+    else
+    {
+        c_m = m_d;
+        c_n = n_d;
+    }
+
+    Mat A, B, C;
+    if(src1 != NULL)
+        A = Mat(m_a, n_a, type, (void*)src1, src1_step);
+    if(src2 != NULL)
+        B = Mat(b_m, b_n, type, (void*)src2, src2_step);
+    if(src3 != NULL && beta != 0.0) // C stays empty when it is absent or when beta is zero
+        C = Mat(c_m, c_n, type, (void*)src3, src3_step);
+    Mat D(m_d, n_d, type, (void*)dst, dst_step); // Mat built over the caller-provided dst pointer and stride
+
+    gemmImpl(A, B, alpha, C, beta, D, flags);
+}
+
+}
+
+void cv::hal::gemm32f(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                        float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags) // src1 is m_a x n_a, dst has n_d columns (see callGemmImpl)
+{
+
+    CALL_HAL(gemm32f, cv_hal_gemm32f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) // returns here if the HAL override succeeds
+    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32F); // built-in path, single-channel float
+}
+
+void cv::hal::gemm64f(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                        double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags) // src1 is m_a x n_a, dst has n_d columns (see callGemmImpl)
+{
+    CALL_HAL(gemm64f, cv_hal_gemm64f, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) // returns here if the HAL override succeeds
+    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64F); // built-in path, single-channel double
+}
+
+CV_EXPORTS void cv::hal::gemm32fc(const float* src1, size_t src1_step, const float* src2, size_t src2_step,
+                        float alpha, const float* src3, size_t src3_step, float beta, float* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags) // src1 is m_a x n_a, dst has n_d columns (see callGemmImpl)
+{
+    CALL_HAL(gemm32fc, cv_hal_gemm32fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) // returns here if the HAL override succeeds
+    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_32FC2); // built-in path, two-channel float elements
+}
+
+CV_EXPORTS void cv::hal::gemm64fc(const double* src1, size_t src1_step, const double* src2, size_t src2_step,
+                        double alpha, const double* src3, size_t src3_step, double beta, double* dst, size_t dst_step,
+                        int m_a, int n_a, int n_d, int flags) // src1 is m_a x n_a, dst has n_d columns (see callGemmImpl)
+{
+    CALL_HAL(gemm64fc, cv_hal_gemm64fc, src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags) // returns here if the HAL override succeeds
+    callGemmImpl(src1, src1_step, src2, src2_step, alpha, src3, src3_step, beta, dst, dst_step, m_a, n_a, n_d, flags, CV_64FC2); // built-in path, two-channel double elements
+}
+
+void cv::gemm( InputArray matA, InputArray matB, double alpha,
+           InputArray matC, double beta, OutputArray _matD, int flags ) // validates operands, allocates D, dispatches to hal::gemm* by element type
+{
+#ifdef HAVE_CLAMDBLAS
+    CV_OCL_RUN(ocl::haveAmdBlas() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2 && _matD.isUMat() &&
+        matA.cols() > 20 && matA.rows() > 20 && matB.cols() > 20, // since it works incorrect for small sizes
+        ocl_gemm_amdblas(matA, matB, alpha, matC, beta, _matD, flags))
+#endif
+
+#ifdef HAVE_OPENCL
+    CV_OCL_RUN(_matD.isUMat() && matA.dims() <= 2 && matB.dims() <= 2 && matC.dims() <= 2,
+               ocl_gemm(matA, matB, alpha, matC, beta, _matD, flags))
+#endif
+
+    Mat A = matA.getMat(), B = matB.getMat(), C = beta != 0.0 ? matC.getMat() : Mat(); // C is left empty when beta is zero
+    Size a_size = A.size(), d_size;
+    int len = 0, type = A.type();
+
+    CV_Assert( type == B.type() && (type == CV_32FC1 || type == CV_64FC1 || type == CV_32FC2 || type == CV_64FC2) );
+
+    switch( flags & (GEMM_1_T|GEMM_2_T) ) // derive the output size and the shared inner dimension from the transpose flags
+    {
+    case 0:
+        d_size = Size( B.cols, a_size.height );
+        len = B.rows;
+        CV_Assert( a_size.width == len );
+        break;
+    case 1:
+        d_size = Size( B.cols, a_size.width );
+        len = B.rows;
+        CV_Assert( a_size.height == len );
+        break;
+    case 2:
+        d_size = Size( B.rows, a_size.height );
+        len = B.cols;
+        CV_Assert( a_size.width == len );
+        break;
+    case 3:
+        d_size = Size( B.rows, a_size.width );
+        len = B.cols;
+        CV_Assert( a_size.height == len );
+        break;
+    }
+
+    if( !C.empty() )
+    {
+        CV_Assert( C.type() == type &&
+            (((flags&GEMM_3_T) == 0 && C.rows == d_size.height && C.cols == d_size.width) ||
+             ((flags&GEMM_3_T) != 0 && C.rows == d_size.width && C.cols == d_size.height)));
+    }
+
+    _matD.create( d_size.height, d_size.width, type );
+    Mat D = _matD.getMat();
+    if( (flags & GEMM_3_T) != 0 && C.data == D.data ) // C shares D's buffer: transpose it once up front and drop the flag
+    {
+        transpose( C, C );
+        flags &= ~GEMM_3_T;
+    }
+
+    Mat *DProxyPtr = &D, DProxy;
+    if( D.data == A.data || D.data == B.data ) // output aliases an input: compute into a scratch matrix, copy back at the end
+    {
+        DProxy = Mat(d_size.height, d_size.width, D.type());
+        DProxyPtr = &DProxy;
+    }
+
+    if( type == CV_32FC1 )
+        hal::gemm32f(A.ptr<float>(), A.step, B.ptr<float>(), B.step, static_cast<float>(alpha),
+                     C.ptr<float>(), C.step, static_cast<float>(beta),
+                     DProxyPtr->ptr<float>(), DProxyPtr->step,
+                     a_size.height, a_size.width, DProxyPtr->cols, flags);
+    else if( type == CV_64FC1 )
+        hal::gemm64f(A.ptr<double>(), A.step, B.ptr<double>(), B.step, alpha,
+                     C.ptr<double>(), C.step, beta,
+                     DProxyPtr->ptr<double>(), DProxyPtr->step,
+                     a_size.height, a_size.width, DProxyPtr->cols, flags);
+    else if( type == CV_32FC2 )
+        hal::gemm32fc(A.ptr<float>(), A.step, B.ptr<float>(), B.step, static_cast<float>(alpha),
+                      C.ptr<float>(), C.step, static_cast<float>(beta),
+                      DProxyPtr->ptr<float>(), DProxyPtr->step,
+                      a_size.height, a_size.width, DProxyPtr->cols, flags);
+    else
+    {
+        CV_Assert( type == CV_64FC2 );
+        hal::gemm64fc(A.ptr<double>(), A.step, B.ptr<double>(), B.step, alpha,
+                      C.ptr<double>(), C.step, beta,
+                      DProxyPtr->ptr<double>(), DProxyPtr->step, // write through the proxy like the other branches, so aliased inputs stay intact
+                      a_size.height, a_size.width, DProxyPtr->cols, flags);
+    }
+
+    if(DProxyPtr != &D)
+        DProxyPtr->copyTo(D);
+}
+
 /****************************************************************************************\
 *                                        Transform                                       *
 \****************************************************************************************/
--- a/modules/core/src/matrix_decomp.cpp
+++ b/modules/core/src/matrix_decomp.cpp
@@ -89,8 +89,6 @@ LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n, _Tp eps)
                for( k = 0; k < n; k++ )
                    b[j*bstep + k] += alpha*b[i*bstep + k];
        }
-
-        A[i*astep + i] = -d;
    }

    if( b )
@@ -101,7 +99,7 @@ LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n, _Tp eps)
                _Tp s = b[i*bstep + j];
                for( k = i+1; k < m; k++ )
                    s -= A[i*astep + k]*b[k*bstep + j];
-                b[i*bstep + j] = s*A[i*astep + i];
+                b[i*bstep + j] = s/A[i*astep + i];
            }
    }

@@ -111,13 +109,19 @@ LUImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n, _Tp eps)

 int LU32f(float* A, size_t astep, int m, float* b, size_t bstep, int n)
 {
-    return LUImpl(A, astep, m, b, bstep, n, FLT_EPSILON*10);
+    int output; // receives the result from the HAL override via &output
+    CALL_HAL_RET(LU32f, cv_hal_LU32f, output, A, astep, m, b, bstep, n) // returns output here if the HAL call succeeds
+    output = LUImpl(A, astep, m, b, bstep, n, FLT_EPSILON*10); // built-in path
+    return output;
 }


 int LU64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
 {
-    return LUImpl(A, astep, m, b, bstep, n, DBL_EPSILON*100);
+    int output; // receives the result from the HAL override via &output
+    CALL_HAL_RET(LU64f, cv_hal_LU64f, output, A, astep, m, b, bstep, n) // returns output here if the HAL call succeeds
+    output = LUImpl(A, astep, m, b, bstep, n, DBL_EPSILON*100); // built-in path
+    return output;
 }

 template<typename _Tp> static inline bool
@@ -193,14 +197,17 @@ CholImpl(_Tp* A, size_t astep, int m, _Tp* b, size_t bstep, int n)
    return true;
 }

-
 bool Cholesky32f(float* A, size_t astep, int m, float* b, size_t bstep, int n)
 {
+    bool output; // receives the result from the HAL override via &output
+    CALL_HAL_RET(Cholesky32f, cv_hal_Cholesky32f, output, A, astep, m, b, bstep, n) // returns output here if the HAL call succeeds
    return CholImpl(A, astep, m, b, bstep, n);
 }

 bool Cholesky64f(double* A, size_t astep, int m, double* b, size_t bstep, int n)
 {
+    bool output; // receives the result from the HAL override via &output
+    CALL_HAL_RET(Cholesky64f, cv_hal_Cholesky64f, output, A, astep, m, b, bstep, n) // returns output here if the HAL call succeeds
    return CholImpl(A, astep, m, b, bstep, n);
 }

--- a/modules/imgproc/src/hal_replacement.hpp
+++ b/modules/imgproc/src/hal_replacement.hpp
@@ -309,4 +309,23 @@ inline int hal_ni_warpPerspectve(int src_type, const uchar *src_data, size_t src

 #include "custom_hal.hpp"

+//! @cond IGNORED
+#define CALL_HAL_RET(name, fun, retval, ...) /* try HAL override fun; on success return retval from the calling function */ \
+{   int res = fun(__VA_ARGS__, &retval); /* braces keep res local, so the macro expands to one self-contained statement */ \
+    if (res == CV_HAL_ERROR_OK) \
+        return retval; \
+    else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) /* NOT_IMPLEMENTED falls through to the code after the macro */ \
+        CV_Error_(cv::Error::StsInternal, \
+            ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); }
+
+
+#define CALL_HAL(name, fun, ...) /* try HAL override fun; on success return from the calling (void) function */ \
+{   int res = fun(__VA_ARGS__); /* braces keep res local, so the macro expands to one self-contained statement */ \
+    if (res == CV_HAL_ERROR_OK) \
+        return; \
+    else if (res != CV_HAL_ERROR_NOT_IMPLEMENTED) /* NOT_IMPLEMENTED falls through to the code after the macro */ \
+        CV_Error_(cv::Error::StsInternal, \
+            ("HAL implementation " CVAUX_STR(name) " ==> " CVAUX_STR(fun) " returned %d (0x%08x)", res, res)); }
+//! @endcond
+
 #endif
--- a/modules/video/include/opencv2/video/tracking.hpp
+++ b/modules/video/include/opencv2/video/tracking.hpp
@@ -226,7 +226,7 @@ CV_EXPORTS_W void calcOpticalFlowFarneback( InputArray prev, InputArray next, In
@param dst Second input 2D point set of the same size and the same type as A, or another image.
@param fullAffine If true, the function finds an optimal affine transformation with no additional
 restrictions (6 degrees of freedom). Otherwise, the class of transformations to choose from is
-limited to combinations of translation, rotation, and uniform scaling (5 degrees of freedom).
+limited to combinations of translation, rotation, and uniform scaling (4 degrees of freedom).

 The function finds an optimal affine transform *[A|b]* (a 2 x 3 floating-point matrix) that
 approximates best the affine transformation between: