diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index 2af921f1f..5ff90c9b2 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1007,11 +1007,15 @@ namespace cv
             GpuMat table_space;
         };
 
+
+        //////////////// HOG (Histogram-of-Oriented-Gradients) Descriptor and Object Detector //////////////
+
         struct CV_EXPORTS HOGDescriptor
         {
         public:
             enum { DEFAULT_WIN_SIGMA = -1 };
             enum { DEFAULT_NLEVELS = 64 };
+            enum { DESCR_FORMAT_ROW_BY_ROW, DESCR_FORMAT_COL_BY_COL };
 
             HOGDescriptor(Size win_size=Size(64, 128), Size block_size=Size(16, 16), 
                           Size block_stride=Size(8, 8), Size cell_size=Size(8, 8), 
@@ -1029,13 +1033,14 @@ namespace cv
             void setSVMDetector(const vector<float>& detector);
             bool checkDetectorSize() const;
 
-            void computeBlockHistograms(const GpuMat& img);
             void detect(const GpuMat& img, vector<Point>& found_locations, double hit_threshold=0, 
                         Size win_stride=Size(), Size padding=Size());
             void detectMultiScale(const GpuMat& img, vector<Rect>& found_locations, 
                                   double hit_threshold=0, Size win_stride=Size(), Size padding=Size(),
                                   double scale0=1.05, int group_threshold=2);
-            void getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors);
+
+            void getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors, 
+                                int descr_format=DESCR_FORMAT_COL_BY_COL);
 
             Size win_size;
             Size block_size;
@@ -1044,9 +1049,17 @@ namespace cv
             int nbins;
             double win_sigma;
             double threshold_L2hys;
-            bool gamma_correction;
             int nlevels;
 
+        protected:
+            void computeBlockHistograms(const GpuMat& img);
+            void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
+
+            static int numPartsWithin(int size, int part_size, int stride);
+            static Size numPartsWithin(Size size, Size part_size, Size stride);
+
+            bool gamma_correction;
+
             // Coefficients of the separating plane
             float free_coef;
             GpuMat detector;
@@ -1058,13 +1071,8 @@ namespace cv
             // Results of the last histogram evaluation step
             GpuMat block_hists;
 
-        private:
-            static int numPartsWithin(int size, int part_size, int stride);
-            static Size numPartsWithin(Size size, Size part_size, Size stride);
-
-            void computeGradient(const GpuMat& img, GpuMat& grad, GpuMat& qangle);
-
-            GpuMat grad, qangle;
+            // Gradient computation results
+            GpuMat grad, qangle;            
         };    
     }
 
diff --git a/modules/gpu/src/cuda/hog.cu b/modules/gpu/src/cuda/hog.cu
index 3dc7147b6..b1fee4c3c 100644
--- a/modules/gpu/src/cuda/hog.cu
+++ b/modules/gpu/src/cuda/hog.cu
@@ -428,9 +428,9 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
 
 
 template <int nthreads>
-__global__ void extract_descriptors_kernel(const int img_win_width, const int img_block_width, 
-                                           const int win_block_stride_x, const int win_block_stride_y,
-                                           const float* block_hists, PtrElemStepf descriptors)
+__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, 
+                                              const int win_block_stride_y, const float* block_hists, 
+                                              PtrElemStepf descriptors)
 {
     // Get left top corner of the window in src
     const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + 
@@ -449,9 +449,9 @@ __global__ void extract_descriptors_kernel(const int img_win_width, const int im
 }
 
 
-void extract_descriptors(int win_height, int win_width, int block_stride_y, int block_stride_x, 
-                         int win_stride_y, int win_stride_x, int height, int width, float* block_hists, 
-                         DevMem2Df descriptors)
+void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, 
+                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists, 
+                            DevMem2Df descriptors)
 {
     const int nthreads = 256;
 
@@ -464,9 +464,56 @@ void extract_descriptors(int win_height, int win_width, int block_stride_y, int
 
     int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / 
                           block_stride_x;
-    extract_descriptors_kernel<nthreads><<<grid, threads>>>(
-        img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, 
-        block_hists, descriptors);
+    extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
+        img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
+    cudaSafeCall(cudaThreadSynchronize());
+}
+
+// Gathers one window's block histograms into its descriptor row, column by column (one CUDA block per window)
+template <int nthreads>
+__global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x, 
+                                              const int win_block_stride_y, const float* block_hists, 
+                                              PtrElemStepf descriptors)
+{
+    // First block histogram of this window in src: window (blockIdx.x, blockIdx.y) mapped to a block offset
+    const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + 
+                                       blockIdx.x * win_block_stride_x) * cblock_hist_size;
+
+    // Destination row of this window in dst: windows are numbered row by row over the launch grid
+    float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);
+
+    // Copy src to dst, nthreads elements per pass (the c* names are declared elsewhere; presumably device constants)
+    for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
+    {
+        int block_idx = i / cblock_hist_size; // block index within the window, counted row by row
+        int idx_in_block = i - block_idx * cblock_hist_size; // element offset inside that block's histogram
+
+        int y = block_idx / cnblocks_win_x; // block row within the window
+        int x = block_idx - y * cnblocks_win_x; // block column within the window
+
+        descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] // dst: blocks stored column by column
+            = hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block]; // src: row-major image block grid
+    }
+}
+
+// Host launcher for extract_descrs_by_cols_kernel: one CUDA block per window position, then waits for completion
+void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, 
+                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists, 
+                            DevMem2Df descriptors)
+{
+    const int nthreads = 256; // threads per CUDA block; also passed as the kernel's template argument
+
+    int win_block_stride_x = win_stride_x / block_stride_x; // horizontal window stride in blocks (integer division)
+    int win_block_stride_y = win_stride_y / block_stride_y; // vertical window stride in blocks (integer division)
+    int img_win_width = (width - win_width + win_stride_x) / win_stride_x; // window positions per image row
+    int img_win_height = (height - win_height + win_stride_y) / win_stride_y; // window positions per image column
+    dim3 threads(nthreads, 1);
+    dim3 grid(img_win_width, img_win_height); // one CUDA block per window position
+
+    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / 
+                          block_stride_x; // block positions per image row
+    extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
+        img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
     cudaSafeCall(cudaThreadSynchronize());
 }
 
diff --git a/modules/gpu/src/hog.cpp b/modules/gpu/src/hog.cpp
index 7f0b3eaec..ad4b66e21 100644
--- a/modules/gpu/src/hog.cpp
+++ b/modules/gpu/src/hog.cpp
@@ -50,11 +50,9 @@ size_t cv::gpu::HOGDescriptor::getBlockHistogramSize() const { throw_nogpu(); re
 double cv::gpu::HOGDescriptor::getWinSigma() const { throw_nogpu(); return 0; }
 bool cv::gpu::HOGDescriptor::checkDetectorSize() const { throw_nogpu(); return false; }
 void cv::gpu::HOGDescriptor::setSVMDetector(const vector<float>&) { throw_nogpu(); }
-void cv::gpu::HOGDescriptor::computeGradient(const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
-void cv::gpu::HOGDescriptor::computeBlockHistograms(const GpuMat&) { throw_nogpu(); }
 void cv::gpu::HOGDescriptor::detect(const GpuMat&, vector<Point>&, double, Size, Size) { throw_nogpu(); }
 void cv::gpu::HOGDescriptor::detectMultiScale(const GpuMat&, vector<Rect>&, double, Size, Size, double, int) { throw_nogpu(); }
-void cv::gpu::HOGDescriptor::getDescriptors(const GpuMat&, Size, GpuMat&) { throw_nogpu(); }
+void cv::gpu::HOGDescriptor::getDescriptors(const GpuMat&, Size, GpuMat&, int) { throw_nogpu(); }
 std::vector<float> cv::gpu::HOGDescriptor::getDefaultPeopleDetector() { throw_nogpu(); return std::vector<float>(); }
 std::vector<float> cv::gpu::HOGDescriptor::getPeopleDetector_48x96() { throw_nogpu(); return std::vector<float>(); }
 std::vector<float> cv::gpu::HOGDescriptor::getPeopleDetector_64x128() { throw_nogpu(); return std::vector<float>(); }
@@ -78,9 +76,12 @@ void classify_hists(int win_height, int win_width, int block_stride_y,
                     int width, float* block_hists, float* coefs, float free_coef, 
                     float threshold, unsigned char* labels);
 
-void extract_descriptors(int win_height, int win_width, int block_stride_y, int block_stride_x, 
-                         int win_stride_y, int win_stride_x, int height, int width, float* block_hists, 
-                         cv::gpu::DevMem2Df descriptors);
+void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, 
+                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists, 
+                            cv::gpu::DevMem2Df descriptors);
+void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, 
+                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists, 
+                            cv::gpu::DevMem2Df descriptors);
 
 void compute_gradients_8UC1(int nbins, int height, int width, const cv::gpu::DevMem2D& img, 
                             float angle_scale, cv::gpu::DevMem2Df grad, cv::gpu::DevMem2D qangle);
@@ -218,7 +219,7 @@ void cv::gpu::HOGDescriptor::computeBlockHistograms(const GpuMat& img)
 }
 
 
-void cv::gpu::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors)
+void cv::gpu::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride, GpuMat& descriptors, int descr_format)
 {
     CV_Assert(win_stride.width % block_stride.width == 0 &&
               win_stride.height % block_stride.height == 0);
@@ -231,9 +232,21 @@ void cv::gpu::HOGDescriptor::getDescriptors(const GpuMat& img, Size win_stride,
 
     descriptors.create(wins_per_img.area(), blocks_per_win.area() * block_hist_size, CV_32F);
 
-    hog::extract_descriptors(win_size.height, win_size.width, block_stride.height, block_stride.width, 
-                             win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(), 
-                             descriptors);
+    switch (descr_format)
+    {
+    case DESCR_FORMAT_ROW_BY_ROW:
+        hog::extract_descrs_by_rows(win_size.height, win_size.width, block_stride.height, block_stride.width, 
+                                    win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(), 
+                                    descriptors);
+        break;
+    case DESCR_FORMAT_COL_BY_COL:
+        hog::extract_descrs_by_cols(win_size.height, win_size.width, block_stride.height, block_stride.width, 
+                                    win_stride.height, win_stride.width, img.rows, img.cols, block_hists.ptr<float>(), 
+                                    descriptors);
+        break;
+    default:
+        CV_Error(CV_StsBadArg, "Unknown descriptor format");
+    }
 }
 
 
diff --git a/tests/gpu/src/hog.cpp b/tests/gpu/src/hog.cpp
index b4366b5ab..69c7c2be5 100644
--- a/tests/gpu/src/hog.cpp
+++ b/tests/gpu/src/hog.cpp
@@ -51,9 +51,9 @@ using namespace std;
     ts->set_failed_test_info(err); \
     return; }
 
-struct CV_GpuHogDetectionTest: public CvTest 
+struct CV_GpuHogDetectionTest: public CvTest, public cv::gpu::HOGDescriptor
 {
-    CV_GpuHogDetectionTest(): CvTest( "GPU-HOG-detect", "HOGDescriptorDetection" ) {}
+    CV_GpuHogDetectionTest(): CvTest("GPU-HOG-detect", "HOGDescriptorDetection") {}
 
     void run(int) 
     {       
@@ -141,54 +141,53 @@ struct CV_GpuHogDetectionTest: public CvTest
     {
         cv::gpu::GpuMat d_img(img);
 
-        cv::gpu::HOGDescriptor hog;
-        hog.setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
+        setSVMDetector(cv::gpu::HOGDescriptor::getDefaultPeopleDetector());
         //cpu detector may be updated soon
         //hog.setSVMDetector(cv::HOGDescriptor::getDefaultPeopleDetector());
 
         std::vector<cv::Point> locations;
 
         // Test detect
-        hog.detect(d_img, locations, 0);
+        detect(d_img, locations, 0);
 
 #ifdef DUMP
-        dump(hog.block_hists, locations);
+        dump(block_hists, locations);
 #else
-        compare(hog.block_hists, locations);
+        compare(block_hists, locations);
 #endif
 
         // Test detect on smaller image
         cv::gpu::GpuMat d_img2;
         cv::gpu::resize(d_img, d_img2, cv::Size(d_img.cols / 2, d_img.rows / 2)); 
-        hog.detect(d_img2, locations, 0);
+        detect(d_img2, locations, 0);
 
 #ifdef DUMP
-        dump(hog.block_hists, locations);
+        dump(block_hists, locations);
 #else
-        compare(hog.block_hists, locations);
+        compare(block_hists, locations);
 #endif
 
         // Test detect on greater image
         cv::gpu::resize(d_img, d_img2, cv::Size(d_img.cols * 2, d_img.rows * 2)); 
-        hog.detect(d_img2, locations, 0);
+        detect(d_img2, locations, 0);
         
 #ifdef DUMP
-        dump(hog.block_hists, locations);
+        dump(block_hists, locations);
 #else
-        compare(hog.block_hists, locations);
+        compare(block_hists, locations);
 #endif
 
         // Test detectMultiScale
         std::vector<cv::Rect> rects;
         size_t nrects;
-        hog.detectMultiScale(d_img, rects, 0, cv::Size(8, 8), cv::Size(), 1.05, 2);
+        detectMultiScale(d_img, rects, 0, cv::Size(8, 8), cv::Size(), 1.05, 2);
 
 #ifdef DUMP
         nrects = rects.size();
         f.write((char*)&nrects, sizeof(nrects));
         for (size_t i = 0; i < rects.size(); ++i)
             f.write((char*)&rects[i], sizeof(rects[i]));
-        dump(hog.block_hists, std::vector<cv::Point>());
+        dump(block_hists, std::vector<cv::Point>());
 #else
         f.read((char*)&nrects, sizeof(nrects));
         CHECK(nrects == rects.size(), CvTS::FAIL_INVALID_OUTPUT)
@@ -198,7 +197,7 @@ struct CV_GpuHogDetectionTest: public CvTest
             f.read((char*)&rect, sizeof(rect));
             CHECK(rect == rects[i], CvTS::FAIL_INVALID_OUTPUT);
         }
-        compare(hog.block_hists, std::vector<cv::Point>());
+        compare(block_hists, std::vector<cv::Point>());
 #endif
     }
 
@@ -211,9 +210,10 @@ struct CV_GpuHogDetectionTest: public CvTest
 } gpu_hog_detection_test;
 
 
-struct CV_GpuHogGetDescriptorsTest: public CvTest 
+struct CV_GpuHogGetDescriptorsTest: public CvTest, public cv::gpu::HOGDescriptor
 {
-    CV_GpuHogGetDescriptorsTest(): CvTest("GPU-HOG-getDescriptors", "HOGDescriptorGetDescriptors") {}
+    CV_GpuHogGetDescriptorsTest(): 
+        CvTest("GPU-HOG-getDescriptors", "HOGDescriptorGetDescriptors"), HOGDescriptor(cv::Size(64, 128)) {}
 
     void run(int)
     {
@@ -228,12 +228,11 @@ struct CV_GpuHogGetDescriptorsTest: public CvTest
             cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
             cv::gpu::GpuMat d_img(img);
 
-            cv::Size win_size(64, 128);
-            cv::gpu::HOGDescriptor hog(win_size);
 
             // Convert train images into feature vectors (train table)
-            cv::gpu::GpuMat descriptors;
-            hog.getDescriptors(d_img, win_size, descriptors);
+            cv::gpu::GpuMat descriptors, descriptors_by_cols;
+            getDescriptors(d_img, win_size, descriptors, DESCR_FORMAT_ROW_BY_ROW);
+            getDescriptors(d_img, win_size, descriptors_by_cols, DESCR_FORMAT_COL_BY_COL);
 
             // Check size of the result train table
             wins_per_img_x = 3;
@@ -245,6 +244,20 @@ struct CV_GpuHogGetDescriptorsTest: public CvTest
                                                     wins_per_img_x * wins_per_img_y);                                                
             CHECK(descriptors.size() == descr_size_expected, CvTS::FAIL_INVALID_OUTPUT);
 
+            // Check both formats of output descriptors are handled correctly
+            cv::Mat dr(descriptors);
+            cv::Mat dc(descriptors_by_cols);
+            for (int i = 0; i < wins_per_img_x * wins_per_img_y; ++i)
+            {
+                const float* l = dr.rowRange(i, i + 1).ptr<float>();
+                const float* r = dc.rowRange(i, i + 1).ptr<float>();
+                for (int y = 0; y < blocks_per_win_y; ++y)
+                    for (int x = 0; x < blocks_per_win_x; ++x)
+                        for (int k = 0; k < block_hist_size; ++k)
+                            CHECK(l[(y * blocks_per_win_x + x) * block_hist_size + k] == 
+                                  r[(x * blocks_per_win_y + y) * block_hist_size + k], CvTS::FAIL_INVALID_OUTPUT);
+            }
+
             /* Now we want to extract the same feature vectors, but from single images. NOTE: results will 
             be defferent, due to border values interpolation. Using of many small images is slower, however we 
             wont't call getDescriptors and will use computeBlockHistograms instead of. computeBlockHistograms 
@@ -253,39 +266,39 @@ struct CV_GpuHogGetDescriptorsTest: public CvTest
             img_rgb = cv::imread(std::string(ts->get_data_path()) + "hog/positive1.png");
             CHECK(!img_rgb.empty(), CvTS::FAIL_MISSING_TEST_DATA);
             cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
-            hog.computeBlockHistograms(cv::gpu::GpuMat(img));
+            computeBlockHistograms(cv::gpu::GpuMat(img));
             // Everything is fine with interpolation for left top subimage
-            CHECK(cv::norm(hog.block_hists, descriptors.rowRange(0, 1)) == 0.f, CvTS::FAIL_INVALID_OUTPUT);
+            CHECK(cv::norm(block_hists, descriptors.rowRange(0, 1)) == 0.f, CvTS::FAIL_INVALID_OUTPUT);
 
             img_rgb = cv::imread(std::string(ts->get_data_path()) + "hog/positive2.png");
             CHECK(!img_rgb.empty(), CvTS::FAIL_MISSING_TEST_DATA);
             cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
-            hog.computeBlockHistograms(cv::gpu::GpuMat(img));
-            compare_inner_parts(hog.block_hists, descriptors.rowRange(1, 2));
+            computeBlockHistograms(cv::gpu::GpuMat(img));
+            compare_inner_parts(block_hists, descriptors.rowRange(1, 2));
 
             img_rgb = cv::imread(std::string(ts->get_data_path()) + "hog/negative1.png");
             CHECK(!img_rgb.empty(), CvTS::FAIL_MISSING_TEST_DATA);
             cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
-            hog.computeBlockHistograms(cv::gpu::GpuMat(img));
-            compare_inner_parts(hog.block_hists, descriptors.rowRange(2, 3));
+            computeBlockHistograms(cv::gpu::GpuMat(img));
+            compare_inner_parts(block_hists, descriptors.rowRange(2, 3));
 
             img_rgb = cv::imread(std::string(ts->get_data_path()) + "hog/negative2.png");
             CHECK(!img_rgb.empty(), CvTS::FAIL_MISSING_TEST_DATA);
             cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
-            hog.computeBlockHistograms(cv::gpu::GpuMat(img));
-            compare_inner_parts(hog.block_hists, descriptors.rowRange(3, 4));
+            computeBlockHistograms(cv::gpu::GpuMat(img));
+            compare_inner_parts(block_hists, descriptors.rowRange(3, 4));
 
             img_rgb = cv::imread(std::string(ts->get_data_path()) + "hog/positive3.png");
             CHECK(!img_rgb.empty(), CvTS::FAIL_MISSING_TEST_DATA);
             cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
-            hog.computeBlockHistograms(cv::gpu::GpuMat(img));
-            compare_inner_parts(hog.block_hists, descriptors.rowRange(4, 5));
+            computeBlockHistograms(cv::gpu::GpuMat(img));
+            compare_inner_parts(block_hists, descriptors.rowRange(4, 5));
 
             img_rgb = cv::imread(std::string(ts->get_data_path()) + "hog/negative3.png");
             CHECK(!img_rgb.empty(), CvTS::FAIL_MISSING_TEST_DATA);
             cv::cvtColor(img_rgb, img, CV_BGR2BGRA);
-            hog.computeBlockHistograms(cv::gpu::GpuMat(img));
-            compare_inner_parts(hog.block_hists, descriptors.rowRange(5, 6));
+            computeBlockHistograms(cv::gpu::GpuMat(img));
+            compare_inner_parts(block_hists, descriptors.rowRange(5, 6));
         }
         catch (const cv::Exception& e)
         {