added gpu::calcHist function
This commit is contained in:
parent
69352e5241
commit
33df5ea07b
@ -1087,6 +1087,11 @@ namespace cv
|
||||
//! Supports CV_8UC4, CV_16UC4, CV_16SC4 and CV_32FC4 source types.
|
||||
//! Output hist[i] will have one row and (levels[i].cols-1) cols and CV_32SC1 type.
|
||||
CV_EXPORTS void histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4], Stream& stream = Stream::Null());
|
||||
|
||||
//! Calculates histogram for 8u one channel image
|
||||
//! Output hist will have one row, 256 cols and CV32SC1 type.
|
||||
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, Stream& stream = Stream::Null());
|
||||
CV_EXPORTS void calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream = Stream::Null());
|
||||
|
||||
//////////////////////////////// StereoBM_GPU ////////////////////////////////
|
||||
|
||||
|
193
modules/gpu/src/cuda/hist.cu
Normal file
193
modules/gpu/src/cuda/hist.cu
Normal file
@ -0,0 +1,193 @@
|
||||
/*M///////////////////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
|
||||
//
|
||||
// By downloading, copying, installing or using the software you agree to this license.
|
||||
// If you do not agree to this license, do not download, install,
|
||||
// copy or use the software.
|
||||
//
|
||||
//
|
||||
// License Agreement
|
||||
// For Open Source Computer Vision Library
|
||||
//
|
||||
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
|
||||
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
|
||||
// Copyright (C) 1993-2011, NVIDIA Corporation, all rights reserved.
|
||||
// Third party copyrights are property of their respective owners.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without modification,
|
||||
// are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistribution's of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
//
|
||||
// * Redistribution's in binary form must reproduce the above copyright notice,
|
||||
// this list of conditions and the following disclaimer in the documentation
|
||||
// and/or other materials provided with the distribution.
|
||||
//
|
||||
// * The name of the copyright holders may not be used to endorse or promote products
|
||||
// derived from this software without specific prior written permission.
|
||||
//
|
||||
// This software is provided by the copyright holders and contributors "as is" and
|
||||
// any express or bpied warranties, including, but not limited to, the bpied
|
||||
// warranties of merchantability and fitness for a particular purpose are disclaimed.
|
||||
// In no event shall the Intel Corporation or contributors be liable for any direct,
|
||||
// indirect, incidental, special, exemplary, or consequential damages
|
||||
// (including, but not limited to, procurement of substitute goods or services;
|
||||
// loss of use, data, or profits; or business interruption) however caused
|
||||
// and on any theory of liability, whether in contract, strict liability,
|
||||
// or tort (including negligence or otherwise) arising in any way out of
|
||||
// the use of this software, even if advised of the possibility of such damage.
|
||||
//
|
||||
//M*/
|
||||
|
||||
#include "internal_shared.hpp"
|
||||
#include "opencv2/gpu/device/saturate_cast.hpp"
|
||||
|
||||
using namespace cv::gpu;
|
||||
|
||||
using namespace cv::gpu::device;
|
||||
|
||||
#define UINT_BITS 32U
|
||||
|
||||
#define LOG2_WARP_SIZE 5U
|
||||
#define WARP_SIZE (1U << LOG2_WARP_SIZE)
|
||||
|
||||
//Warps == subhistograms per threadblock
|
||||
#define WARP_COUNT 6
|
||||
|
||||
//Threadblock size
|
||||
#define HISTOGRAM256_THREADBLOCK_SIZE (WARP_COUNT * WARP_SIZE)
|
||||
#define HISTOGRAM256_BIN_COUNT 256
|
||||
|
||||
//Shared memory per threadblock
|
||||
#define HISTOGRAM256_THREADBLOCK_MEMORY (WARP_COUNT * HISTOGRAM256_BIN_COUNT)
|
||||
|
||||
#define PARTIAL_HISTOGRAM256_COUNT 240
|
||||
|
||||
#define MERGE_THREADBLOCK_SIZE 256
|
||||
|
||||
#define USE_SMEM_ATOMICS (__CUDA_ARCH__ >= 120)
|
||||
|
||||
namespace cv { namespace gpu { namespace histograms
|
||||
{
|
||||
#if (!USE_SMEM_ATOMICS)
|
||||
|
||||
#define TAG_MASK ( (1U << (UINT_BITS - LOG2_WARP_SIZE)) - 1U )
|
||||
|
||||
__forceinline__ __device__ void addByte(volatile uint* s_WarpHist, uint data, uint threadTag)
|
||||
{
|
||||
uint count;
|
||||
do
|
||||
{
|
||||
count = s_WarpHist[data] & TAG_MASK;
|
||||
count = threadTag | (count + 1);
|
||||
s_WarpHist[data] = count;
|
||||
} while (s_WarpHist[data] != count);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define TAG_MASK 0xFFFFFFFFU
|
||||
|
||||
__forceinline__ __device__ void addByte(uint* s_WarpHist, uint data, uint threadTag)
|
||||
{
|
||||
atomicAdd(s_WarpHist + data, 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
__forceinline__ __device__ void addWord(uint* s_WarpHist, uint data, uint tag, uint pos_x, uint cols)
|
||||
{
|
||||
uint x = pos_x << 2;
|
||||
|
||||
if (x + 0 < cols) addByte(s_WarpHist, (data >> 0) & 0xFFU, tag);
|
||||
if (x + 1 < cols) addByte(s_WarpHist, (data >> 8) & 0xFFU, tag);
|
||||
if (x + 2 < cols) addByte(s_WarpHist, (data >> 16) & 0xFFU, tag);
|
||||
if (x + 3 < cols) addByte(s_WarpHist, (data >> 24) & 0xFFU, tag);
|
||||
}
|
||||
|
||||
__global__ void histogram256(PtrStep_<uint> d_Data, uint* d_PartialHistograms, uint dataCount, uint cols)
|
||||
{
|
||||
//Per-warp subhistogram storage
|
||||
__shared__ uint s_Hist[HISTOGRAM256_THREADBLOCK_MEMORY];
|
||||
uint* s_WarpHist= s_Hist + (threadIdx.x >> LOG2_WARP_SIZE) * HISTOGRAM256_BIN_COUNT;
|
||||
|
||||
//Clear shared memory storage for current threadblock before processing
|
||||
#pragma unroll
|
||||
for (uint i = 0; i < (HISTOGRAM256_THREADBLOCK_MEMORY / HISTOGRAM256_THREADBLOCK_SIZE); i++)
|
||||
s_Hist[threadIdx.x + i * HISTOGRAM256_THREADBLOCK_SIZE] = 0;
|
||||
|
||||
//Cycle through the entire data set, update subhistograms for each warp
|
||||
const uint tag = threadIdx.x << (UINT_BITS - LOG2_WARP_SIZE);
|
||||
|
||||
__syncthreads();
|
||||
const uint colsui = d_Data.step / sizeof(uint);
|
||||
for(uint pos = blockIdx.x * blockDim.x + threadIdx.x; pos < dataCount; pos += blockDim.x * gridDim.x)
|
||||
{
|
||||
uint pos_y = pos / colsui;
|
||||
uint pos_x = pos % colsui;
|
||||
uint data = d_Data.ptr(pos_y)[pos_x];
|
||||
addWord(s_WarpHist, data, tag, pos_x, cols);
|
||||
}
|
||||
|
||||
//Merge per-warp histograms into per-block and write to global memory
|
||||
__syncthreads();
|
||||
for(uint bin = threadIdx.x; bin < HISTOGRAM256_BIN_COUNT; bin += HISTOGRAM256_THREADBLOCK_SIZE)
|
||||
{
|
||||
uint sum = 0;
|
||||
|
||||
for (uint i = 0; i < WARP_COUNT; i++)
|
||||
sum += s_Hist[bin + i * HISTOGRAM256_BIN_COUNT] & TAG_MASK;
|
||||
|
||||
d_PartialHistograms[blockIdx.x * HISTOGRAM256_BIN_COUNT + bin] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Merge histogram256() output
|
||||
// Run one threadblock per bin; each threadblock adds up the same bin counter
|
||||
// from every partial histogram. Reads are uncoalesced, but mergeHistogram256
|
||||
// takes only a fraction of total processing time
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__global__ void mergeHistogram256(const uint* d_PartialHistograms, int* d_Histogram)
|
||||
{
|
||||
uint sum = 0;
|
||||
|
||||
#pragma unroll
|
||||
for (uint i = threadIdx.x; i < PARTIAL_HISTOGRAM256_COUNT; i += MERGE_THREADBLOCK_SIZE)
|
||||
sum += d_PartialHistograms[blockIdx.x + i * HISTOGRAM256_BIN_COUNT];
|
||||
|
||||
__shared__ uint data[MERGE_THREADBLOCK_SIZE];
|
||||
data[threadIdx.x] = sum;
|
||||
|
||||
for (uint stride = MERGE_THREADBLOCK_SIZE / 2; stride > 0; stride >>= 1)
|
||||
{
|
||||
__syncthreads();
|
||||
if(threadIdx.x < stride)
|
||||
data[threadIdx.x] += data[threadIdx.x + stride];
|
||||
}
|
||||
|
||||
if(threadIdx.x == 0)
|
||||
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
|
||||
}
|
||||
|
||||
void histogram256_gpu(DevMem2D src, int* hist, uint* buf, cudaStream_t stream)
|
||||
{
|
||||
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
|
||||
DevMem2D_<uint>(src),
|
||||
buf,
|
||||
src.rows * src.step / sizeof(uint),
|
||||
src.cols);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
mergeHistogram256<<<HISTOGRAM256_BIN_COUNT, MERGE_THREADBLOCK_SIZE, 0, stream>>>(buf, hist);
|
||||
|
||||
cudaSafeCall( cudaGetLastError() );
|
||||
|
||||
if (stream == 0)
|
||||
cudaSafeCall( cudaDeviceSynchronize() );
|
||||
}
|
||||
}}}
|
@ -71,6 +71,8 @@ void cv::gpu::histEven(const GpuMat&, GpuMat&, int, int, int, Stream&) { throw_n
|
||||
void cv::gpu::histEven(const GpuMat&, GpuMat*, int*, int*, int*, Stream&) { throw_nogpu(); }
|
||||
void cv::gpu::histRange(const GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
|
||||
void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*, Stream&) { throw_nogpu(); }
|
||||
void cv::gpu::calcHist(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||
void cv::gpu::calcHist(const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
|
||||
void cv::gpu::cornerHarris(const GpuMat&, GpuMat&, int, int, double, int) { throw_nogpu(); }
|
||||
void cv::gpu::cornerMinEigenVal(const GpuMat&, GpuMat&, int, int, int) { throw_nogpu(); }
|
||||
void cv::gpu::mulSpectrums(const GpuMat&, const GpuMat&, GpuMat&, int, bool) { throw_nogpu(); }
|
||||
@ -1037,6 +1039,33 @@ void cv::gpu::histRange(const GpuMat& src, GpuMat hist[4], const GpuMat levels[4
|
||||
hist_callers[src.depth()](src, hist, levels, StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
namespace cv { namespace gpu { namespace histograms
|
||||
{
|
||||
void histogram256_gpu(DevMem2D src, int* hist, unsigned int* buf, cudaStream_t stream);
|
||||
|
||||
const int PARTIAL_HISTOGRAM256_COUNT = 240;
|
||||
const int HISTOGRAM256_BIN_COUNT = 256;
|
||||
}}}
|
||||
|
||||
void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, Stream& stream)
|
||||
{
|
||||
GpuMat buf;
|
||||
calcHist(src, hist, buf, stream);
|
||||
}
|
||||
|
||||
void cv::gpu::calcHist(const GpuMat& src, GpuMat& hist, GpuMat& buf, Stream& stream)
|
||||
{
|
||||
using namespace cv::gpu::histograms;
|
||||
|
||||
CV_Assert(src.type() == CV_8UC1);
|
||||
|
||||
hist.create(1, 256, CV_32SC1);
|
||||
|
||||
ensureSizeIsEnough(1, PARTIAL_HISTOGRAM256_COUNT * HISTOGRAM256_BIN_COUNT, CV_32SC1, buf);
|
||||
|
||||
histogram256_gpu(src, hist.ptr<int>(), buf.ptr<unsigned int>(), StreamAccessor::getStream(stream));
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////
|
||||
// cornerHarris & minEgenVal
|
||||
|
||||
|
@ -967,7 +967,7 @@ INSTANTIATE_TEST_CASE_P(ImgProc, CvtColor, testing::Combine(
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// histograms
|
||||
|
||||
struct Histograms : testing::TestWithParam<cv::gpu::DeviceInfo>
|
||||
struct HistEven : testing::TestWithParam<cv::gpu::DeviceInfo>
|
||||
{
|
||||
static cv::Mat hsv;
|
||||
|
||||
@ -1014,9 +1014,9 @@ struct Histograms : testing::TestWithParam<cv::gpu::DeviceInfo>
|
||||
}
|
||||
};
|
||||
|
||||
cv::Mat Histograms::hsv;
|
||||
cv::Mat HistEven::hsv;
|
||||
|
||||
TEST_P(Histograms, Accuracy)
|
||||
TEST_P(HistEven, Accuracy)
|
||||
{
|
||||
ASSERT_TRUE(!hsv.empty());
|
||||
|
||||
@ -1038,7 +1038,61 @@ TEST_P(Histograms, Accuracy)
|
||||
EXPECT_MAT_NEAR(hist_gold, hist, 0.0);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(ImgProc, Histograms, testing::ValuesIn(devices()));
|
||||
INSTANTIATE_TEST_CASE_P(ImgProc, HistEven, testing::ValuesIn(devices()));
|
||||
|
||||
struct CalcHist : testing::TestWithParam<cv::gpu::DeviceInfo>
|
||||
{
|
||||
cv::gpu::DeviceInfo devInfo;
|
||||
|
||||
cv::Size size;
|
||||
cv::Mat src;
|
||||
cv::Mat hist_gold;
|
||||
|
||||
virtual void SetUp()
|
||||
{
|
||||
devInfo = GetParam();
|
||||
|
||||
cv::gpu::setDevice(devInfo.deviceID());
|
||||
|
||||
cv::RNG& rng = cvtest::TS::ptr()->get_rng();
|
||||
|
||||
size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
|
||||
|
||||
src = cvtest::randomMat(rng, size, CV_8UC1, 0, 255, false);
|
||||
|
||||
hist_gold.create(1, 256, CV_32SC1);
|
||||
hist_gold.setTo(cv::Scalar::all(0));
|
||||
|
||||
int* hist = hist_gold.ptr<int>();
|
||||
for (int y = 0; y < src.rows; ++y)
|
||||
{
|
||||
const uchar* src_row = src.ptr(y);
|
||||
|
||||
for (int x = 0; x < src.cols; ++x)
|
||||
++hist[src_row[x]];
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
TEST_P(CalcHist, Accuracy)
|
||||
{
|
||||
PRINT_PARAM(devInfo);
|
||||
PRINT_PARAM(size);
|
||||
|
||||
cv::Mat hist;
|
||||
|
||||
ASSERT_NO_THROW(
|
||||
cv::gpu::GpuMat gpuHist;
|
||||
|
||||
cv::gpu::calcHist(cv::gpu::GpuMat(src), gpuHist);
|
||||
|
||||
gpuHist.download(hist);
|
||||
);
|
||||
|
||||
EXPECT_MAT_NEAR(hist_gold, hist, 0.0);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(ImgProc, CalcHist, testing::ValuesIn(devices()));
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// cornerHarris
|
||||
|
@ -875,7 +875,7 @@ TEST(pyrDown)
|
||||
{
|
||||
SUBTEST << "size " << size;
|
||||
|
||||
Mat src; gen(src, 1000, 1000, CV_16SC3, 0, 256);
|
||||
Mat src; gen(src, size, size, CV_16SC3, 0, 256);
|
||||
Mat dst(Size(src.cols / 2, src.rows / 2), src.type());
|
||||
|
||||
CPU_ON;
|
||||
@ -899,7 +899,7 @@ TEST(pyrUp)
|
||||
{
|
||||
SUBTEST << "size " << size;
|
||||
|
||||
Mat src; gen(src, 1000, 1000, CV_16SC3, 0, 256);
|
||||
Mat src; gen(src, size, size, CV_16SC3, 0, 256);
|
||||
Mat dst(Size(src.cols * 2, src.rows * 2), src.type());
|
||||
|
||||
CPU_ON;
|
||||
|
Loading…
Reference in New Issue
Block a user