fixed several bugs in gpu arithm functions

refactored tests for them
This commit is contained in:
Vladislav Vinogradov
2012-03-19 14:18:12 +00:00
parent f58c40bfab
commit 844bdea5ac
5 changed files with 1713 additions and 892 deletions

View File

@@ -47,7 +47,7 @@
#include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
namespace cv { namespace gpu { namespace device
namespace cv { namespace gpu { namespace device
{
//////////////////////////////////////////////////////////////////////////
// add
@@ -684,7 +684,7 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ uchar4 operator ()(uchar4 a, float b) const
{
return b != 0 ? make_uchar4(saturate_cast<uchar>(a.x / b), saturate_cast<uchar>(a.y / b),
saturate_cast<uchar>(a.z / b), saturate_cast<uchar>(a.w / b))
saturate_cast<uchar>(a.z / b), saturate_cast<uchar>(a.w / b))
: make_uchar4(0,0,0,0);
}
};
@@ -706,8 +706,8 @@ namespace cv { namespace gpu { namespace device
{
__device__ __forceinline__ short4 operator ()(short4 a, float b) const
{
return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<uchar>(a.y / b),
saturate_cast<short>(a.z / b), saturate_cast<uchar>(a.w / b))
return b != 0 ? make_short4(saturate_cast<short>(a.x / b), saturate_cast<short>(a.y / b),
saturate_cast<short>(a.z / b), saturate_cast<short>(a.w / b))
: make_short4(0,0,0,0);
}
};
@@ -1106,10 +1106,10 @@ namespace cv { namespace gpu { namespace device
//template void absdiff_gpu<uchar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
template void absdiff_gpu<schar >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
//template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
//template void absdiff_gpu<ushort>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
template void absdiff_gpu<short >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
template void absdiff_gpu<int >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
//template void absdiff_gpu<float >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
template void absdiff_gpu<int >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
//template void absdiff_gpu<float >(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
template void absdiff_gpu<double>(const DevMem2Db& src1, double src2, const DevMem2Db& dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////////////////
@@ -1251,7 +1251,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
struct UnOp<T, UN_OP_NOT>
{
{
static __device__ __forceinline__ T call(T v) { return ~v; }
};
@@ -1262,7 +1262,7 @@ namespace cv { namespace gpu { namespace device
const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (y < rows)
if (y < rows)
{
uchar* dst_ptr = dst.ptr(y) + x;
const uchar* src_ptr = src.ptr(y) + x;
@@ -1283,29 +1283,29 @@ namespace cv { namespace gpu { namespace device
template <int opid>
void bitwiseUnOp(int rows, int width, const PtrStepb src, PtrStepb dst,
void bitwiseUnOp(int rows, int width, const PtrStepb src, PtrStepb dst,
cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(width, threads.x * sizeof(uint)),
dim3 grid(divUp(width, threads.x * sizeof(uint)),
divUp(rows, threads.y));
bitwiseUnOpKernel<opid><<<grid, threads>>>(rows, width, src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, int opid>
__global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStepb src,
__global__ void bitwiseUnOpKernel(int rows, int cols, int cn, const PtrStepb src,
const PtrStepb mask, PtrStepb dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < cols && y < rows && mask.ptr(y)[x / cn])
if (x < cols && y < rows && mask.ptr(y)[x / cn])
{
T* dst_row = (T*)dst.ptr(y);
const T* src_row = (const T*)src.ptr(y);
@@ -1316,21 +1316,21 @@ namespace cv { namespace gpu { namespace device
template <typename T, int opid>
void bitwiseUnOp(int rows, int cols, int cn, const PtrStepb src,
void bitwiseUnOp(int rows, int cols, int cn, const PtrStepb src,
const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
{
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
bitwiseUnOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst);
bitwiseUnOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn,
void bitwiseNotCaller(int rows, int cols, size_t elem_size1, int cn,
const PtrStepb src, PtrStepb dst, cudaStream_t stream)
{
bitwiseUnOp<UN_OP_NOT>(rows, static_cast<int>(cols * elem_size1 * cn), src, dst, stream);
@@ -1338,7 +1338,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src,
void bitwiseMaskNotCaller(int rows, int cols, int cn, const PtrStepb src,
const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
{
bitwiseUnOp<T, UN_OP_NOT>(rows, cols * cn, cn, src, mask, dst, stream);
@@ -1359,32 +1359,32 @@ namespace cv { namespace gpu { namespace device
template <typename T>
struct BinOp<T, BIN_OP_OR>
{
static __device__ __forceinline__ T call(T a, T b) { return a | b; }
{
static __device__ __forceinline__ T call(T a, T b) { return a | b; }
};
template <typename T>
struct BinOp<T, BIN_OP_AND>
{
static __device__ __forceinline__ T call(T a, T b) { return a & b; }
{
static __device__ __forceinline__ T call(T a, T b) { return a & b; }
};
template <typename T>
struct BinOp<T, BIN_OP_XOR>
{
static __device__ __forceinline__ T call(T a, T b) { return a ^ b; }
{
static __device__ __forceinline__ T call(T a, T b) { return a ^ b; }
};
template <int opid>
__global__ void bitwiseBinOpKernel(int rows, int width, const PtrStepb src1,
__global__ void bitwiseBinOpKernel(int rows, int width, const PtrStepb src1,
const PtrStepb src2, PtrStepb dst)
{
const int x = (blockDim.x * blockIdx.x + threadIdx.x) * 4;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (y < rows)
if (y < rows)
{
uchar* dst_ptr = dst.ptr(y) + x;
const uchar* src1_ptr = src1.ptr(y) + x;
@@ -1407,7 +1407,7 @@ namespace cv { namespace gpu { namespace device
template <int opid>
void bitwiseBinOp(int rows, int width, const PtrStepb src1, const PtrStepb src2,
void bitwiseBinOp(int rows, int width, const PtrStepb src1, const PtrStepb src2,
PtrStepb dst, cudaStream_t stream)
{
dim3 threads(16, 16);
@@ -1416,20 +1416,20 @@ namespace cv { namespace gpu { namespace device
bitwiseBinOpKernel<opid><<<grid, threads>>>(rows, width, src1, src2, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T, int opid>
__global__ void bitwiseBinOpKernel(
int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
const PtrStepb mask, PtrStepb dst)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < cols && y < rows && mask.ptr(y)[x / cn])
if (x < cols && y < rows && mask.ptr(y)[x / cn])
{
T* dst_row = (T*)dst.ptr(y);
const T* src1_row = (const T*)src1.ptr(y);
@@ -1441,7 +1441,7 @@ namespace cv { namespace gpu { namespace device
template <typename T, int opid>
void bitwiseBinOp(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
void bitwiseBinOp(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
{
dim3 threads(16, 16);
@@ -1450,12 +1450,12 @@ namespace cv { namespace gpu { namespace device
bitwiseBinOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
void bitwiseOrCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
{
bitwiseBinOp<BIN_OP_OR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
@@ -1463,7 +1463,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
void bitwiseMaskOrCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
{
bitwiseBinOp<T, BIN_OP_OR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
@@ -1474,7 +1474,7 @@ namespace cv { namespace gpu { namespace device
template void bitwiseMaskOrCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
void bitwiseAndCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
{
bitwiseBinOp<BIN_OP_AND>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
@@ -1482,7 +1482,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
void bitwiseMaskAndCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
{
bitwiseBinOp<T, BIN_OP_AND>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
@@ -1493,7 +1493,7 @@ namespace cv { namespace gpu { namespace device
template void bitwiseMaskAndCaller<uint>(int, int, int, const PtrStepb, const PtrStepb, const PtrStepb, PtrStepb, cudaStream_t);
void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
void bitwiseXorCaller(int rows, int cols, size_t elem_size1, int cn, const PtrStepb src1,
const PtrStepb src2, PtrStepb dst, cudaStream_t stream)
{
bitwiseBinOp<BIN_OP_XOR>(rows, static_cast<int>(cols * elem_size1 * cn), src1, src2, dst, stream);
@@ -1501,7 +1501,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
void bitwiseMaskXorCaller(int rows, int cols, int cn, const PtrStepb src1, const PtrStepb src2,
const PtrStepb mask, PtrStepb dst, cudaStream_t stream)
{
bitwiseBinOp<T, BIN_OP_XOR>(rows, cols * cn, cn, src1, src2, mask, dst, stream);
@@ -1546,7 +1546,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
cv::gpu::device::transform(src1, src2, dst, minimum<T>(), WithOutMask(), stream);
cv::gpu::device::transform(src1, src2, dst, minimum<T>(), WithOutMask(), stream);
}
template void min_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
@@ -1560,7 +1560,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
cv::gpu::device::transform(src1, src2, dst, maximum<T>(), WithOutMask(), stream);
cv::gpu::device::transform(src1, src2, dst, maximum<T>(), WithOutMask(), stream);
}
template void max_gpu<uchar >(const DevMem2Db& src1, const DevMem2Db& src2, const DevMem2Db& dst, cudaStream_t stream);
@@ -1574,7 +1574,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
cv::gpu::device::transform(src1, dst, device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
cv::gpu::device::transform(src1, dst, device::bind2nd(minimum<T>(), src2), WithOutMask(), stream);
}
template void min_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);
@@ -1588,7 +1588,7 @@ namespace cv { namespace gpu { namespace device
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
cv::gpu::device::transform(src1, dst, device::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
cv::gpu::device::transform(src1, dst, device::bind2nd(maximum<T>(), src2), WithOutMask(), stream);
}
template void max_gpu<uchar >(const DevMem2Db& src1, uchar src2, const DevMem2Db& dst, cudaStream_t stream);
@@ -1647,12 +1647,12 @@ namespace cv { namespace gpu { namespace device
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal, cudaStream_t stream);
static const caller_t callers[] =
static const caller_t callers[] =
{
threshold_caller<thresh_binary_func, T>,
threshold_caller<thresh_binary_inv_func, T>,
threshold_caller<thresh_trunc_func, T>,
threshold_caller<thresh_to_zero_func, T>,
threshold_caller<thresh_binary_func, T>,
threshold_caller<thresh_binary_inv_func, T>,
threshold_caller<thresh_trunc_func, T>,
threshold_caller<thresh_to_zero_func, T>,
threshold_caller<thresh_to_zero_inv_func, T>
};
@@ -1671,14 +1671,14 @@ namespace cv { namespace gpu { namespace device
// pow
template<typename T, bool Signed = device::numeric_limits<T>::is_signed> struct PowOp : unary_function<T, T>
{
{
float power;
PowOp(float power_) : power(power_) {}
__device__ __forceinline__ T operator()(const T& e) const
{
{
return saturate_cast<T>(__powf((float)e, power));
}
}
};
template<typename T> struct PowOp<T, true> : unary_function<T, T>
@@ -1688,11 +1688,11 @@ namespace cv { namespace gpu { namespace device
__device__ __forceinline__ float operator()(const T& e) const
{
T res = saturate_cast<T>(__powf((float)e, power));
T res = saturate_cast<T>(__powf((float)e, power));
if ( (e < 0) && (1 & (int)power) )
res *= -1;
return res;
res *= -1;
return res;
}
};
@@ -1736,7 +1736,7 @@ namespace cv { namespace gpu { namespace device
void pow_caller(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream)
{
cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, PowOp<T>(power), WithOutMask(), stream);
}
}
template void pow_caller<uchar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);
template void pow_caller<schar>(const DevMem2Db& src, float power, DevMem2Db dst, cudaStream_t stream);