diff --git a/modules/gpu/include/opencv2/gpu/gpu.hpp b/modules/gpu/include/opencv2/gpu/gpu.hpp
index ee9b734f1..19c6a8eaf 100644
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -482,6 +482,10 @@ namespace cv
 
         ////////////////////////////// Arithmetics ///////////////////////////////////
 
+        //! implements generalized matrix product algorithm GEMM from BLAS
+        CV_EXPORTS void gemm(const GpuMat& src1, const GpuMat& src2, double alpha,
+            const GpuMat& src3, double beta, GpuMat& dst, int flags = 0, Stream& stream = Stream::Null());
+
         //! transposes the matrix
         //! supports matrix with element size = 1, 4 and 8 bytes (CV_8UC1, CV_8UC4, CV_16UC2, CV_32FC1, etc)
         CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst, Stream& stream = Stream::Null());
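Note: the new cv::gpu::gemm mirrors the CPU cv::gemm interface, computing dst = alpha*op(src1)*op(src2) + beta*op(src3), with op() selected by the GEMM_1_T/GEMM_2_T/GEMM_3_T flags. A minimal usage sketch, assuming this patch is applied and OpenCV is built with CUBLAS support (sizes and scalars here are arbitrary, not taken from the patch):

    // Hypothetical standalone example, not part of the patch.
    #include <opencv2/core/core.hpp>
    #include <opencv2/gpu/gpu.hpp>

    int main()
    {
        cv::Mat src1(512, 512, CV_32FC1), src2(512, 512, CV_32FC1), src3(512, 512, CV_32FC1);
        cv::randu(src1, -1.0, 1.0);
        cv::randu(src2, -1.0, 1.0);
        cv::randu(src3, -1.0, 1.0);

        // Upload, compute dst = 1.0 * src1 * src2^T + 0.5 * src3, download.
        cv::gpu::GpuMat d_src1(src1), d_src2(src2), d_src3(src3), d_dst;
        cv::gpu::gemm(d_src1, d_src2, 1.0, d_src3, 0.5, d_dst, cv::GEMM_2_T);

        cv::Mat dst;
        d_dst.download(dst);
        return 0;
    }

As the asserts in the implementation below show, all operands must share one of the supported types: CV_32FC1, CV_32FC2, CV_64FC1 or CV_64FC2.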
diff --git a/modules/gpu/perf/perf_arithm.cpp b/modules/gpu/perf/perf_arithm.cpp
index 75a2bf8d7..8e34023bb 100644
--- a/modules/gpu/perf/perf_arithm.cpp
+++ b/modules/gpu/perf/perf_arithm.cpp
@@ -747,3 +747,34 @@ PERF_TEST_P(DevInfo_Size_MatType_FlipCode, reduce, testing::Combine(testing::Val
 
     SANITY_CHECK(dst_host);
 }
+
+PERF_TEST_P(DevInfo_Size, gemm, testing::Combine(testing::ValuesIn(devices()),
+                                                 testing::Values(cv::Size(512, 512), cv::Size(1024, 1024), cv::Size(2048, 2048), cv::Size(4096, 4096))))
+{
+    DeviceInfo devInfo = std::tr1::get<0>(GetParam());
+    Size size = std::tr1::get<1>(GetParam());
+
+    setDevice(devInfo.deviceID());
+
+    Mat src1_host(size, CV_32FC1);
+    Mat src2_host(size, CV_32FC1);
+    Mat src3_host(size, CV_32FC1);
+
+    declare.in(src1_host, src2_host, src3_host, WARMUP_RNG);
+
+    GpuMat src1(src1_host);
+    GpuMat src2(src2_host);
+    GpuMat src3(src3_host);
+    GpuMat dst(size, CV_32FC1);
+
+    declare.time(5.0);
+
+    SIMPLE_TEST_CYCLE()
+    {
+        gemm(src1, src2, 1.0, src3, 1.0, dst);
+    }
+
+    Mat dst_host = dst;
+
+    SANITY_CHECK(dst_host);
+}
diff --git a/modules/gpu/src/arithm.cpp b/modules/gpu/src/arithm.cpp
index ec938fea5..fabb3dfbc 100644
--- a/modules/gpu/src/arithm.cpp
+++ b/modules/gpu/src/arithm.cpp
@@ -48,6 +48,7 @@ using namespace std;
 
 #if !defined (HAVE_CUDA)
 
+void cv::gpu::gemm(const GpuMat&, const GpuMat&, double, const GpuMat&, double, GpuMat&, int, Stream&) { throw_nogpu(); }
 void cv::gpu::transpose(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::flip(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
 void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&, Stream&) { throw_nogpu(); }
@@ -63,6 +64,133 @@ void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool,
 
 #else /* !defined (HAVE_CUDA) */
 
+////////////////////////////////////////////////////////////////////////
+// gemm
+
+void cv::gpu::gemm(const GpuMat& src1, const GpuMat& src2, double alpha, const GpuMat& src3, double beta, GpuMat& dst, int flags, Stream& stream)
+{
+#ifndef HAVE_CUBLAS
+
+    OPENCV_GPU_UNUSED(src1);
+    OPENCV_GPU_UNUSED(src2);
+    OPENCV_GPU_UNUSED(alpha);
+    OPENCV_GPU_UNUSED(src3);
+    OPENCV_GPU_UNUSED(beta);
+    OPENCV_GPU_UNUSED(dst);
+    OPENCV_GPU_UNUSED(flags);
+    OPENCV_GPU_UNUSED(stream);
+
+    throw_nogpu();
+
+#else
+
+    // CUBLAS works with column-major matrices
+
+    CV_Assert(src1.type() == CV_32FC1 || src1.type() == CV_32FC2 || src1.type() == CV_64FC1 || src1.type() == CV_64FC2);
+    CV_Assert(src2.type() == src1.type() && (src3.empty() || src3.type() == src1.type()));
+
+    bool tr1 = flags & GEMM_1_T;
+    bool tr2 = flags & GEMM_2_T;
+    bool tr3 = flags & GEMM_3_T;
+
+    Size src1Size = tr1 ? Size(src1.rows, src1.cols) : src1.size();
+    Size src2Size = tr2 ? Size(src2.rows, src2.cols) : src2.size();
+    Size src3Size = tr3 ? Size(src3.rows, src3.cols) : src3.size();
+    Size dstSize(src2Size.width, src1Size.height);
+
+    CV_Assert(src1Size.width == src2Size.height);
+    CV_Assert(src3.empty() || src3Size == dstSize);
+
+    dst.create(dstSize, src1.type());
+
+    if (beta != 0)
+    {
+        if (src3.empty())
+        {
+            if (stream)
+                stream.enqueueMemSet(dst, Scalar::all(0));
+            else
+                dst.setTo(Scalar::all(0));
+        }
+        else
+        {
+            if (tr3)
+            {
+                transpose(src3, dst, stream);
+            }
+            else
+            {
+                if (stream)
+                    stream.enqueueCopy(src3, dst);
+                else
+                    src3.copyTo(dst);
+            }
+        }
+    }
+
+    cublasHandle_t handle;
+    cublasSafeCall( cublasCreate_v2(&handle) );
+
+    cublasSafeCall( cublasSetStream_v2(handle, StreamAccessor::getStream(stream)) );
+
+    cublasSafeCall( cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST) );
+
+    const float alphaf = static_cast<float>(alpha);
+    const float betaf = static_cast<float>(beta);
+
+    const cuComplex alphacf = make_cuComplex(alphaf, 0);
+    const cuComplex betacf = make_cuComplex(betaf, 0);
+
+    const cuDoubleComplex alphac = make_cuDoubleComplex(alpha, 0);
+    const cuDoubleComplex betac = make_cuDoubleComplex(beta, 0);
+
+    cublasOperation_t transa = tr2 ? CUBLAS_OP_T : CUBLAS_OP_N;
+    cublasOperation_t transb = tr1 ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+    switch (src1.type())
+    {
+    case CV_32FC1:
+        cublasSafeCall( cublasSgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
+            &alphaf,
+            src2.ptr<float>(), static_cast<int>(src2.step / sizeof(float)),
+            src1.ptr<float>(), static_cast<int>(src1.step / sizeof(float)),
+            &betaf,
+            dst.ptr<float>(), static_cast<int>(dst.step / sizeof(float))) );
+        break;
+
+    case CV_64FC1:
+        cublasSafeCall( cublasDgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
+            &alpha,
+            src2.ptr<double>(), static_cast<int>(src2.step / sizeof(double)),
+            src1.ptr<double>(), static_cast<int>(src1.step / sizeof(double)),
+            &beta,
+            dst.ptr<double>(), static_cast<int>(dst.step / sizeof(double))) );
+        break;
+
+    case CV_32FC2:
+        cublasSafeCall( cublasCgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
+            &alphacf,
+            src2.ptr<cuComplex>(), static_cast<int>(src2.step / sizeof(cuComplex)),
+            src1.ptr<cuComplex>(), static_cast<int>(src1.step / sizeof(cuComplex)),
+            &betacf,
+            dst.ptr<cuComplex>(), static_cast<int>(dst.step / sizeof(cuComplex))) );
+        break;
+
+    case CV_64FC2:
+        cublasSafeCall( cublasZgemm_v2(handle, transa, transb, tr2 ? src2.rows : src2.cols, tr1 ? src1.cols : src1.rows, tr2 ? src2.cols : src2.rows,
+            &alphac,
+            src2.ptr<cuDoubleComplex>(), static_cast<int>(src2.step / sizeof(cuDoubleComplex)),
+            src1.ptr<cuDoubleComplex>(), static_cast<int>(src1.step / sizeof(cuDoubleComplex)),
+            &betac,
+            dst.ptr<cuDoubleComplex>(), static_cast<int>(dst.step / sizeof(cuDoubleComplex))) );
+        break;
+    }
+
+    cublasSafeCall( cublasDestroy_v2(handle) );
+
+#endif
+}
+
 ////////////////////////////////////////////////////////////////////////
 // transpose
 
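Note on the operand order above: CUBLAS assumes column-major storage while GpuMat is row-major, and a row-major matrix reinterpreted as column-major is exactly its transpose. The wrapper therefore asks CUBLAS for op(src2)^T * op(src1)^T (src2 is passed as A, src1 as B, and transa/transb come from tr2/tr1), which equals (op(src1) * op(src2))^T and so lands in dst already laid out in row-major order. A small CPU-side sketch of the identity this relies on, (A*B)^T == B^T * A^T; this is a hypothetical illustration, not part of the patch:

    #include <opencv2/core/core.hpp>
    #include <iostream>

    int main()
    {
        cv::Mat A(3, 4, CV_32FC1), B(4, 2, CV_32FC1);
        cv::randu(A, -1.0, 1.0);
        cv::randu(B, -1.0, 1.0);

        cv::Mat lhs = (A * B).t();    // the row-major product, viewed transposed (column-major)
        cv::Mat rhs = B.t() * A.t();  // the product actually requested from CUBLAS (operands swapped)

        std::cout << cv::norm(lhs - rhs, cv::NORM_INF) << std::endl;  // ~0 up to rounding
        return 0;
    }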
diff --git a/modules/gpu/src/element_operations.cpp b/modules/gpu/src/element_operations.cpp
index ae2f41a90..51acac7d0 100644
--- a/modules/gpu/src/element_operations.cpp
+++ b/modules/gpu/src/element_operations.cpp
@@ -434,7 +434,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
         {0/*multiply_gpu<double, uchar>*/, 0/*multiply_gpu<double, schar>*/, 0/*multiply_gpu<double, ushort>*/, 0/*multiply_gpu<double, short>*/, 0/*multiply_gpu<double, int>*/, 0/*multiply_gpu<double, float>*/, multiply_gpu<double, double>}
     };
 
-    CV_Assert(src.channels() == 1);
+    //CV_Assert(src.channels() == 1);
 
     if (dtype < 0)
         dtype = src.depth();
@@ -463,7 +463,7 @@ void cv::gpu::multiply(const GpuMat& src, const Scalar& sc, GpuMat& dst, double
     const func_t func = funcs[src.depth()][dst.depth()];
     CV_Assert(func != 0);
 
-    func(src, sc.val[0], dst, scale, stream);
+    func(src.reshape(1), sc.val[0], dst.reshape(1), scale, stream);
 }
 
 ////////////////////////////////////////////////////////////////////////
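Note on the multiply change: dropping the single-channel assert works because reshape(1) reinterprets an n-channel matrix as a single-channel one with n times as many columns, without copying data, so the existing single-channel kernel ends up scaling every channel by the same sc.val[0]. A CPU-side illustration of that reshape behaviour, again hypothetical and not part of the patch:

    #include <opencv2/core/core.hpp>
    #include <cassert>

    int main()
    {
        cv::Mat m(4, 5, CV_32FC2, cv::Scalar(1, 2));
        cv::Mat flat = m.reshape(1);   // 4 x 10, CV_32FC1, same underlying data
        assert(flat.rows == 4 && flat.cols == 10 && flat.type() == CV_32FC1);

        flat *= 3.0;                   // scales both channels of every pixel of m
        assert(m.at<cv::Vec2f>(0, 0) == cv::Vec2f(3.0f, 6.0f));
        return 0;
    }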
diff --git a/modules/gpu/test/test_arithm.cpp b/modules/gpu/test/test_arithm.cpp
index ed9b6cfc3..f3a2039e0 100644
--- a/modules/gpu/test/test_arithm.cpp
+++ b/modules/gpu/test/test_arithm.cpp
@@ -1860,4 +1860,68 @@ INSTANTIATE_TEST_CASE_P(Arithm, Reduce, testing::Combine(
                         testing::Values(0, 1),
                         testing::Values((int)CV_REDUCE_SUM, (int)CV_REDUCE_AVG, (int)CV_REDUCE_MAX, (int)CV_REDUCE_MIN)));
 
+//////////////////////////////////////////////////////////////////////////////
+// gemm
+
+struct GEMM : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int, int> >
+{
+    cv::gpu::DeviceInfo devInfo;
+    int type;
+    int flags;
+
+    int size;
+    cv::Mat src1;
+    cv::Mat src2;
+    cv::Mat src3;
+    double alpha;
+    double beta;
+
+    cv::Mat dst_gold;
+
+    virtual void SetUp()
+    {
+        devInfo = std::tr1::get<0>(GetParam());
+        type = std::tr1::get<1>(GetParam());
+        flags = std::tr1::get<2>(GetParam());
+
+        cv::gpu::setDevice(devInfo.deviceID());
+
+        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
+
+        size = rng.uniform(100, 500);
+
+        src1 = cvtest::randomMat(rng, cv::Size(size, size), type, -10.0, 10.0, false);
+        src2 = cvtest::randomMat(rng, cv::Size(size, size), type, -10.0, 10.0, false);
+        src3 = cvtest::randomMat(rng, cv::Size(size, size), type, -10.0, 10.0, false);
+        alpha = rng.uniform(-10.0, 10.0);
+        beta = rng.uniform(-10.0, 10.0);
+
+        cv::gemm(src1, src2, alpha, src3, beta, dst_gold, flags);
+    }
+};
+
+TEST_P(GEMM, Accuracy)
+{
+    PRINT_PARAM(devInfo);
+    PRINT_TYPE(type);
+    PRINT_PARAM(flags);
+
+    cv::Mat dst;
+
+    ASSERT_NO_THROW(
+        cv::gpu::GpuMat dev_dst;
+
+        cv::gpu::gemm(cv::gpu::GpuMat(src1), cv::gpu::GpuMat(src2), alpha, cv::gpu::GpuMat(src3), beta, dev_dst, flags);
+
+        dev_dst.download(dst);
+    );
+
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-1);
+}
+
+INSTANTIATE_TEST_CASE_P(Arithm, GEMM, testing::Combine(
+    testing::ValuesIn(devices()),
+    testing::Values(CV_32FC1, CV_32FC2),
+    testing::Values(0, (int)cv::GEMM_1_T, (int)cv::GEMM_2_T, (int)cv::GEMM_3_T)));
+
 #endif // HAVE_CUDA
diff --git a/samples/gpu/performance/tests.cpp b/samples/gpu/performance/tests.cpp
index 132b76af4..f4a965bb1 100644
--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -1471,3 +1471,33 @@ TEST(reduce)
         GPU_OFF;
     }
 }
+
+
+TEST(gemm)
+{
+    Mat src1, src2, src3, dst;
+    gpu::GpuMat d_src1, d_src2, d_src3, d_dst;
+
+    for (int size = 512; size <= 2048; size *= 2)
+    {
+        SUBTEST << "size " << size << ", 32FC1";
+
+        gen(src1, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        gen(src2, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        gen(src3, size, size, CV_32FC1, Scalar::all(-10), Scalar::all(10));
+        dst.create(src1.size(), src1.type());
+
+        CPU_ON;
+        gemm(src1, src2, 1.0, src3, 1.0, dst);
+        CPU_OFF;
+
+        d_src1 = src1;
+        d_src2 = src2;
+        d_src3 = src3;
+        d_dst.create(d_src1.size(), d_src1.type());
+
+        GPU_ON;
+        gpu::gemm(d_src1, d_src2, 1.0, d_src3, 1.0, d_dst);
+        GPU_OFF;
+    }
+}