diff --git a/modules/gpu/perf/perf_main.cpp b/modules/gpu/perf/perf_main.cpp
index a7ac1ccce..53a19ca41 100644
--- a/modules/gpu/perf/perf_main.cpp
+++ b/modules/gpu/perf/perf_main.cpp
@@ -44,4 +44,11 @@
 
 using namespace perf;
 
-CV_PERF_TEST_MAIN(gpu, printCudaInfo())
+static const char * impls[] = { // variants selectable with --perf_impl; item [0] is the default
+#ifdef HAVE_CUDA
+    "cuda",
+#endif
+    "plain" // listed unconditionally; --perf_run_cpu is equivalent to selecting it
+};
+
+CV_PERF_TEST_MAIN_WITH_IMPLS(gpu, impls, printCudaInfo())
diff --git a/modules/nonfree/perf/perf_main.cpp b/modules/nonfree/perf/perf_main.cpp
index de1242149..d5f4a1a51 100644
--- a/modules/nonfree/perf/perf_main.cpp
+++ b/modules/nonfree/perf/perf_main.cpp
@@ -1,4 +1,11 @@
 #include "perf_precomp.hpp"
 #include "opencv2/ts/gpu_perf.hpp"
 
-CV_PERF_TEST_MAIN(nonfree, perf::printCudaInfo())
+static const char * impls[] = { // variants selectable with --perf_impl; item [0] is the default
+#ifdef HAVE_CUDA
+    "cuda",
+#endif
+    "plain" // listed unconditionally; --perf_run_cpu is equivalent to selecting it
+};
+
+CV_PERF_TEST_MAIN_WITH_IMPLS(nonfree, impls, perf::printCudaInfo())
diff --git a/modules/superres/perf/perf_main.cpp b/modules/superres/perf/perf_main.cpp
index adc69e6e8..0a8ab5dea 100644
--- a/modules/superres/perf/perf_main.cpp
+++ b/modules/superres/perf/perf_main.cpp
@@ -44,4 +44,11 @@
 
 using namespace perf;
 
-CV_PERF_TEST_MAIN(superres, printCudaInfo())
+static const char * impls[] = { // variants selectable with --perf_impl; item [0] is the default
+#ifdef HAVE_CUDA
+    "cuda",
+#endif
+    "plain" // listed unconditionally; --perf_run_cpu is equivalent to selecting it
+};
+
+CV_PERF_TEST_MAIN_WITH_IMPLS(superres, impls, printCudaInfo())
diff --git a/modules/ts/include/opencv2/ts/ts_perf.hpp b/modules/ts/include/opencv2/ts/ts_perf.hpp
index fe5765515..1e68cd49b 100644
--- a/modules/ts/include/opencv2/ts/ts_perf.hpp
+++ b/modules/ts/include/opencv2/ts/ts_perf.hpp
@@ -210,18 +210,13 @@ private:
 #define SANITY_CHECK_KEYPOINTS(array, ...) ::perf::Regression::addKeypoints(this, #array, array , ## __VA_ARGS__)
 #define SANITY_CHECK_MATCHES(array, ...) ::perf::Regression::addMatches(this, #array, array , ## __VA_ARGS__)
 
-#ifdef HAVE_CUDA
 class CV_EXPORTS GpuPerf
 {
 public:
   static bool targetDevice();
 };
 
-# define PERF_RUN_GPU()  ::perf::GpuPerf::targetDevice()
-#else
-# define PERF_RUN_GPU()  false
-#endif
-
+#define PERF_RUN_GPU()  ::perf::GpuPerf::targetDevice() /* true when the selected implementation variant is "cuda" */
 
 /*****************************************************************************************\
 *                            Container for performance metrics                            *
@@ -263,7 +258,11 @@ public:
     TestBase();
 
     static void Init(int argc, const char* const argv[]);
+    static void Init(const std::vector<std::string> & availableImpls,
+                     int argc, const char* const argv[]); // availableImpls[0] is the default for --perf_impl
+    static void RecordRunParameters(); // records the selected implementation and thread-count parameter via gtest RecordProperty
     static std::string getDataPath(const std::string& relativePath);
+    static std::string getSelectedImpl(); // implementation variant chosen on the command line
 
 protected:
     virtual void PerfTestBody() = 0;
@@ -477,15 +476,29 @@ CV_EXPORTS void PrintTo(const Size& sz, ::std::ostream* os);
     void fixture##_##name::PerfTestBody()
 
 
-#define CV_PERF_TEST_MAIN(testsuitname, ...) \
-int main(int argc, char **argv)\
-{\
+#define CV_PERF_TEST_MAIN_INTERNALS(modulename, impls, ...) /* body of main() shared by the two macros below */ \
     while (++argc >= (--argc,-1)) {__VA_ARGS__; break;} /*this ugly construction is needed for VS 2005*/\
-    ::perf::Regression::Init(#testsuitname);\
-    ::perf::TestBase::Init(argc, argv);\
+    ::perf::Regression::Init(#modulename);\
+    ::perf::TestBase::Init(std::vector<std::string>(impls, impls + sizeof impls / sizeof *impls),\
+                           argc, argv);\
     ::testing::InitGoogleTest(&argc, argv);\
     cvtest::printVersionInfo();\
-    return RUN_ALL_TESTS();\
+    ::testing::Test::RecordProperty("cv_module_name", #modulename);\
+    ::perf::TestBase::RecordRunParameters();\
+    return RUN_ALL_TESTS();
+
+// impls must be an array, not a pointer (its length is taken with sizeof); item [0] is the default and "plain" should always be one of the implementations
+#define CV_PERF_TEST_MAIN_WITH_IMPLS(modulename, impls, ...) \
+int main(int argc, char **argv)\
+{\
+    CV_PERF_TEST_MAIN_INTERNALS(modulename, impls, __VA_ARGS__)\
+}
+
+#define CV_PERF_TEST_MAIN(modulename, ...) /* variant offering only the "plain" implementation */ \
+int main(int argc, char **argv)\
+{\
+    const char * plain_only[] = { "plain" };\
+    CV_PERF_TEST_MAIN_INTERNALS(modulename, plain_only, __VA_ARGS__)\
 }
 
 #define TEST_CYCLE_N(n) for(declare.iterations(n); startTimer(), next(); stopTimer())
diff --git a/modules/ts/src/ts_perf.cpp b/modules/ts/src/ts_perf.cpp
index c375e7c38..c2c1ee6bd 100644
--- a/modules/ts/src/ts_perf.cpp
+++ b/modules/ts/src/ts_perf.cpp
@@ -14,30 +14,10 @@ int64 TestBase::timeLimitDefault = 0;
 unsigned int TestBase::iterationsLimitDefault = (unsigned int)(-1);
 int64 TestBase::_timeadjustment = 0;
 
-const std::string command_line_keys =
-    "{   |perf_max_outliers   |8        |percent of allowed outliers}"
-    "{   |perf_min_samples    |10       |minimal required numer of samples}"
-    "{   |perf_force_samples  |100      |force set maximum number of samples for all tests}"
-    "{   |perf_seed           |809564   |seed for random numbers generator}"
-    "{   |perf_threads        |-1       |the number of worker threads, if parallel execution is enabled}"
-    "{   |perf_write_sanity   |false    |create new records for sanity checks}"
-    "{   |perf_verify_sanity  |false    |fail tests having no regression data for sanity checks}"
-#ifdef ANDROID
-    "{   |perf_time_limit     |6.0      |default time limit for a single test (in seconds)}"
-    "{   |perf_affinity_mask  |0        |set affinity mask for the main thread}"
-    "{   |perf_log_power_checkpoints  | |additional xml logging for power measurement}"
-#else
-    "{   |perf_time_limit     |3.0      |default time limit for a single test (in seconds)}"
-#endif
-    "{   |perf_max_deviation  |1.0      |}"
-    "{h  |help                |false    |print help info}"
-#ifdef HAVE_CUDA
-    "{   |perf_run_cpu        |false    |run GPU performance tests for analogical CPU functions}"
-    "{   |perf_cuda_device    |0        |run GPU test suite onto specific CUDA capable device}"
-    "{   |perf_cuda_info_only |false    |print an information about system and an available CUDA devices and then exit.}"
-#endif
-;
+// Implementation variants accepted by --perf_impl (assigned in TestBase::Init); item [0] is the default.
+static std::vector<std::string> available_impls;
 
+static std::string  param_impl; // selected implementation variant, returned by TestBase::getSelectedImpl()
 static double       param_max_outliers;
 static double       param_max_deviation;
 static unsigned int param_min_samples;
@@ -48,7 +28,6 @@ static int          param_threads;
 static bool         param_write_sanity;
 static bool         param_verify_sanity;
 #ifdef HAVE_CUDA
-static bool         param_run_cpu;
 static int          param_cuda_device;
 #endif
 
@@ -577,11 +556,12 @@ Regression& Regression::operator() (const std::string& name, cv::InputArray arra
 
     std::string nodename = getCurrentTestNodeName();
 
-#ifdef HAVE_CUDA
-    static const std::string prefix = (param_run_cpu)? "CPU_" : "GPU_";
+    // This is a hack for compatibility and it should eventually get removed.
+    // gpu's tests don't even have CPU sanity data anymore.
     if(suiteName == "gpu")
-        nodename = prefix + nodename;
-#endif
+    {
+        nodename = (PERF_RUN_GPU() ? "GPU_" : "CPU_") + nodename;
+    }
 
     cv::FileNode n = rootIn[nodename];
     if(n.isNone())
@@ -646,6 +626,43 @@ performance_metrics::performance_metrics()
 
 void TestBase::Init(int argc, const char* const argv[])
 {
+    // Callers of this overload get "plain" as the only available implementation variant.
+    const std::vector<std::string> plain_only(1, "plain");
+    TestBase::Init(plain_only, argc, argv);
+}
+
+void TestBase::Init(const std::vector<std::string> & availableImpls, // variants accepted by --perf_impl; item [0] is the default
+                 int argc, const char* const argv[])
+{
+    available_impls = availableImpls.empty() ? std::vector<std::string>(1, "plain") : availableImpls;
+    // An empty list falls back to "plain" only, so available_impls[0] below never reads past the end.
+    const std::string command_line_keys =
+        "{   |perf_max_outliers           |8        |percent of allowed outliers}"
+        "{   |perf_min_samples            |10       |minimal required number of samples}"
+        "{   |perf_force_samples          |100      |force set maximum number of samples for all tests}"
+        "{   |perf_seed                   |809564   |seed for random numbers generator}"
+        "{   |perf_threads                |-1       |the number of worker threads, if parallel execution is enabled}"
+        "{   |perf_write_sanity           |false    |create new records for sanity checks}"
+        "{   |perf_verify_sanity          |false    |fail tests having no regression data for sanity checks}"
+        "{   |perf_impl                   |" + available_impls[0] +
+                                                   "|the implementation variant of functions under test}"
+        "{   |perf_list_impls             |false    |list available implementation variants and exit}"
+        "{   |perf_run_cpu                |false    |deprecated, equivalent to --perf_impl=plain}"
+#ifdef ANDROID
+        "{   |perf_time_limit             |6.0      |default time limit for a single test (in seconds)}"
+        "{   |perf_affinity_mask          |0        |set affinity mask for the main thread}"
+        "{   |perf_log_power_checkpoints  |         |additional xml logging for power measurement}"
+#else
+        "{   |perf_time_limit             |3.0      |default time limit for a single test (in seconds)}"
+#endif
+        "{   |perf_max_deviation          |1.0      |}"
+        "{h  |help                        |false    |print help info}"
+#ifdef HAVE_CUDA
+        "{   |perf_cuda_device            |0        |run GPU test suite onto specific CUDA capable device}"
+        "{   |perf_cuda_info_only         |false    |print an information about system and an available CUDA devices and then exit.}"
+#endif
+    ;
+
     cv::CommandLineParser args(argc, argv, command_line_keys.c_str());
     if (args.get<bool>("help"))
     {
@@ -656,6 +673,7 @@ void TestBase::Init(int argc, const char* const argv[])
 
     ::testing::AddGlobalTestEnvironment(new PerfEnvironment);
 
+    param_impl          = args.get<bool>("perf_run_cpu") ? "plain" : args.get<std::string>("perf_impl"); // deprecated --perf_run_cpu takes precedence over --perf_impl
     param_max_outliers  = std::min(100., std::max(0., args.get<double>("perf_max_outliers")));
     param_min_samples   = std::max(1u, args.get<unsigned int>("perf_min_samples"));
     param_max_deviation = std::max(0., args.get<double>("perf_max_deviation"));
@@ -670,19 +688,41 @@ void TestBase::Init(int argc, const char* const argv[])
     log_power_checkpoints = args.get<bool>("perf_log_power_checkpoints");
 #endif
 
+    bool param_list_impls = args.get<bool>("perf_list_impls");
+
+    if (param_list_impls) // print the variants on one line and exit successfully
+    {
+        fputs("Available implementation variants:", stdout);
+        for (size_t i = 0; i < available_impls.size(); ++i) {
+            putchar(' ');
+            fputs(available_impls[i].c_str(), stdout);
+        }
+        putchar('\n');
+        exit(0);
+    }
+
+    if (std::find(available_impls.begin(), available_impls.end(), param_impl) == available_impls.end()) // reject a variant not in the list
+    {
+        printf("No such implementation: %s\n", param_impl.c_str());
+        exit(1);
+    }
+
 #ifdef HAVE_CUDA
 
     bool printOnly        = args.get<bool>("perf_cuda_info_only");
 
     if (printOnly)
         exit(0);
+#endif
+
+    if (available_impls.size() > 1) // announce the variant only when there is more than one to choose from
+        printf("[----------]\n[   INFO   ] \tImplementation variant: %s.\n[----------]\n", param_impl.c_str()), fflush(stdout);
+
+#ifdef HAVE_CUDA // NOTE(review): the clamp below admits an index equal to getCudaEnabledDeviceCount() - confirm this is intended
 
-    param_run_cpu         = args.get<bool>("perf_run_cpu");
     param_cuda_device      = std::max(0, std::min(cv::gpu::getCudaEnabledDeviceCount(), args.get<int>("perf_cuda_device")));
 
-    if (param_run_cpu)
-        printf("[----------]\n[ GPU INFO ] \tRun test suite on CPU.\n[----------]\n"), fflush(stdout);
-    else
+    if (param_impl == "cuda") // the CUDA device is checked only when the "cuda" variant is selected
     {
         cv::gpu::DeviceInfo info(param_cuda_device);
         if (!info.isCompatible())
@@ -708,6 +748,18 @@ void TestBase::Init(int argc, const char* const argv[])
     _timeadjustment = _calibrate();
 }
 
+void TestBase::RecordRunParameters() // records the selected implementation variant and param_threads as gtest properties
+{
+    ::testing::Test::RecordProperty("cv_implementation", param_impl);
+    ::testing::Test::RecordProperty("cv_num_threads", param_threads);
+}
+
+std::string TestBase::getSelectedImpl() // returns a copy of the variant chosen in Init() ("plain" when --perf_run_cpu is set)
+{
+    return param_impl;
+}
+
+
 int64 TestBase::_calibrate()
 {
     class _helper : public ::perf::TestBase
@@ -1325,12 +1377,10 @@ void perf::sort(std::vector<cv::KeyPoint>& pts, cv::InputOutputArray descriptors
 /*****************************************************************************************\
 *                                  ::perf::GpuPerf
 \*****************************************************************************************/
-#ifdef HAVE_CUDA
 bool perf::GpuPerf::targetDevice()
 {
-    return !param_run_cpu;
+    return param_impl == "cuda"; // true only when the "cuda" implementation variant is selected
 }
-#endif
 
 /*****************************************************************************************\
 *                                  ::perf::PrintTo