Merge pull request #2045 from SpecLad:merge-2.4

Roman Donchenko
2013-12-25 16:18:00 +04:00
committed by OpenCV Buildbot
41 changed files with 3627 additions and 952 deletions

View File

@@ -903,7 +903,7 @@ So, the function chooses an operation mode depending on the flags and size of th
* When ``DFT_COMPLEX_OUTPUT`` is set, the output is a complex matrix of the same size as input.
* When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DCT_ROWS`` flag), each row of the output matrix looks like the first row of the matrix above.
* When ``DFT_COMPLEX_OUTPUT`` is not set, the output is a real matrix of the same size as input. In case of 2D transform, it uses the packed format as shown above. In case of a single 1D transform, it looks like the first row of the matrix above. In case of multiple 1D transforms (when using the ``DFT_ROWS`` flag), each row of the output matrix looks like the first row of the matrix above.
* If the input array is complex and either ``DFT_INVERSE`` or ``DFT_REAL_OUTPUT`` are not set, the output is a complex array of the same size as input. The function performs a forward or inverse 1D or 2D transform of the whole input array or each row of the input array independently, depending on the flags ``DFT_INVERSE`` and ``DFT_ROWS``.
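A minimal usage sketch for these flags (illustrative only, not part of the patch; the function and variable names are made up for the example):

    #include <opencv2/core/core.hpp>

    // Forward and inverse transforms of a real single-channel matrix.
    void dftFlagsExample(const cv::Mat& realInput)   // e.g. CV_32FC1
    {
        cv::Mat complexSpec, packedRows, restored;

        // Full complex spectrum: two channels (Re, Im) per element, same size as input.
        cv::dft(realInput, complexSpec, cv::DFT_COMPLEX_OUTPUT);

        // One independent 1D transform per row, packed (CCS) real output.
        cv::dft(realInput, packedRows, cv::DFT_ROWS);

        // Inverse transform back to a real matrix, scaled by 1/N.
        cv::dft(complexSpec, restored, cv::DFT_INVERSE | cv::DFT_REAL_OUTPUT | cv::DFT_SCALE);
    }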

View File

@@ -2577,7 +2577,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags )
DCTFunc dct_func = dct_tbl[(int)inv + (depth == CV_64F)*2];
if( (flags & DFT_ROWS) || src.rows == 1 ||
if( (flags & DCT_ROWS) || src.rows == 1 ||
(src.cols == 1 && (src.isContinuous() && dst.isContinuous())))
{
stage = end_stage = 0;
@@ -2597,7 +2597,7 @@ void cv::dct( InputArray _src0, OutputArray _dst, int flags )
{
len = src.cols;
count = src.rows;
if( len == 1 && !(flags & DFT_ROWS) )
if( len == 1 && !(flags & DCT_ROWS) )
{
len = src.rows;
count = 1;

View File

@@ -2760,39 +2760,24 @@ void cv::transpose( InputArray _src, OutputArray _dst )
}
////////////////////////////////////// completeSymm /////////////////////////////////////////
void cv::completeSymm( InputOutputArray _m, bool LtoR )
{
Mat m = _m.getMat();
CV_Assert( m.dims <= 2 );
size_t step = m.step, esz = m.elemSize();
CV_Assert( m.dims <= 2 && m.rows == m.cols );
int i, j, nrows = m.rows, type = m.type();
int j0 = 0, j1 = nrows;
CV_Assert( m.rows == m.cols );
int rows = m.rows;
int j0 = 0, j1 = rows;
if( type == CV_32FC1 || type == CV_32SC1 )
uchar* data = m.data;
for( int i = 0; i < rows; i++ )
{
int* data = (int*)m.data;
size_t step = m.step/sizeof(data[0]);
for( i = 0; i < nrows; i++ )
{
if( !LtoR ) j1 = i; else j0 = i+1;
for( j = j0; j < j1; j++ )
data[i*step + j] = data[j*step + i];
}
if( !LtoR ) j1 = i; else j0 = i+1;
for( int j = j0; j < j1; j++ )
memcpy(data + (i*step + j*esz), data + (j*step + i*esz), esz);
}
else if( type == CV_64FC1 )
{
double* data = (double*)m.data;
size_t step = m.step/sizeof(data[0]);
for( i = 0; i < nrows; i++ )
{
if( !LtoR ) j1 = i; else j0 = i+1;
for( j = j0; j < j1; j++ )
data[i*step + j] = data[j*step + i];
}
}
else
CV_Error( CV_StsUnsupportedFormat, "" );
}

View File

@@ -222,6 +222,12 @@ elseif(HAVE_QTKIT)
list(APPEND HIGHGUI_LIBRARIES "-framework QTKit" "-framework QuartzCore" "-framework AppKit")
endif()
if(HAVE_INTELPERC)
list(APPEND highgui_srcs src/cap_intelperc.cpp)
ocv_include_directories(${INTELPERC_INCLUDE_DIR})
list(APPEND HIGHGUI_LIBRARIES ${INTELPERC_LIBRARIES})
endif(HAVE_INTELPERC)
if(IOS)
add_definitions(-DHAVE_IOS=1)
list(APPEND highgui_srcs src/ios_conversions.mm src/cap_ios_abstract_camera.mm src/cap_ios_photo_camera.mm src/cap_ios_video_camera.mm)

View File

@@ -271,7 +271,8 @@ enum { CAP_ANY = 0, // autodetect
CAP_XIAPI = 1100, // XIMEA Camera API
CAP_AVFOUNDATION = 1200, // AVFoundation framework for iOS (OS X Lion will have the same API)
CAP_GIGANETIX = 1300, // Smartek Giganetix GigEVisionSDK
CAP_MSMF = 1400 // Microsoft Media Foundation (via videoInput)
CAP_MSMF = 1400, // Microsoft Media Foundation (via videoInput)
CAP_INTELPERC = 1500 // Intel Perceptual Computing SDK
};
// generic properties (based on DC1394 properties)
@@ -496,6 +497,26 @@ enum { CAP_PROP_GIGA_FRAME_OFFSET_X = 10001,
CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006
};
enum { CAP_PROP_INTELPERC_PROFILE_COUNT = 11001,
CAP_PROP_INTELPERC_PROFILE_IDX = 11002,
CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE = 11003,
CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE = 11004,
CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD = 11005,
CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ = 11006,
CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT = 11007
};
// Intel PerC streams
enum { CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29,
CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28,
CAP_INTELPERC_GENERATORS_MASK = CAP_INTELPERC_DEPTH_GENERATOR + CAP_INTELPERC_IMAGE_GENERATOR
};
enum { CAP_INTELPERC_DEPTH_MAP = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth.
CAP_INTELPERC_UVDEPTH_MAP = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates.
CAP_INTELPERC_IR_MAP = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam.
CAP_INTELPERC_IMAGE = 3
};
class CV_EXPORTS_W VideoCapture
{
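To illustrate how these constants fit together, here is a hedged sketch (not part of the diff): it assumes OpenCV was built with HAVE_INTELPERC and a PerC-capable camera is attached; the function name and variables are illustrative.

    #include <opencv2/highgui/highgui.hpp>

    bool readPercFrames(cv::Mat& depth, cv::Mat& color)
    {
        cv::VideoCapture cap(cv::CAP_INTELPERC);          // open the Intel PerC backend
        if (!cap.isOpened())
            return false;

        // Address a depth-generator property through the generators mask.
        double confidence = cap.get(cv::CAP_INTELPERC_DEPTH_GENERATOR +
                                    cv::CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD);
        (void)confidence;

        if (!cap.grab())                                  // grabs the depth and color streams
            return false;
        cap.retrieve(depth, cv::CAP_INTELPERC_DEPTH_MAP); // 16-bit depth map
        cap.retrieve(color, cv::CAP_INTELPERC_IMAGE);     // BGR image from the color stream
        return true;
    }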

View File

@@ -313,7 +313,9 @@ enum
CV_CAP_AVFOUNDATION = 1200, // AVFoundation framework for iOS (OS X Lion will have the same API)
CV_CAP_GIGANETIX = 1300 // Smartek Giganetix GigEVisionSDK
CV_CAP_GIGANETIX = 1300, // Smartek Giganetix GigEVisionSDK
CV_CAP_INTELPERC = 1500 // Intel Perceptual Computing SDK
};
/* start capturing frames from camera: index = camera_index + domain_offset (CV_CAP_*) */
@@ -459,16 +461,29 @@ enum
CV_CAP_PROP_IOS_DEVICE_EXPOSURE = 9002,
CV_CAP_PROP_IOS_DEVICE_FLASH = 9003,
CV_CAP_PROP_IOS_DEVICE_WHITEBALANCE = 9004,
CV_CAP_PROP_IOS_DEVICE_TORCH = 9005
CV_CAP_PROP_IOS_DEVICE_TORCH = 9005,
// Properties of cameras available through Smartek Giganetix Ethernet Vision interface
/* --- Vladimir Litvinenko (litvinenko.vladimir@gmail.com) --- */
,CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001,
CV_CAP_PROP_GIGA_FRAME_OFFSET_X = 10001,
CV_CAP_PROP_GIGA_FRAME_OFFSET_Y = 10002,
CV_CAP_PROP_GIGA_FRAME_WIDTH_MAX = 10003,
CV_CAP_PROP_GIGA_FRAME_HEIGH_MAX = 10004,
CV_CAP_PROP_GIGA_FRAME_SENS_WIDTH = 10005,
CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006
CV_CAP_PROP_GIGA_FRAME_SENS_HEIGH = 10006,
CV_CAP_PROP_INTELPERC_PROFILE_COUNT = 11001,
CV_CAP_PROP_INTELPERC_PROFILE_IDX = 11002,
CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE = 11003,
CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE = 11004,
CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD = 11005,
CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ = 11006,
CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT = 11007,
// Intel PerC streams
CV_CAP_INTELPERC_DEPTH_GENERATOR = 1 << 29,
CV_CAP_INTELPERC_IMAGE_GENERATOR = 1 << 28,
CV_CAP_INTELPERC_GENERATORS_MASK = CV_CAP_INTELPERC_DEPTH_GENERATOR + CV_CAP_INTELPERC_IMAGE_GENERATOR
};
enum
@@ -549,6 +564,14 @@ enum
CV_CAP_ANDROID_ANTIBANDING_OFF
};
enum
{
CV_CAP_INTELPERC_DEPTH_MAP = 0, // Each pixel is a 16-bit integer. The value indicates the distance from an object to the camera's XY plane or the Cartesian depth.
CV_CAP_INTELPERC_UVDEPTH_MAP = 1, // Each pixel contains two 32-bit floating point values in the range of 0-1, representing the mapping of depth coordinates to the color coordinates.
CV_CAP_INTELPERC_IR_MAP = 2, // Each pixel is a 16-bit integer. The value indicates the intensity of the reflected laser beam.
CV_CAP_INTELPERC_IMAGE = 3
};
/* retrieve or set capture properties */
CVAPI(double) cvGetCaptureProperty( CvCapture* capture, int property_id );
CVAPI(int) cvSetCaptureProperty( CvCapture* capture, int property_id, double value );
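The legacy C API uses the same numeric constants; a minimal sketch (again assuming a build with HAVE_INTELPERC; the function name is illustrative):

    #include <opencv2/highgui/highgui_c.h>

    int grabIrFrame(void)
    {
        CvCapture* cap = cvCreateCameraCapture(CV_CAP_INTELPERC);
        if (!cap)
            return 0;

        int ok = 0;
        if (cvGrabFrame(cap))
        {
            /* the returned IplImage is owned by the capture, do not release it */
            IplImage* ir = cvRetrieveFrame(cap, CV_CAP_INTELPERC_IR_MAP);
            ok = (ir != NULL);
        }
        cvReleaseCapture(&cap);
        return ok;
    }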

View File

@@ -155,6 +155,9 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
#endif
#ifdef HAVE_GIGE_API
CV_CAP_GIGANETIX,
#endif
#ifdef HAVE_INTELPERC
CV_CAP_INTELPERC,
#endif
-1
};
@@ -193,6 +196,7 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
defined(HAVE_AVFOUNDATION) || \
defined(HAVE_ANDROID_NATIVE_CAMERA) || \
defined(HAVE_GIGE_API) || \
defined(HAVE_INTELPERC) || \
(0)
// local variable to memorize the captured device
CvCapture *capture;
@@ -342,6 +346,13 @@ CV_IMPL CvCapture * cvCreateCameraCapture (int index)
break; // CV_CAP_GIGANETIX
#endif
#ifdef HAVE_INTELPERC
case CV_CAP_INTELPERC:
capture = cvCreateCameraCapture_IntelPerC(index);
if (capture)
return capture;
break; // CV_CAP_INTEL_PERC
#endif
}
}

View File

@@ -0,0 +1,714 @@
#include "precomp.hpp"
#ifdef HAVE_INTELPERC
#include "pxcsession.h"
#include "pxcsmartptr.h"
#include "pxccapture.h"
class CvIntelPerCStreamBase
{
protected:
struct FrameInternal
{
IplImage* retrieveFrame()
{
if (m_mat.empty())
return NULL;
m_iplHeader = IplImage(m_mat);
return &m_iplHeader;
}
cv::Mat m_mat;
private:
IplImage m_iplHeader;
};
public:
CvIntelPerCStreamBase()
: m_profileIdx(-1)
, m_frameIdx(0)
, m_timeStampStartNS(0)
{
}
virtual ~CvIntelPerCStreamBase()
{
}
bool isValid()
{
return (m_device.IsValid() && m_stream.IsValid());
}
bool grabFrame()
{
if (!m_stream.IsValid())
return false;
if (-1 == m_profileIdx)
{
if (!setProperty(CV_CAP_PROP_INTELPERC_PROFILE_IDX, 0))
return false;
}
PXCSmartPtr<PXCImage> pxcImage; PXCSmartSP sp;
if (PXC_STATUS_NO_ERROR > m_stream->ReadStreamAsync(&pxcImage, &sp))
return false;
if (PXC_STATUS_NO_ERROR > sp->Synchronize())
return false;
if (0 == m_timeStampStartNS)
m_timeStampStartNS = pxcImage->QueryTimeStamp();
m_timeStamp = (double)((pxcImage->QueryTimeStamp() - m_timeStampStartNS) / 10000);
m_frameIdx++;
return prepareIplImage(pxcImage);
}
int getProfileIDX() const
{
return m_profileIdx;
}
public:
virtual bool initStream(PXCSession *session) = 0;
virtual double getProperty(int propIdx)
{
double ret = 0.0;
switch (propIdx)
{
case CV_CAP_PROP_INTELPERC_PROFILE_COUNT:
ret = (double)m_profiles.size();
break;
case CV_CAP_PROP_FRAME_WIDTH :
if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size()))
ret = (double)m_profiles[m_profileIdx].imageInfo.width;
break;
case CV_CAP_PROP_FRAME_HEIGHT :
if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size()))
ret = (double)m_profiles[m_profileIdx].imageInfo.height;
break;
case CV_CAP_PROP_FPS :
if ((0 <= m_profileIdx) && (m_profileIdx < m_profiles.size()))
{
ret = ((double)m_profiles[m_profileIdx].frameRateMin.numerator / (double)m_profiles[m_profileIdx].frameRateMin.denominator
+ (double)m_profiles[m_profileIdx].frameRateMax.numerator / (double)m_profiles[m_profileIdx].frameRateMax.denominator) / 2.0;
}
break;
case CV_CAP_PROP_POS_FRAMES:
ret = (double)m_frameIdx;
break;
case CV_CAP_PROP_POS_MSEC:
ret = m_timeStamp;
break;
};
return ret;
}
virtual bool setProperty(int propIdx, double propVal)
{
bool isSet = false;
switch (propIdx)
{
case CV_CAP_PROP_INTELPERC_PROFILE_IDX:
{
int propValInt = (int)propVal;
if ((0 <= propValInt) && (propValInt < m_profiles.size()))
{
if (m_profileIdx != propValInt)
{
m_profileIdx = propValInt;
if (m_stream.IsValid())
m_stream->SetProfile(&m_profiles[m_profileIdx]);
m_frameIdx = 0;
m_timeStampStartNS = 0;
}
isSet = true;
}
}
break;
};
return isSet;
}
protected:
PXCSmartPtr<PXCCapture::Device> m_device;
bool initDevice(PXCSession *session)
{
if (NULL == session)
return false;
pxcStatus sts = PXC_STATUS_NO_ERROR;
PXCSession::ImplDesc templat;
memset(&templat,0,sizeof(templat));
templat.group = PXCSession::IMPL_GROUP_SENSOR;
templat.subgroup= PXCSession::IMPL_SUBGROUP_VIDEO_CAPTURE;
for (int modidx = 0; PXC_STATUS_NO_ERROR <= sts; modidx++)
{
PXCSession::ImplDesc desc;
sts = session->QueryImpl(&templat, modidx, &desc);
if (PXC_STATUS_NO_ERROR > sts)
break;
PXCSmartPtr<PXCCapture> capture;
sts = session->CreateImpl<PXCCapture>(&desc, &capture);
if (!capture.IsValid())
continue;
/* enumerate devices */
for (int devidx = 0; PXC_STATUS_NO_ERROR <= sts; devidx++)
{
PXCSmartPtr<PXCCapture::Device> device;
sts = capture->CreateDevice(devidx, &device);
if (PXC_STATUS_NO_ERROR <= sts)
{
m_device = device.ReleasePtr();
return true;
}
}
}
return false;
}
PXCSmartPtr<PXCCapture::VideoStream> m_stream;
void initStreamImpl(PXCImage::ImageType type)
{
if (!m_device.IsValid())
return;
pxcStatus sts = PXC_STATUS_NO_ERROR;
/* enumerate streams */
for (int streamidx = 0; PXC_STATUS_NO_ERROR <= sts; streamidx++)
{
PXCCapture::Device::StreamInfo sinfo;
sts = m_device->QueryStream(streamidx, &sinfo);
if (PXC_STATUS_NO_ERROR > sts)
break;
if (PXCCapture::VideoStream::CUID != sinfo.cuid)
continue;
if (type != sinfo.imageType)
continue;
sts = m_device->CreateStream<PXCCapture::VideoStream>(streamidx, &m_stream);
if (PXC_STATUS_NO_ERROR == sts)
break;
m_stream.ReleaseRef();
}
}
protected:
std::vector<PXCCapture::VideoStream::ProfileInfo> m_profiles;
int m_profileIdx;
int m_frameIdx;
pxcU64 m_timeStampStartNS;
double m_timeStamp;
virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& /*pinfo*/)
{
return true;
}
void enumProfiles()
{
m_profiles.clear();
if (!m_stream.IsValid())
return;
pxcStatus sts = PXC_STATUS_NO_ERROR;
for (int profidx = 0; PXC_STATUS_NO_ERROR <= sts; profidx++)
{
PXCCapture::VideoStream::ProfileInfo pinfo;
sts = m_stream->QueryProfile(profidx, &pinfo);
if (PXC_STATUS_NO_ERROR > sts)
break;
if (validProfile(pinfo))
m_profiles.push_back(pinfo);
}
}
virtual bool prepareIplImage(PXCImage *pxcImage) = 0;
};
class CvIntelPerCStreamImage
: public CvIntelPerCStreamBase
{
public:
CvIntelPerCStreamImage()
{
}
virtual ~CvIntelPerCStreamImage()
{
}
virtual bool initStream(PXCSession *session)
{
if (!initDevice(session))
return false;
initStreamImpl(PXCImage::IMAGE_TYPE_COLOR);
if (!m_stream.IsValid())
return false;
enumProfiles();
return true;
}
virtual double getProperty(int propIdx)
{
switch (propIdx)
{
case CV_CAP_PROP_BRIGHTNESS:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, &fret))
return (double)fret;
return 0.0;
}
break;
case CV_CAP_PROP_CONTRAST:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, &fret))
return (double)fret;
return 0.0;
}
break;
case CV_CAP_PROP_SATURATION:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, &fret))
return (double)fret;
return 0.0;
}
break;
case CV_CAP_PROP_HUE:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, &fret))
return (double)fret;
return 0.0;
}
break;
case CV_CAP_PROP_GAMMA:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, &fret))
return (double)fret;
return 0.0;
}
break;
case CV_CAP_PROP_SHARPNESS:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, &fret))
return (double)fret;
return 0.0;
}
break;
case CV_CAP_PROP_GAIN:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, &fret))
return (double)fret;
return 0.0;
}
break;
case CV_CAP_PROP_BACKLIGHT:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, &fret))
return (double)fret;
return 0.0;
}
break;
case CV_CAP_PROP_EXPOSURE:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, &fret))
return (double)fret;
return 0.0;
}
break;
//Add image stream specific properties
}
return CvIntelPerCStreamBase::getProperty(propIdx);
}
virtual bool setProperty(int propIdx, double propVal)
{
switch (propIdx)
{
case CV_CAP_PROP_BRIGHTNESS:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BRIGHTNESS, (float)propVal));
}
break;
case CV_CAP_PROP_CONTRAST:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_CONTRAST, (float)propVal));
}
break;
case CV_CAP_PROP_SATURATION:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SATURATION, (float)propVal));
}
break;
case CV_CAP_PROP_HUE:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_HUE, (float)propVal));
}
break;
case CV_CAP_PROP_GAMMA:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAMMA, (float)propVal));
}
break;
case CV_CAP_PROP_SHARPNESS:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_SHARPNESS, (float)propVal));
}
break;
case CV_CAP_PROP_GAIN:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_GAIN, (float)propVal));
}
break;
case CV_CAP_PROP_BACKLIGHT:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_BACK_LIGHT_COMPENSATION, (float)propVal));
}
break;
case CV_CAP_PROP_EXPOSURE:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_COLOR_EXPOSURE, (float)propVal));
}
break;
//Add image stream specific properties
}
return CvIntelPerCStreamBase::setProperty(propIdx, propVal);
}
public:
IplImage* retrieveFrame()
{
return m_frame.retrieveFrame();
}
protected:
FrameInternal m_frame;
bool prepareIplImage(PXCImage *pxcImage)
{
if (NULL == pxcImage)
return false;
PXCImage::ImageInfo info;
pxcImage->QueryInfo(&info);
PXCImage::ImageData data;
pxcImage->AcquireAccess(PXCImage::ACCESS_READ, PXCImage::COLOR_FORMAT_RGB24, &data);
if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type)
return false;
cv::Mat temp(info.height, info.width, CV_8UC3, data.planes[0], data.pitches[0]);
temp.copyTo(m_frame.m_mat);
pxcImage->ReleaseAccess(&data);
return true;
}
};
class CvIntelPerCStreamDepth
: public CvIntelPerCStreamBase
{
public:
CvIntelPerCStreamDepth()
{
}
virtual ~CvIntelPerCStreamDepth()
{
}
virtual bool initStream(PXCSession *session)
{
if (!initDevice(session))
return false;
initStreamImpl(PXCImage::IMAGE_TYPE_DEPTH);
if (!m_stream.IsValid())
return false;
enumProfiles();
return true;
}
virtual double getProperty(int propIdx)
{
switch (propIdx)
{
case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, &fret))
return (double)fret;
return 0.0;
}
break;
case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, &fret))
return (double)fret;
return 0.0;
}
break;
case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD:
{
if (!m_device.IsValid())
return 0.0;
float fret = 0.0f;
if (PXC_STATUS_NO_ERROR == m_device->QueryProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, &fret))
return (double)fret;
return 0.0;
}
break;
case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_HORZ:
{
if (!m_device.IsValid())
return 0.0f;
PXCPointF32 ptf;
if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf))
return (double)ptf.x;
return 0.0;
}
break;
case CV_CAP_PROP_INTELPERC_DEPTH_FOCAL_LENGTH_VERT:
{
if (!m_device.IsValid())
return 0.0f;
PXCPointF32 ptf;
if (PXC_STATUS_NO_ERROR == m_device->QueryPropertyAsPoint(PXCCapture::Device::PROPERTY_DEPTH_FOCAL_LENGTH, &ptf))
return (double)ptf.y;
return 0.0;
}
break;
//Add depth stream specific properties
}
return CvIntelPerCStreamBase::getProperty(propIdx);
}
virtual bool setProperty(int propIdx, double propVal)
{
switch (propIdx)
{
case CV_CAP_PROP_INTELPERC_DEPTH_LOW_CONFIDENCE_VALUE:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_LOW_CONFIDENCE_VALUE, (float)propVal));
}
break;
case CV_CAP_PROP_INTELPERC_DEPTH_SATURATION_VALUE:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_SATURATION_VALUE, (float)propVal));
}
break;
case CV_CAP_PROP_INTELPERC_DEPTH_CONFIDENCE_THRESHOLD:
{
if (!m_device.IsValid())
return false;
return (PXC_STATUS_NO_ERROR == m_device->SetProperty(PXCCapture::Device::PROPERTY_DEPTH_CONFIDENCE_THRESHOLD, (float)propVal));
}
break;
//Add depth stream specific properties
}
return CvIntelPerCStreamBase::setProperty(propIdx, propVal);
}
public:
IplImage* retrieveDepthFrame()
{
return m_frameDepth.retrieveFrame();
}
IplImage* retrieveIRFrame()
{
return m_frameIR.retrieveFrame();
}
IplImage* retrieveUVFrame()
{
return m_frameUV.retrieveFrame();
}
protected:
virtual bool validProfile(const PXCCapture::VideoStream::ProfileInfo& pinfo)
{
return (PXCImage::COLOR_FORMAT_DEPTH == pinfo.imageInfo.format);
}
protected:
FrameInternal m_frameDepth;
FrameInternal m_frameIR;
FrameInternal m_frameUV;
bool prepareIplImage(PXCImage *pxcImage)
{
if (NULL == pxcImage)
return false;
PXCImage::ImageInfo info;
pxcImage->QueryInfo(&info);
PXCImage::ImageData data;
pxcImage->AcquireAccess(PXCImage::ACCESS_READ, &data);
if (PXCImage::SURFACE_TYPE_SYSTEM_MEMORY != data.type)
return false;
if (PXCImage::COLOR_FORMAT_DEPTH != data.format)
return false;
{
cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[0], data.pitches[0]);
temp.copyTo(m_frameDepth.m_mat);
}
{
cv::Mat temp(info.height, info.width, CV_16SC1, data.planes[1], data.pitches[1]);
temp.copyTo(m_frameIR.m_mat);
}
{
cv::Mat temp(info.height, info.width, CV_32FC2, data.planes[2], data.pitches[2]);
temp.copyTo(m_frameUV.m_mat);
}
pxcImage->ReleaseAccess(&data);
return true;
}
};
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
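// CvCapture_IntelPerC ties the color and depth stream wrappers to the CvCapture
// interface: get/setProperty route each request to one of the streams according to
// the CV_CAP_INTELPERC_GENERATORS_MASK bits (the depth stream is the default target),
// and retrieveFrame() picks the output image by the CV_CAP_INTELPERC_* stream index.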
class CvCapture_IntelPerC : public CvCapture
{
public:
CvCapture_IntelPerC(int /*index*/)
: m_contextOpened(false)
{
pxcStatus sts = PXCSession_Create(&m_session);
if (PXC_STATUS_NO_ERROR > sts)
return;
m_contextOpened = m_imageStream.initStream(m_session);
m_contextOpened &= m_depthStream.initStream(m_session);
}
virtual ~CvCapture_IntelPerC(){}
virtual double getProperty(int propIdx)
{
double propValue = 0;
int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK;
if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
{
propValue = m_imageStream.getProperty(purePropIdx);
}
else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
{
propValue = m_depthStream.getProperty(purePropIdx);
}
else
{
propValue = m_depthStream.getProperty(purePropIdx);
}
return propValue;
}
virtual bool setProperty(int propIdx, double propVal)
{
bool isSet = false;
int purePropIdx = propIdx & ~CV_CAP_INTELPERC_GENERATORS_MASK;
if (CV_CAP_INTELPERC_IMAGE_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
{
isSet = m_imageStream.setProperty(purePropIdx, propVal);
}
else if (CV_CAP_INTELPERC_DEPTH_GENERATOR == (propIdx & CV_CAP_INTELPERC_GENERATORS_MASK))
{
isSet = m_depthStream.setProperty(purePropIdx, propVal);
}
else
{
isSet = m_depthStream.setProperty(purePropIdx, propVal);
}
return isSet;
}
bool grabFrame()
{
if (!isOpened())
return false;
bool isGrabbed = false;
if (m_depthStream.isValid())
isGrabbed = m_depthStream.grabFrame();
if ((m_imageStream.isValid()) && (-1 != m_imageStream.getProfileIDX()))
isGrabbed &= m_imageStream.grabFrame();
return isGrabbed;
}
virtual IplImage* retrieveFrame(int outputType)
{
IplImage* image = 0;
switch (outputType)
{
case CV_CAP_INTELPERC_DEPTH_MAP:
image = m_depthStream.retrieveDepthFrame();
break;
case CV_CAP_INTELPERC_UVDEPTH_MAP:
image = m_depthStream.retrieveUVFrame();
break;
case CV_CAP_INTELPERC_IR_MAP:
image = m_depthStream.retrieveIRFrame();
break;
case CV_CAP_INTELPERC_IMAGE:
image = m_imageStream.retrieveFrame();
break;
}
CV_Assert(NULL != image);
return image;
}
bool isOpened() const
{
return m_contextOpened;
}
protected:
bool m_contextOpened;
PXCSmartPtr<PXCSession> m_session;
CvIntelPerCStreamImage m_imageStream;
CvIntelPerCStreamDepth m_depthStream;
};
CvCapture* cvCreateCameraCapture_IntelPerC(int index)
{
CvCapture_IntelPerC* capture = new CvCapture_IntelPerC(index);
if( capture->isOpened() )
return capture;
delete capture;
return 0;
}
#endif //HAVE_INTELPERC

View File

@@ -128,6 +128,7 @@ CvCapture* cvCreateFileCapture_OpenNI( const char* filename );
CvCapture* cvCreateCameraCapture_Android( int index );
CvCapture* cvCreateCameraCapture_XIMEA( int index );
CvCapture* cvCreateCameraCapture_AVFoundation(int index);
CvCapture* cvCreateCameraCapture_IntelPerC(int index);
CVAPI(int) cvHaveImageReader(const char* filename);

View File

@@ -35,6 +35,7 @@
defined(HAVE_XIMEA) || \
defined(HAVE_AVFOUNDATION) || \
defined(HAVE_GIGE_API) || \
defined(HAVE_INTELPERC) || \
(0)
//defined(HAVE_ANDROID_NATIVE_CAMERA) || - enable after #1193
# define BUILD_WITH_CAMERA_SUPPORT 1

View File

@@ -3299,7 +3299,10 @@ public:
if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
{
bufxy = (*m1)(Rect(x, y, bcols, brows));
bufa = (*m2)(Rect(x, y, bcols, brows));
const ushort* sA = (const ushort*)(m2->data + m2->step*(y+y1)) + x;
for( x1 = 0; x1 < bcols; x1++ )
A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
}
else if( planar_input )
{
@@ -3680,7 +3683,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
{
for( x = 0; x < size.width; x++ )
{
int fxy = src2 ? src2[x] : 0;
int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
}
@@ -3689,7 +3692,7 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
{
for( x = 0; x < size.width; x++ )
{
int fxy = src2 ? src2[x] : 0;
int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
}

View File

@@ -18,6 +18,8 @@ class_ignore_list = (
const_ignore_list = (
"CV_CAP_OPENNI",
"CV_CAP_PROP_OPENNI_",
"CV_CAP_INTELPERC",
"CV_CAP_PROP_INTELPERC_"
"WINDOW_AUTOSIZE",
"CV_WND_PROP_",
"CV_WINDOW_",

View File

@@ -12,6 +12,7 @@
//
// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Copyright (C) 2013, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
@@ -66,8 +67,8 @@ uint read_sumTex(IMAGE_INT32 img, sampler_t sam, int2 coord, int rows, int cols,
uchar read_imgTex(IMAGE_INT8 img, sampler_t sam, float2 coord, int rows, int cols, int elemPerRow)
{
#ifdef DISABLE_IMAGE2D
int x = clamp(convert_int_rte(coord.x), 0, cols - 1);
int y = clamp(convert_int_rte(coord.y), 0, rows - 1);
int x = clamp(round(coord.x), 0, cols - 1);
int y = clamp(round(coord.y), 0, rows - 1);
return img[elemPerRow * y + x];
#else
return (uchar)read_imageui(img, sam, coord).x;
@@ -98,6 +99,7 @@ __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAM
#define CV_PI_F 3.14159265f
#endif
// Use integral image to calculate haar wavelets.
// N = 2
// for simple haar pattern
@@ -114,10 +116,10 @@ float icvCalcHaarPatternSum_2(
F d = 0;
int2 dx1 = convert_int2_rte(ratio * src[0]);
int2 dy1 = convert_int2_rte(ratio * src[1]);
int2 dx2 = convert_int2_rte(ratio * src[2]);
int2 dy2 = convert_int2_rte(ratio * src[3]);
int2 dx1 = convert_int2(round(ratio * src[0]));
int2 dy1 = convert_int2(round(ratio * src[1]));
int2 dx2 = convert_int2(round(ratio * src[2]));
int2 dy2 = convert_int2(round(ratio * src[3]));
F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
@@ -136,106 +138,9 @@ float icvCalcHaarPatternSum_2(
return (float)d;
}
// N = 3
float icvCalcHaarPatternSum_3(
IMAGE_INT32 sumTex,
__constant float4 *src,
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
F d = 0;
int4 dx1 = convert_int4_rte(ratio * src[0]);
int4 dy1 = convert_int4_rte(ratio * src[1]);
int4 dx2 = convert_int4_rte(ratio * src[2]);
int4 dy2 = convert_int4_rte(ratio * src[3]);
F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
return (float)d;
}
// N = 4
float icvCalcHaarPatternSum_4(
IMAGE_INT32 sumTex,
__constant float4 *src,
int oldSize,
int newSize,
int y, int x,
int rows, int cols, int elemPerRow)
{
float ratio = (float)newSize / oldSize;
F d = 0;
int4 dx1 = convert_int4_rte(ratio * src[0]);
int4 dy1 = convert_int4_rte(ratio * src[1]);
int4 dx2 = convert_int4_rte(ratio * src[2]);
int4 dy2 = convert_int4_rte(ratio * src[3]);
F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy1.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.x, y + dy2.x), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy1.x), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.x, y + dy2.x), rows, cols, elemPerRow );
d += t * src[4].x / ((dx2.x - dx1.x) * (dy2.x - dy1.x));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy1.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.y, y + dy2.y), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy1.y), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.y, y + dy2.y), rows, cols, elemPerRow );
d += t * src[4].y / ((dx2.y - dx1.y) * (dy2.y - dy1.y));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy1.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.z, y + dy2.z), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy1.z), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.z, y + dy2.z), rows, cols, elemPerRow );
d += t * src[4].z / ((dx2.z - dx1.z) * (dy2.z - dy1.z));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy1.w), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx1.w, y + dy2.w), rows, cols, elemPerRow );
t -= read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy1.w), rows, cols, elemPerRow );
t += read_sumTex( sumTex, sampler, (int2)(x + dx2.w, y + dy2.w), rows, cols, elemPerRow );
d += t * src[4].w / ((dx2.w - dx1.w) * (dy2.w - dy1.w));
return (float)d;
}
////////////////////////////////////////////////////////////////////////
// Hessian
__constant float4 c_DX[5] = { (float4)(0, 3, 6, 0), (float4)(2, 2, 2, 0), (float4)(3, 6, 9, 0), (float4)(7, 7, 7, 0), (float4)(1, -2, 1, 0) };
__constant float4 c_DY[5] = { (float4)(2, 2, 2, 0), (float4)(0, 3, 6, 0), (float4)(7, 7, 7, 0), (float4)(3, 6, 9, 0), (float4)(1, -2, 1, 0) };
__constant float4 c_DXY[5] = { (float4)(1, 5, 1, 5), (float4)(1, 1, 5, 5), (float4)(4, 8, 4, 8), (float4)(4, 4, 8, 8), (float4)(1, -1, -1, 1) };
// Use integral image to calculate haar wavelets.
__inline int calcSize(int octave, int layer)
{
/* Wavelet size at first layer of first octave. */
@@ -250,6 +155,24 @@ __inline int calcSize(int octave, int layer)
return (HAAR_SIZE0 + HAAR_SIZE_INC * layer) << octave;
}
// Calculate a derivative in an axis-aligned direction (x or y). The "plus1"
// boxes contribute 1 * (area), and the "minus2" box contributes -2 * (area).
// So the final computation is plus1a + plus1b - 2 * minus2. The corners are
// labeled A, B, C, and D, with A being the top left, B being top right, C
// being bottom left, and D being bottom right.
F calcAxisAlignedDerivative(
int plus1a_A, int plus1a_B, int plus1a_C, int plus1a_D, F plus1a_scale,
int plus1b_A, int plus1b_B, int plus1b_C, int plus1b_D, F plus1b_scale,
int minus2_A, int minus2_B, int minus2_C, int minus2_D, F minus2_scale)
{
F plus1a = plus1a_A - plus1a_B - plus1a_C + plus1a_D;
F plus1b = plus1b_A - plus1b_B - plus1b_C + plus1b_D;
F minus2 = minus2_A - minus2_B - minus2_C + minus2_D;
return (plus1a / plus1a_scale -
2.0f * minus2 / minus2_scale +
plus1b / plus1b_scale);
}
//calculate targeted layer per-pixel determinant and trace with an integral image
__kernel void icvCalcLayerDetAndTrace(
@@ -264,7 +187,7 @@ __kernel void icvCalcLayerDetAndTrace(
int c_octave,
int c_layer_rows,
int sumTex_step
)
)
{
det_step /= sizeof(*det);
trace_step /= sizeof(*trace);
@@ -288,16 +211,103 @@ __kernel void icvCalcLayerDetAndTrace(
if (size <= c_img_rows && size <= c_img_cols && i < samples_i && j < samples_j)
{
const float dx = icvCalcHaarPatternSum_3(sumTex, c_DX , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
const float dy = icvCalcHaarPatternSum_3(sumTex, c_DY , 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
const float dxy = icvCalcHaarPatternSum_4(sumTex, c_DXY, 9, size, i << c_octave, j << c_octave, c_img_rows, c_img_cols, sumTex_step);
int x = j << c_octave;
int y = i << c_octave;
float ratio = (float)size / 9;
// Precompute some commonly used values, which are used to offset
// texture coordinates in the integral image.
int r1 = round(ratio);
int r2 = round(ratio * 2.0f);
int r3 = round(ratio * 3.0f);
int r4 = round(ratio * 4.0f);
int r5 = round(ratio * 5.0f);
int r6 = round(ratio * 6.0f);
int r7 = round(ratio * 7.0f);
int r8 = round(ratio * 8.0f);
int r9 = round(ratio * 9.0f);
// Calculate the approximated derivative in the x-direction
F d = 0;
{
// Some of the pixels needed to compute the derivative are
// repeated, so we fetch each of them only once here.
int t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sumTex_step );
int t07 = read_sumTex( sumTex, sampler, (int2)(x, y + r7), c_img_rows, c_img_cols, sumTex_step );
int t32 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r2), c_img_rows, c_img_cols, sumTex_step );
int t37 = read_sumTex( sumTex, sampler, (int2)(x + r3, y + r7), c_img_rows, c_img_cols, sumTex_step );
int t62 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r2), c_img_rows, c_img_cols, sumTex_step );
int t67 = read_sumTex( sumTex, sampler, (int2)(x + r6, y + r7), c_img_rows, c_img_cols, sumTex_step );
int t92 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r2), c_img_rows, c_img_cols, sumTex_step );
int t97 = read_sumTex( sumTex, sampler, (int2)(x + r9, y + r7), c_img_rows, c_img_cols, sumTex_step );
d = calcAxisAlignedDerivative(t02, t07, t32, t37, (r3) * (r7 - r2),
t62, t67, t92, t97, (r9 - r6) * (r7 - r2),
t32, t37, t62, t67, (r6 - r3) * (r7 - r2));
}
const float dx = (float)d;
// Calculate the approximated derivative in the y-direction
d = 0;
{
// Some of the pixels needed to compute the derivative are
// repeated, so we fetch each of them only once here.
int t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sumTex_step );
int t23 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r3), c_img_rows, c_img_cols, sumTex_step );
int t70 = read_sumTex( sumTex, sampler, (int2)(x + r7, y), c_img_rows, c_img_cols, sumTex_step );
int t73 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r3), c_img_rows, c_img_cols, sumTex_step );
int t26 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r6), c_img_rows, c_img_cols, sumTex_step );
int t76 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r6), c_img_rows, c_img_cols, sumTex_step );
int t29 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r9), c_img_rows, c_img_cols, sumTex_step );
int t79 = read_sumTex( sumTex, sampler, (int2)(x + r7, y + r9), c_img_rows, c_img_cols, sumTex_step );
d = calcAxisAlignedDerivative(t20, t23, t70, t73, (r7 - r2) * (r3),
t26, t29, t76, t79, (r7 - r2) * (r9 - r6),
t23, t26, t73, t76, (r7 - r2) * (r6 - r3));
}
const float dy = (float)d;
// Calculate the approximated derivative in the xy-direction
d = 0;
{
// There are no shared fetches to reuse here; we just have to get all of the
// pixels in separate fetches.
F t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r1), c_img_rows, c_img_cols, sumTex_step );
t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r4), c_img_rows, c_img_cols, sumTex_step );
t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r1), c_img_rows, c_img_cols, sumTex_step );
t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sumTex_step );
d += t / ((r4 - r1) * (r4 - r1));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r1), c_img_rows, c_img_cols, sumTex_step );
t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r4), c_img_rows, c_img_cols, sumTex_step );
t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r1), c_img_rows, c_img_cols, sumTex_step );
t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r4), c_img_rows, c_img_cols, sumTex_step );
d -= t / ((r8 - r5) * (r4 - r1));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r1, y + r5), c_img_rows, c_img_cols, sumTex_step );
t -= read_sumTex( sumTex, sampler, (int2)(x + r1, y + r8), c_img_rows, c_img_cols, sumTex_step );
t -= read_sumTex( sumTex, sampler, (int2)(x + r4, y + r5), c_img_rows, c_img_cols, sumTex_step );
t += read_sumTex( sumTex, sampler, (int2)(x + r4, y + r8), c_img_rows, c_img_cols, sumTex_step );
d -= t / ((r4 - r1) * (r8 - r5));
t = 0;
t += read_sumTex( sumTex, sampler, (int2)(x + r5, y + r5), c_img_rows, c_img_cols, sumTex_step );
t -= read_sumTex( sumTex, sampler, (int2)(x + r5, y + r8), c_img_rows, c_img_cols, sumTex_step );
t -= read_sumTex( sumTex, sampler, (int2)(x + r8, y + r5), c_img_rows, c_img_cols, sumTex_step );
t += read_sumTex( sumTex, sampler, (int2)(x + r8, y + r8), c_img_rows, c_img_cols, sumTex_step );
d += t / ((r8 - r5) * (r8 - r5));
}
const float dxy = (float)d;
det [j + margin + det_step * (layer * c_layer_rows + i + margin)] = dx * dy - 0.81f * dxy * dxy;
trace[j + margin + trace_step * (layer * c_layer_rows + i + margin)] = dx + dy;
}
}
////////////////////////////////////////////////////////////////////////
// NONMAX
@@ -309,10 +319,10 @@ bool within_check(IMAGE_INT32 maskSumTex, int sum_i, int sum_j, int size, int ro
float d = 0;
int dx1 = convert_int_rte(ratio * c_DM[0]);
int dy1 = convert_int_rte(ratio * c_DM[1]);
int dx2 = convert_int_rte(ratio * c_DM[2]);
int dy2 = convert_int_rte(ratio * c_DM[3]);
int dx1 = round(ratio * c_DM[0]);
int dy1 = round(ratio * c_DM[1]);
int dx2 = round(ratio * c_DM[2]);
int dy2 = round(ratio * c_DM[3]);
float t = 0;
@@ -572,7 +582,7 @@ void icvFindMaximaInLayer(
}
// solve 3x3 linear system Ax=b for floating point input
inline bool solve3x3_float(volatile __local const float4 *A, volatile __local const float *b, volatile __local float *x)
inline bool solve3x3_float(const float4 *A, const float *b, float *x)
{
float det = A[0].x * (A[1].y * A[2].z - A[1].z * A[2].y)
- A[0].y * (A[1].x * A[2].z - A[1].z * A[2].x)
@@ -651,7 +661,7 @@ void icvInterpolateKeypoint(
if (get_local_id(0) == 0 && get_local_id(1) == 0 && get_local_id(2) == 0)
{
volatile __local float dD[3];
float dD[3];
//dx
dD[0] = -0.5f * (N9[1][1][2] - N9[1][1][0]);
@@ -660,7 +670,7 @@ void icvInterpolateKeypoint(
//ds
dD[2] = -0.5f * (N9[2][1][1] - N9[0][1][1]);
volatile __local float4 H[3];
float4 H[3];
//dxx
H[0].x = N9[1][1][0] - 2.0f * N9[1][1][1] + N9[1][1][2];
@@ -681,7 +691,7 @@ void icvInterpolateKeypoint(
//dss
H[2].z = N9[0][1][1] - 2.0f * N9[1][1][1] + N9[2][1][1];
volatile __local float x[3];
float x[3];
if (solve3x3_float(H, dD, x))
{
@@ -711,7 +721,7 @@ void icvInterpolateKeypoint(
sampled in a circle of radius 6s using wavelets of size 4s.
We ensure the gradient wavelet size is even to ensure the
wavelet pattern is balanced and symmetric around its center */
const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
const int grad_wav_size = 2 * round(2.0f * s);
// check when grad_wav_size is too big
if ((c_img_rows + 1) >= grad_wav_size && (c_img_cols + 1) >= grad_wav_size)
@@ -737,9 +747,12 @@ void icvInterpolateKeypoint(
////////////////////////////////////////////////////////////////////////
// Orientation
#define ORI_SEARCH_INC 5
#define ORI_WIN 60
#define ORI_SAMPLES 113
#define ORI_WIN 60
#define ORI_SAMPLES 113
// The distance between samples at the beginning of the reduction
#define ORI_RESPONSE_REDUCTION_WIDTH 48
#define ORI_RESPONSE_ARRAY_SIZE (ORI_RESPONSE_REDUCTION_WIDTH * 2)
__constant float c_aptX[ORI_SAMPLES] = {-6, -5, -5, -5, -5, -5, -5, -5, -4, -4, -4, -4, -4, -4, -4, -4, -4, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6};
__constant float c_aptY[ORI_SAMPLES] = {0, -3, -2, -1, 0, 1, 2, 3, -4, -3, -2, -1, 0, 1, 2, 3, 4, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, -4, -3, -2, -1, 0, 1, 2, 3, 4, -3, -2, -1, 0, 1, 2, 3, 0};
@@ -833,12 +846,15 @@ void icvCalcOrientation(
__global float* featureDir = keypoints + ANGLE_ROW * keypoints_step;
volatile __local float s_X[128];
volatile __local float s_Y[128];
volatile __local float s_angle[128];
__local float s_X[ORI_SAMPLES];
__local float s_Y[ORI_SAMPLES];
__local float s_angle[ORI_SAMPLES];
volatile __local float s_sumx[32 * 4];
volatile __local float s_sumy[32 * 4];
// Need to allocate enough to make the reduction work without accessing
// past the end of the array.
__local float s_sumx[ORI_RESPONSE_ARRAY_SIZE];
__local float s_sumy[ORI_RESPONSE_ARRAY_SIZE];
__local float s_mod[ORI_RESPONSE_ARRAY_SIZE];
/* The sampling intervals and wavelet sizes for selecting an orientation
and building the keypoint descriptor are defined relative to 's' */
@@ -849,28 +865,60 @@ void icvCalcOrientation(
sampled in a circle of radius 6s using wavelets of size 4s.
We ensure the gradient wavelet size is even to ensure the
wavelet pattern is balanced and symmetric around its center */
const int grad_wav_size = 2 * convert_int_rte(2.0f * s);
const int grad_wav_size = 2 * round(2.0f * s);
// check when grad_wav_size is too big
if ((c_img_rows + 1) < grad_wav_size || (c_img_cols + 1) < grad_wav_size)
return;
// Calc X, Y, angle and store it to shared memory
const int tid = get_local_id(1) * get_local_size(0) + get_local_id(0);
const int tid = get_local_id(0);
// Initialize values that are only used as part of the reduction later.
if (tid < ORI_RESPONSE_ARRAY_SIZE - ORI_LOCAL_SIZE) {
s_mod[tid + ORI_LOCAL_SIZE] = 0.0f;
}
float X = 0.0f, Y = 0.0f, angle = 0.0f;
float ratio = (float)grad_wav_size / 4;
if (tid < ORI_SAMPLES)
int r2 = round(ratio * 2.0);
int r4 = round(ratio * 4.0);
for (int i = tid; i < ORI_SAMPLES; i += ORI_LOCAL_SIZE )
{
float X = 0.0f, Y = 0.0f, angle = 0.0f;
const float margin = (float)(grad_wav_size - 1) / 2.0f;
const int x = convert_int_rte(featureX[get_group_id(0)] + c_aptX[tid] * s - margin);
const int y = convert_int_rte(featureY[get_group_id(0)] + c_aptY[tid] * s - margin);
const int x = round(featureX[get_group_id(0)] + c_aptX[i] * s - margin);
const int y = round(featureY[get_group_id(0)] + c_aptY[i] * s - margin);
if (y >= 0 && y < (c_img_rows + 1) - grad_wav_size &&
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
x >= 0 && x < (c_img_cols + 1) - grad_wav_size)
{
X = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NX, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
Y = c_aptW[tid] * icvCalcHaarPatternSum_2(sumTex, c_NY, 4, grad_wav_size, y, x, c_img_rows, c_img_cols, sum_step);
float apt = c_aptW[i];
// Compute the haar sum without fetching duplicate pixels.
float t00 = read_sumTex( sumTex, sampler, (int2)(x, y), c_img_rows, c_img_cols, sum_step);
float t02 = read_sumTex( sumTex, sampler, (int2)(x, y + r2), c_img_rows, c_img_cols, sum_step);
float t04 = read_sumTex( sumTex, sampler, (int2)(x, y + r4), c_img_rows, c_img_cols, sum_step);
float t20 = read_sumTex( sumTex, sampler, (int2)(x + r2, y), c_img_rows, c_img_cols, sum_step);
float t24 = read_sumTex( sumTex, sampler, (int2)(x + r2, y + r4), c_img_rows, c_img_cols, sum_step);
float t40 = read_sumTex( sumTex, sampler, (int2)(x + r4, y), c_img_rows, c_img_cols, sum_step);
float t42 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r2), c_img_rows, c_img_cols, sum_step);
float t44 = read_sumTex( sumTex, sampler, (int2)(x + r4, y + r4), c_img_rows, c_img_cols, sum_step);
F t = t00 - t04 - t20 + t24;
X -= t / ((r2) * (r4));
t = t20 - t24 - t40 + t44;
X += t / ((r4 - r2) * (r4));
t = t00 - t02 - t40 + t42;
Y += t / ((r2) * (r4));
t = t02 - t04 - t42 + t44;
Y -= t / ((r4) * (r4 - r2));
X = apt*X;
Y = apt*Y;
angle = atan2(Y, X);
@@ -879,76 +927,61 @@ void icvCalcOrientation(
angle *= 180.0f / CV_PI_F;
}
s_X[i] = X;
s_Y[i] = Y;
s_angle[i] = angle;
}
s_X[tid] = X;
s_Y[tid] = Y;
s_angle[tid] = angle;
barrier(CLK_LOCAL_MEM_FENCE);
float bestx = 0, besty = 0, best_mod = 0;
float sumx = 0.0f, sumy = 0.0f;
const int dir = tid * ORI_SEARCH_INC;
#pragma unroll
for (int i = 0; i < ORI_SAMPLES; ++i) {
int angle = round(s_angle[i]);
#pragma unroll
for (int i = 0; i < 18; ++i)
{
const int dir = (i * 4 + get_local_id(1)) * ORI_SEARCH_INC;
int d = abs(angle - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[i];
sumy += s_Y[i];
}
}
s_sumx[tid] = sumx;
s_sumy[tid] = sumy;
s_mod[tid] = sumx*sumx + sumy*sumy;
barrier(CLK_LOCAL_MEM_FENCE);
volatile float sumx = 0.0f, sumy = 0.0f;
int d = abs(convert_int_rte(s_angle[get_local_id(0)]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx = s_X[get_local_id(0)];
sumy = s_Y[get_local_id(0)];
}
d = abs(convert_int_rte(s_angle[get_local_id(0) + 32]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[get_local_id(0) + 32];
sumy += s_Y[get_local_id(0) + 32];
}
d = abs(convert_int_rte(s_angle[get_local_id(0) + 64]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[get_local_id(0) + 64];
sumy += s_Y[get_local_id(0) + 64];
}
d = abs(convert_int_rte(s_angle[get_local_id(0) + 96]) - dir);
if (d < ORI_WIN / 2 || d > 360 - ORI_WIN / 2)
{
sumx += s_X[get_local_id(0) + 96];
sumy += s_Y[get_local_id(0) + 96];
}
reduce_32_sum(s_sumx + get_local_id(1) * 32, &sumx, get_local_id(0));
reduce_32_sum(s_sumy + get_local_id(1) * 32, &sumy, get_local_id(0));
const float temp_mod = sumx * sumx + sumy * sumy;
if (temp_mod > best_mod)
{
best_mod = temp_mod;
bestx = sumx;
besty = sumy;
// This reduction searches for the longest wavelet response vector. The first
// step uses all of the work items in the workgroup to narrow the search
// down to the three candidates. It requires s_mod to have a few more
// elements allocated past the work-group size, which are pre-initialized to
// 0.0f above.
for(int t = ORI_RESPONSE_REDUCTION_WIDTH; t >= 3; t /= 2) {
if (tid < t) {
if (s_mod[tid] < s_mod[tid + t]) {
s_mod[tid] = s_mod[tid + t];
s_sumx[tid] = s_sumx[tid + t];
s_sumy[tid] = s_sumy[tid + t];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (get_local_id(0) == 0)
{
s_X[get_local_id(1)] = bestx;
s_Y[get_local_id(1)] = besty;
s_angle[get_local_id(1)] = best_mod;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(1) == 0 && get_local_id(0) == 0)
// Do the final reduction and write out the result.
if (tid == 0)
{
int bestIdx = 0;
if (s_angle[1] > s_angle[bestIdx])
// The loop above narrowed the search for the longest vector down to three
// possibilities. Pick the best here.
if (s_mod[1] > s_mod[bestIdx])
bestIdx = 1;
if (s_angle[2] > s_angle[bestIdx])
if (s_mod[2] > s_mod[bestIdx])
bestIdx = 2;
if (s_angle[3] > s_angle[bestIdx])
bestIdx = 3;
float kp_dir = atan2(s_Y[bestIdx], s_X[bestIdx]);
float kp_dir = atan2(s_sumy[bestIdx], s_sumx[bestIdx]);
if (kp_dir < 0)
kp_dir += 2.0f * CV_PI_F;
kp_dir *= 180.0f / CV_PI_F;
@@ -961,7 +994,6 @@ void icvCalcOrientation(
}
}
__kernel
void icvSetUpright(
__global float * keypoints,
@@ -1035,8 +1067,8 @@ inline float linearFilter(
float out = 0.0f;
const int x1 = convert_int_rtn(x);
const int y1 = convert_int_rtn(y);
const int x1 = round(x);
const int y1 = round(y);
const int x2 = x1 + 1;
const int y2 = y1 + 1;

View File

@@ -46,6 +46,7 @@
#ifdef HAVE_OPENCV_OCL
#include <cstdio>
#include <sstream>
#include "opencl_kernels.hpp"
using namespace cv;
@@ -57,18 +58,25 @@ namespace cv
{
namespace ocl
{
// The number of degrees between orientation samples in calcOrientation
const static int ORI_SEARCH_INC = 5;
// The local size of the calcOrientation kernel
const static int ORI_LOCAL_SIZE = (360 / ORI_SEARCH_INC);
static void openCLExecuteKernelSURF(Context *clCxt, const cv::ocl::ProgramEntry* source, String kernelName, size_t globalThreads[3],
size_t localThreads[3], std::vector< std::pair<size_t, const void *> > &args, int channels, int depth)
{
char optBuf [100] = {0};
char * optBufPtr = optBuf;
std::stringstream optsStr;
optsStr << "-D ORI_LOCAL_SIZE=" << ORI_LOCAL_SIZE << " ";
optsStr << "-D ORI_SEARCH_INC=" << ORI_SEARCH_INC << " ";
cl_kernel kernel;
kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optBufPtr);
kernel = openCLGetKernelFromSource(clCxt, source, kernelName, optsStr.str().c_str());
size_t wave_size = queryWaveFrontSize(kernel);
CV_Assert(clReleaseKernel(kernel) == CL_SUCCESS);
sprintf(optBufPtr, "-D WAVE_SIZE=%d", static_cast<int>(wave_size));
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optBufPtr);
optsStr << "-D WAVE_SIZE=" << wave_size;
openCLExecuteKernel(clCxt, source, kernelName, globalThreads, localThreads, args, channels, depth, optsStr.str().c_str());
}
}
}
@@ -601,8 +609,8 @@ void SURF_OCL_Invoker::icvCalcOrientation_gpu(const oclMat &keypoints, int nFeat
args.push_back( std::make_pair( sizeof(cl_int), (void *)&img_cols));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&surf_.sum.step));
size_t localThreads[3] = {32, 4, 1};
size_t globalThreads[3] = {nFeatures *localThreads[0], localThreads[1], 1};
size_t localThreads[3] = {ORI_LOCAL_SIZE, 1, 1};
size_t globalThreads[3] = {nFeatures * localThreads[0], 1, 1};
openCLExecuteKernelSURF(clCxt, &surfprog, kernelName, globalThreads, localThreads, args, -1, -1);
}

View File

@@ -287,7 +287,7 @@ ocl::createSeparableLinearFilter_GPU
----------------------------------------
Creates a separable linear filter engine.
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT)
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel, const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) )
:param srcType: Source array type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported.
@@ -303,6 +303,8 @@ Creates a separable linear filter engine.
:param bordertype: Pixel extrapolation method.
:param imgSize: Source image size to choose optimal method for processing.
.. seealso:: :ocv:func:`ocl::getLinearRowFilter_GPU`, :ocv:func:`ocl::getLinearColumnFilter_GPU`, :ocv:func:`createSeparableLinearFilter`
@@ -334,7 +336,7 @@ ocl::createDerivFilter_GPU
------------------------------
Creates a filter engine for the generalized Sobel operator.
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT )
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) )
:param srcType: Source image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` source types are supported.
@@ -348,6 +350,8 @@ Creates a filter engine for the generalized Sobel operator.
:param borderType: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`.
:param imgSize: Source image size to choose optimal method for processing.
.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createDerivFilter`
@@ -405,7 +409,7 @@ ocl::createGaussianFilter_GPU
---------------------------------
Creates a Gaussian filter engine.
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT)
.. ocv:function:: Ptr<FilterEngine_GPU> ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1) )
:param type: Source and destination image type. ``CV_8UC1`` , ``CV_8UC4`` , ``CV_16SC1`` , ``CV_16SC2`` , ``CV_16SC3`` , ``CV_32SC1`` , ``CV_32FC1`` are supported.
@@ -417,6 +421,8 @@ Creates a Gaussian filter engine.
:param bordertype: Pixel extrapolation method. For details, see :ocv:func:`borderInterpolate`.
:param imgSize: Source image size to choose optimal method for processing.
.. seealso:: :ocv:func:`ocl::createSeparableLinearFilter_GPU`, :ocv:func:`createGaussianFilter`
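A short usage sketch for the extended signature (illustrative only; it assumes ``d_src`` already holds an image of a supported type on the device and that the ``ocl`` module is available):

    #include <opencv2/ocl/ocl.hpp>

    void blurOnGpu(const cv::ocl::oclMat& d_src, cv::ocl::oclMat& d_dst)
    {
        // Passing the source size lets the engine pick the optimal kernel variant.
        cv::Ptr<cv::ocl::FilterEngine_GPU> gauss =
            cv::ocl::createGaussianFilter_GPU(d_src.type(), cv::Size(5, 5), 1.5, 0,
                                              cv::BORDER_DEFAULT, d_src.size());
        gauss->apply(d_src, d_dst);
    }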
ocl::GaussianBlur

View File

@@ -695,17 +695,17 @@ namespace cv
//! returns the separable linear filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat &rowKernel,
const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT);
const Mat &columnKernel, const Point &anchor = Point(-1, -1), double delta = 0.0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1));
//! returns the separable filter engine with the specified filters
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU> &rowFilter,
const Ptr<BaseColumnFilter_GPU> &columnFilter);
//! returns the Gaussian filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT);
CV_EXPORTS Ptr<FilterEngine_GPU> createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2 = 0, int bordertype = BORDER_DEFAULT, Size imgSize = Size(-1,-1));
//! returns filter engine for the generalized Sobel operator
CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT );
CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU( int srcType, int dstType, int dx, int dy, int ksize, int borderType = BORDER_DEFAULT, Size imgSize = Size(-1,-1) );
//! applies Laplacian operator to the image
// supports only ksize = 1 and ksize = 3
@@ -1439,8 +1439,10 @@ namespace cv
oclMat Dx_;
oclMat Dy_;
oclMat eig_;
oclMat eig_minmax_;
oclMat minMaxbuf_;
oclMat tmpCorners_;
oclMat counter_;
};
inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_,

View File

@@ -56,8 +56,19 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
{
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
int pixels_per_work_item = 1;
String build_options = format("-D DEPTH_%d", src.depth());
if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE))
{
if ((src.cols % 4 == 0) && (src.depth() == CV_8U))
pixels_per_work_item = 4;
else if (src.cols % 2 == 0)
pixels_per_work_item = 2;
else
pixels_per_work_item = 1;
}
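// On devices reporting FEATURE_CL_INTEL_DEVICE each work item converts 4 pixels
// (8-bit images with width divisible by 4) or 2 pixels (even width); the global
// width below is divided by pixels_per_work_item and the kernel is built with a
// matching -D pixels_per_work_item define.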
String build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), src.oclchannels(), bidx, pixels_per_work_item);
if (!additionalOptions.empty())
build_options = build_options + additionalOptions;
@@ -66,7 +77,6 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
@@ -77,6 +87,73 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
if (!data2.empty())
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data2.data ));
size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 };
#ifdef ANDROID
size_t lt[3] = { 16, 10, 1 };
#else
size_t lt[3] = { 16, 16, 1 };
#endif
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
}
static void toHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
const std::string & additionalOptions = std::string(),
const oclMat & data1 = oclMat(), const oclMat & data2 = oclMat())
{
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
std::string build_options = format("-D DEPTH_%d -D scn=%d -D bidx=%d", src.depth(), src.oclchannels(), bidx);
if (!additionalOptions.empty())
build_options += additionalOptions;
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
if (!data1.empty())
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data1.data ));
if (!data2.empty())
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data2.data ));
size_t gt[3] = { dst.cols, dst.rows, 1 };
#ifdef ANDROID
size_t lt[3] = { 16, 10, 1 };
#else
size_t lt[3] = { 16, 16, 1 };
#endif
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
}
static void fromGray_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
{
std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
if (!additionalOptions.empty())
build_options += additionalOptions;
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
if (!data.empty())
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data ));
size_t gt[3] = { dst.cols, dst.rows, 1 };
#ifdef ANDROID
size_t lt[3] = { 16, 10, 1 };
@@ -89,7 +166,50 @@ static void fromRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::
static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
{
String build_options = format("-D DEPTH_%d -D dcn=%d", src.depth(), dst.channels());
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
int pixels_per_work_item = 1;
if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE))
{
if ((src.cols % 4 == 0) && (src.depth() == CV_8U))
pixels_per_work_item = 4;
else if (src.cols % 2 == 0)
pixels_per_work_item = 2;
else
pixels_per_work_item = 1;
}
std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d -D pixels_per_work_item=%d", src.depth(), dst.channels(), bidx, pixels_per_work_item);
if (!additionalOptions.empty())
build_options += additionalOptions;
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
if (!data.empty())
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data ));
size_t gt[3] = { dst.cols/pixels_per_work_item, dst.rows, 1 };
#ifdef ANDROID
size_t lt[3] = { 16, 10, 1 };
#else
size_t lt[3] = { 16, 16, 1 };
#endif
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
}
static void toRGB_NV12_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
{
String build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
if (!additionalOptions.empty())
build_options = build_options + additionalOptions;
@@ -101,7 +221,6 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
@@ -119,10 +238,13 @@ static void toRGB_caller(const oclMat &src, oclMat &dst, int bidx, const std::st
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
}
static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
static void fromHSV_caller(const oclMat &src, oclMat &dst, int bidx, const std::string & kernelName,
const std::string & additionalOptions = std::string(), const oclMat & data = oclMat())
{
String build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s", src.depth(),
dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER");
std::string build_options = format("-D DEPTH_%d -D dcn=%d -D bidx=%d", src.depth(), dst.channels(), bidx);
if (!additionalOptions.empty())
build_options += additionalOptions;
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
@@ -136,6 +258,36 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
if (!data.empty())
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&data.data ));
size_t gt[3] = { dst.cols, dst.rows, 1 };
#ifdef ANDROID
size_t lt[3] = { 16, 10, 1 };
#else
size_t lt[3] = { 16, 16, 1 };
#endif
openCLExecuteKernel(src.clCxt, &cvt_color, kernelName.c_str(), gt, lt, args, -1, -1, build_options.c_str());
}
static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
{
int src_offset = src.offset / src.elemSize1(), src_step = src.step1();
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step1();
String build_options = format("-D DEPTH_%d -D dcn=%d -D scn=%d -D %s",
src.depth(), dst.channels(), src.channels(), reverse ? "REVERSE" : "ORDER");
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_offset ));
size_t gt[3] = { dst.cols, dst.rows, 1 };
#ifdef ANDROID
size_t lt[3] = { 16, 10, 1 };
@@ -147,8 +299,8 @@ static void RGB_caller(const oclMat &src, oclMat &dst, bool reverse)
static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
{
String build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d",
src.depth(), greenbits, dst.channels());
String build_options = format("-D DEPTH_%d -D greenbits=%d -D dcn=%d -D bidx=%d",
src.depth(), greenbits, dst.channels(), bidx);
int src_offset = src.offset >> 1, src_step = src.step >> 1;
int dst_offset = dst.offset / dst.elemSize1(), dst_step = dst.step / dst.elemSize1();
@@ -157,7 +309,6 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
@@ -174,8 +325,8 @@ static void fromRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int gree
static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenbits, const std::string & kernelName)
{
String build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d",
src.depth(), greenbits, src.channels());
String build_options = format("-D DEPTH_%d -D greenbits=%d -D scn=%d -D bidx=%d",
src.depth(), greenbits, src.channels(), bidx);
int src_offset = (int)src.offset, src_step = (int)src.step;
int dst_offset = dst.offset >> 1, dst_step = dst.step >> 1;
@@ -184,7 +335,6 @@ static void toRGB5x5_caller(const oclMat &src, oclMat &dst, int bidx, int greenb
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst_step));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&bidx));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset ));
@@ -272,7 +422,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
CV_Assert(scn == 1);
dcn = code == COLOR_GRAY2BGRA ? 4 : 3;
dst.create(sz, CV_MAKETYPE(depth, dcn));
toRGB_caller(src, dst, 0, "Gray2RGB");
fromGray_caller(src, dst, 0, "Gray2RGB");
break;
}
case COLOR_BGR2YUV: case COLOR_RGB2YUV:
@@ -303,7 +453,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
Size dstSz(sz.width, sz.height * 2 / 3);
dst.create(dstSz, CV_MAKETYPE(depth, dcn));
toRGB_caller(src, dst, bidx, "YUV2RGBA_NV12");
toRGB_NV12_caller(src, dst, bidx, "YUV2RGBA_NV12");
break;
}
case COLOR_BGR2YCrCb: case COLOR_RGB2YCrCb:
@@ -460,11 +610,11 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
initialized = true;
}
fromRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180);
toHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d", hrange), sdiv_data, hrange == 256 ? hdiv_data256 : hdiv_data180);
return;
}
fromRGB_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f)));
toHSV_caller(src, dst, bidx, kernelName, format(" -D hscale=%f", hrange*(1.f/360.f)));
break;
}
case COLOR_HSV2BGR: case COLOR_HSV2RGB: case COLOR_HSV2BGR_FULL: case COLOR_HSV2RGB_FULL:
@@ -483,7 +633,7 @@ static void cvtColor_caller(const oclMat &src, oclMat &dst, int code, int dcn)
dst.create(sz, CV_MAKETYPE(depth, dcn));
std::string kernelName = std::string(is_hsv ? "HSV" : "HLS") + "2RGB";
toRGB_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange));
fromHSV_caller(src, dst, bidx, kernelName, format(" -D hrange=%d -D hscale=%f", hrange, 6.f/hrange));
break;
}
case COLOR_RGBA2mRGBA: case COLOR_mRGBA2RGBA:

View File

@@ -741,6 +741,135 @@ void cv::ocl::filter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &ke
f->apply(src, dst);
}
const int optimizedSepFilterLocalSize = 16;
static void sepFilter2D_SinglePass(const oclMat &src, oclMat &dst,
const Mat &row_kernel, const Mat &col_kernel, int bordertype = BORDER_DEFAULT)
{
size_t lt2[3] = {optimizedSepFilterLocalSize, optimizedSepFilterLocalSize, 1};
size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1};
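// The global size is rounded up to a multiple of the workgroup size; work items
// that fall outside the destination return early inside the kernel.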
unsigned int src_pitch = src.step;
unsigned int dst_pitch = dst.step;
int src_offset_x = (src.offset % src.step) / src.elemSize();
int src_offset_y = src.offset / src.step;
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst.data ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.offset ));
args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholecols ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholerows ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.cols ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&dst.rows ));
String option = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d",(int)lt2[0], (int)lt2[1],
row_kernel.rows / 2, col_kernel.rows / 2 );
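// Embed the filter coefficients into the program as compile-time constants:
// each float is emitted as its raw IEEE-754 bit pattern in hex, and the device
// kernel recovers the value with as_float().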
option += " -D KERNEL_MATRIX_X=";
for(int i=0; i<row_kernel.rows; i++)
option += cv::format("0x%x,", *reinterpret_cast<const unsigned int*>( &row_kernel.at<float>(i) ) );
option += "0x0";
option += " -D KERNEL_MATRIX_Y=";
for(int i=0; i<col_kernel.rows; i++)
option += cv::format("0x%x,", *reinterpret_cast<const unsigned int*>( &col_kernel.at<float>(i) ) );
option += "0x0";
switch(src.type())
{
case CV_8UC1:
option += " -D SRCTYPE=uchar -D CONVERT_SRCTYPE=convert_float -D WORKTYPE=float";
break;
case CV_32FC1:
option += " -D SRCTYPE=float -D CONVERT_SRCTYPE= -D WORKTYPE=float";
break;
case CV_8UC2:
option += " -D SRCTYPE=uchar2 -D CONVERT_SRCTYPE=convert_float2 -D WORKTYPE=float2";
break;
case CV_32FC2:
option += " -D SRCTYPE=float2 -D CONVERT_SRCTYPE= -D WORKTYPE=float2";
break;
case CV_8UC3:
option += " -D SRCTYPE=uchar3 -D CONVERT_SRCTYPE=convert_float3 -D WORKTYPE=float3";
break;
case CV_32FC3:
option += " -D SRCTYPE=float3 -D CONVERT_SRCTYPE= -D WORKTYPE=float3";
break;
case CV_8UC4:
option += " -D SRCTYPE=uchar4 -D CONVERT_SRCTYPE=convert_float4 -D WORKTYPE=float4";
break;
case CV_32FC4:
option += " -D SRCTYPE=float4 -D CONVERT_SRCTYPE= -D WORKTYPE=float4";
break;
default:
CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!");
break;
}
switch(dst.type())
{
case CV_8UC1:
option += " -D DSTTYPE=uchar -D CONVERT_DSTTYPE=convert_uchar_sat";
break;
case CV_8UC2:
option += " -D DSTTYPE=uchar2 -D CONVERT_DSTTYPE=convert_uchar2_sat";
break;
case CV_8UC3:
option += " -D DSTTYPE=uchar3 -D CONVERT_DSTTYPE=convert_uchar3_sat";
break;
case CV_8UC4:
option += " -D DSTTYPE=uchar4 -D CONVERT_DSTTYPE=convert_uchar4_sat";
break;
case CV_32FC1:
option += " -D DSTTYPE=float -D CONVERT_DSTTYPE=";
break;
case CV_32FC2:
option += " -D DSTTYPE=float2 -D CONVERT_DSTTYPE=";
break;
case CV_32FC3:
option += " -D DSTTYPE=float3 -D CONVERT_DSTTYPE=";
break;
case CV_32FC4:
option += " -D DSTTYPE=float4 -D CONVERT_DSTTYPE=";
break;
default:
CV_Error(CV_StsUnsupportedFormat, "Image type is not supported!");
break;
}
switch(bordertype)
{
case cv::BORDER_CONSTANT:
option += " -D BORDER_CONSTANT";
break;
case cv::BORDER_REPLICATE:
option += " -D BORDER_REPLICATE";
break;
case cv::BORDER_REFLECT:
option += " -D BORDER_REFLECT";
break;
case cv::BORDER_REFLECT101:
option += " -D BORDER_REFLECT_101";
break;
case cv::BORDER_WRAP:
option += " -D BORDER_WRAP";
break;
default:
CV_Error(CV_StsBadFlag, "BORDER type is not supported!");
break;
}
openCLExecuteKernel(src.clCxt, &filtering_sep_filter_singlepass, "sep_filter_singlepass", gt2, lt2, args,
-1, -1, option.c_str() );
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// SeparableFilter
@@ -790,6 +919,35 @@ Ptr<FilterEngine_GPU> cv::ocl::createSeparableFilter_GPU(const Ptr<BaseRowFilter
return makePtr<SeparableFilterEngine_GPU>(rowFilter, columnFilter);
}
namespace
{
class SingleStepSeparableFilterEngine_GPU : public FilterEngine_GPU
{
public:
SingleStepSeparableFilterEngine_GPU( const Mat &rowKernel_, const Mat &columnKernel_, const int btype )
{
bordertype = btype;
rowKernel = rowKernel_;
columnKernel = columnKernel_;
}
virtual void apply(const oclMat &src, oclMat &dst, Rect roi = Rect(0, 0, -1, -1))
{
normalizeROI(roi, Size(rowKernel.rows, columnKernel.rows), Point(-1,-1), src.size());
oclMat srcROI = src(roi);
oclMat dstROI = dst(roi);
sepFilter2D_SinglePass(src, dst, rowKernel, columnKernel, bordertype);
}
Mat rowKernel;
Mat columnKernel;
int bordertype;
};
}
static void GPUFilterBox(const oclMat &src, oclMat &dst,
Size &ksize, const Point anchor, const int borderType)
{
@@ -1243,17 +1401,32 @@ Ptr<BaseColumnFilter_GPU> cv::ocl::getLinearColumnFilter_GPU(int /*bufType*/, in
}
Ptr<FilterEngine_GPU> cv::ocl::createSeparableLinearFilter_GPU(int srcType, int dstType,
const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype)
const Mat &rowKernel, const Mat &columnKernel, const Point &anchor, double delta, int bordertype, Size imgSize )
{
int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
int cn = CV_MAT_CN(srcType);
int bdepth = std::max(std::max(sdepth, ddepth), CV_32F);
int bufType = CV_MAKETYPE(bdepth, cn);
Context* clCxt = Context::getContext();
Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype);
Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta);
//if the image size is non-degenerate and large enough,
//and the filter support is small enough to satisfy the larger local memory requirements,
//then we can use the single-pass routine to avoid extra runtime call overhead
if( clCxt && clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) &&
rowKernel.rows <= 21 && columnKernel.rows <= 21 &&
(rowKernel.rows & 1) == 1 && (columnKernel.rows & 1) == 1 &&
imgSize.width > optimizedSepFilterLocalSize + (rowKernel.rows>>1) &&
imgSize.height > optimizedSepFilterLocalSize + (columnKernel.rows>>1) )
{
return Ptr<FilterEngine_GPU>(new SingleStepSeparableFilterEngine_GPU(rowKernel, columnKernel, bordertype));
}
else
{
Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x, bordertype);
Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y, bordertype, delta);
return createSeparableFilter_GPU(rowFilter, columnFilter);
return createSeparableFilter_GPU(rowFilter, columnFilter);
}
}
void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat &kernelX, const Mat &kernelY, Point anchor, double delta, int bordertype)
@@ -1277,16 +1450,16 @@ void cv::ocl::sepFilter2D(const oclMat &src, oclMat &dst, int ddepth, const Mat
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype);
Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, delta, bordertype, src.size());
f->apply(src, dst);
}
Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType)
Ptr<FilterEngine_GPU> cv::ocl::createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize, int borderType, Size imgSize )
{
Mat kx, ky;
getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
return createSeparableLinearFilter_GPU(srcType, dstType,
kx, ky, Point(-1, -1), 0, borderType);
kx, ky, Point(-1, -1), 0, borderType, imgSize);
}
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -1356,7 +1529,7 @@ void cv::ocl::Laplacian(const oclMat &src, oclMat &dst, int ddepth, int ksize, d
////////////////////////////////////////////////////////////////////////////////////////////////////
// Gaussian Filter
Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype)
Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, double sigma1, double sigma2, int bordertype, Size imgSize)
{
int depth = CV_MAT_DEPTH(type);
@@ -1383,7 +1556,7 @@ Ptr<FilterEngine_GPU> cv::ocl::createGaussianFilter_GPU(int type, Size ksize, do
else
ky = getGaussianKernel(ksize.height, sigma2, std::max(depth, CV_32F));
return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype);
return createSeparableLinearFilter_GPU(type, type, kx, ky, Point(-1, -1), 0.0, bordertype, imgSize);
}
void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double sigma1, double sigma2, int bordertype)
@@ -1419,7 +1592,7 @@ void cv::ocl::GaussianBlur(const oclMat &src, oclMat &dst, Size ksize, double si
dst.create(src.size(), src.type());
Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype);
Ptr<FilterEngine_GPU> f = createGaussianFilter_GPU(src.type(), ksize, sigma1, sigma2, bordertype, src.size());
f->apply(src, dst);
}

View File

@@ -48,154 +48,142 @@
using namespace cv;
using namespace cv::ocl;
// currently the sort procedure on the host is more efficient
static bool use_cpu_sorter = true;
namespace
// compact structure for corners
struct DefCorner
{
enum SortMethod
float eig; //eigenvalue of corner
short x; //x coordinate of corner point
short y; //y coordinate of corner point
} ;
// comparison functor for corners;
//it is used for sorting on the host side
struct DefCornerCompare
{
CPU_STL,
BITONIC,
SELECTION
};
const int GROUP_SIZE = 256;
template<SortMethod method>
struct Sorter
{
//typedef EigType;
};
//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed.
template<>
struct Sorter<CPU_STL>
{
typedef oclMat EigType;
static cv::Mutex cs;
static Mat mat_eig;
//prototype
static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2)
bool operator()(const DefCorner a, const DefCorner b) const
{
float v1 = mat_eig.at<float>(cvRound(pt1.s[1]), cvRound(pt1.s[0]));
float v2 = mat_eig.at<float>(cvRound(pt2.s[1]), cvRound(pt2.s[0]));
return v1 > v2;
}
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
cv::AutoLock lock(cs);
//temporarily use STL's sort function
Mat mat_corners = corners;
mat_eig = eig_tex;
std::sort(mat_corners.begin<cl_float2>(), mat_corners.begin<cl_float2>() + count, clfloat2Gt);
corners = mat_corners;
return a.eig > b.eig;
}
};
cv::Mutex Sorter<CPU_STL>::cs;
cv::Mat Sorter<CPU_STL>::mat_eig;
template<>
struct Sorter<BITONIC>
// sort corner points using the OpenCL bitonic sort implementation
static void sortCorners_caller(oclMat& corners, const int count)
{
typedef TextureCL EigType;
Context * cxt = Context::getContext();
int GS = count/2;
int LS = min(255,GS);
size_t globalThreads[3] = {GS, 1, 1};
size_t localThreads[3] = {LS, 1, 1};
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
// 2^numStages should be equal to count or the output is invalid
int numStages = 0;
for(int i = count; i > 1; i >>= 1)
{
Context * cxt = Context::getContext();
size_t globalThreads[3] = {count / 2, 1, 1};
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
// 2^numStages should be equal to count or the output is invalid
int numStages = 0;
for(int i = count; i > 1; i >>= 1)
++numStages;
}
const int argc = 4;
std::vector< std::pair<size_t, const void *> > args(argc);
std::string kernelname = "sortCorners_bitonicSort";
args[0] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
args[1] = std::make_pair(sizeof(cl_int), (void *)&count);
for(int stage = 0; stage < numStages; ++stage)
{
args[2] = std::make_pair(sizeof(cl_int), (void *)&stage);
for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
{
++numStages;
}
const int argc = 5;
std::vector< std::pair<size_t, const void *> > args(argc);
String kernelname = "sortCorners_bitonicSort";
args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex);
args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data);
args[2] = std::make_pair(sizeof(cl_int), (void *)&count);
for(int stage = 0; stage < numStages; ++stage)
{
args[3] = std::make_pair(sizeof(cl_int), (void *)&stage);
for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage)
{
args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
}
args[3] = std::make_pair(sizeof(cl_int), (void *)&passOfStage);
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
}
}
};
}
template<>
struct Sorter<SELECTION>
{
typedef TextureCL EigType;
static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count)
{
Context * cxt = Context::getContext();
size_t globalThreads[3] = {count, 1, 1};
size_t localThreads[3] = {GROUP_SIZE, 1, 1};
std::vector< std::pair<size_t, const void *> > args;
//local
String kernelname = "sortCorners_selectionSortLocal";
int lds_size = GROUP_SIZE * sizeof(cl_float2);
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) );
args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) );
args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) );
args.push_back( std::make_pair( lds_size, (void*)NULL) );
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
//final
kernelname = "sortCorners_selectionSortFinal";
args.pop_back();
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1);
}
};
int findCorners_caller(
const TextureCL& eig,
const float threshold,
const oclMat& mask,
oclMat& corners,
const int max_count)
// find corners in the matrix and put them into the array
static void findCorners_caller(
const oclMat& eig_mat, //input matrix with eigenvalues
oclMat& eigMinMax, //input with min and max values of eigenvalues
const float qualityLevel,
const oclMat& mask,
oclMat& corners, //output array with detected corners
oclMat& counter) //output value with the number of detected corners; has to be 0 before the call
{
String opt;
std::vector<int> k;
Context * cxt = Context::getContext();
std::vector< std::pair<size_t, const void*> > args;
String kernelname = "findCorners";
const int mask_strip = mask.step / mask.elemSize1();
oclMat g_counter(1, 1, CV_32SC1);
g_counter.setTo(0);
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&(eig_mat.data)));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&eig ));
int src_pitch = (int)eig_mat.step;
args.push_back(std::make_pair( sizeof(cl_int), (void*)&src_pitch ));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&mask.data ));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&corners.data ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&mask_strip));
args.push_back(std::make_pair( sizeof(cl_float), (void*)&threshold ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.rows ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig.cols ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&max_count ));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&g_counter.data ));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&eigMinMax.data ));
args.push_back(std::make_pair( sizeof(cl_float), (void*)&qualityLevel ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig_mat.rows ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&eig_mat.cols ));
args.push_back(std::make_pair( sizeof(cl_int), (void*)&corners.cols ));
args.push_back(std::make_pair( sizeof(cl_mem), (void*)&counter.data ));
size_t globalThreads[3] = {eig.cols, eig.rows, 1};
size_t globalThreads[3] = {eig_mat.cols, eig_mat.rows, 1};
size_t localThreads[3] = {16, 16, 1};
if(!mask.empty())
opt += " -D WITH_MASK=1";
const char * opt = mask.empty() ? "" : "-D WITH_MASK";
openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1, opt);
return std::min(Mat(g_counter).at<int>(0), max_count);
openCLExecuteKernel(cxt, &imgproc_gftt, "findCorners", globalThreads, localThreads, args, -1, -1, opt.c_str());
}
static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero)
{
size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits;
CV_Assert(groupnum != 0);
int dbsize = groupnum * 2 * src.elemSize();
ensureSizeIsEnough(1, dbsize, CV_8UC1, dst);
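// dst holds the per-workgroup partial min/max results (2 values per compute unit);
// the final kernel below reduces them to a single min/max pair and zeroes the corner counter.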
cl_mem dst_data = reinterpret_cast<cl_mem>(dst.data);
int all_cols = src.step / src.elemSize();
int pre_cols = (src.offset % src.step) / src.elemSize();
int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1;
int invalid_cols = pre_cols + sec_cols;
int cols = all_cols - invalid_cols , elemnum = cols * src.rows;
int offset = src.offset / src.elemSize();
{// first parallel pass
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_data ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&cols ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&invalid_cols ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&offset));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&elemnum));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum));
size_t globalThreads[3] = {groupnum * 256, 1, 1};
size_t localThreads[3] = {256, 1, 1};
openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads,
args, -1, -1, "-D T=float -D DEPTH_5");
}
{// run the final "serial" kernel to accumulate results from all threads and reset the corner counter
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&dst_data ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&groupnum ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&tozero.data ));
size_t globalThreads[3] = {1, 1, 1};
size_t localThreads[3] = {1, 1, 1};
openCLExecuteKernel(src.clCxt, &imgproc_gftt, "arithm_op_minMax_final", globalThreads, localThreads,
args, -1, -1);
}
}
}//unnamed namespace
void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask)
{
@@ -205,67 +193,99 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image,
ensureSizeIsEnough(image.size(), CV_32F, eig_);
if (useHarrisDetector)
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
cornerHarris_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK);
else
cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3);
double maxVal = 0;
minMax(eig_, NULL, &maxVal);
ensureSizeIsEnough(1,1, CV_32SC1, counter_);
ensureSizeIsEnough(1, std::max(1000, static_cast<int>(image.size().area() * 0.05)), CV_32FC2, tmpCorners_);
// find the max eigenvalue and reset the detected-corner counter
minMaxEig_caller(eig_,eig_minmax_,counter_);
Ptr<TextureCL> eig_tex = bindTexturePtr(eig_);
int total = findCorners_caller(
*eig_tex,
static_cast<float>(maxVal * qualityLevel),
// allocate buffer for kernels
int corner_array_size = std::max(1024, static_cast<int>(image.size().area() * 0.05));
if(!use_cpu_sorter)
{ // round to 2^n
unsigned int n=1;
for(n=1;n<(unsigned int)corner_array_size;n<<=1);
corner_array_size = (int)n;
ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
// set to 0 to be able to use bitonic sort on the whole 2^n array
tmpCorners_.setTo(0);
}
else
{
ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_);
}
int total = tmpCorners_.cols; // by default the number of corners is the full array size
std::vector<DefCorner> tmp(tmpCorners_.cols); // input buffer with corners for the HOST part of the algorithm
//find points with high eigenvalues and put them into the output array
findCorners_caller(
eig_,
eig_minmax_,
static_cast<float>(qualityLevel),
mask,
tmpCorners_,
tmpCorners_.cols);
counter_);
if(!use_cpu_sorter)
{// sort detected corners on the device side
sortCorners_caller(tmpCorners_, corner_array_size);
}
else
{// send a non-blocking request to read the real number of detected corners, so they can be sorted on the HOST side
openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(counter_.clCxt), (cl_mem)counter_.data, CL_FALSE, 0,sizeof(int), &total, 0, NULL, NULL));
}
//blocking read of the whole corners array (sorted or not)
openCLReadBuffer(tmpCorners_.clCxt,(cl_mem)tmpCorners_.data,&tmp[0],tmpCorners_.cols*sizeof(DefCorner));
if (total == 0)
{
{// check for trivial case
corners.release();
return;
}
if(use_cpu_sorter)
{
Sorter<CPU_STL>::sortCorners_caller(eig_, tmpCorners_, total);
}
else
{
//if total is power of 2
if(((total - 1) & (total)) == 0)
{
Sorter<BITONIC>::sortCorners_caller(*eig_tex, tmpCorners_, total);
}
else
{
Sorter<SELECTION>::sortCorners_caller(*eig_tex, tmpCorners_, total);
}
{// sort detected corners on cpu side.
tmp.resize(total);
std::sort(tmp.begin(), tmp.end(), DefCornerCompare());
}
//estimate maximal size of final output array
int total_max = maxCorners > 0 ? std::min(maxCorners, total) : total;
int D2 = (int)ceil(minDistance * minDistance);
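// D2 is the squared minimum distance; comparing squared distances below avoids
// taking a square root for every candidate pair.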
// allocate output buffer
std::vector<Point2f> tmp2;
tmp2.reserve(total_max);
if (minDistance < 1)
{
Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1);
tmpCorners_(roi_range).copyTo(corners);
{// there is no distance restriction, so just copy (with conversion) the maximal allowed number of points into the output array
for(int i=0;i<total_max && tmp[i].eig>0.0f;++i)
{
tmp2.push_back(Point2f(tmp[i].x,tmp[i].y));
}
}
else
{
std::vector<Point2f> tmp(total);
downloadPoints(tmpCorners_, tmp);
std::vector<Point2f> tmp2;
tmp2.reserve(total);
{// there is a distance restriction, so start copying to the output array from the first element and check the distance for each subsequent one
const int cell_size = cvRound(minDistance);
const int grid_width = (image.cols + cell_size - 1) / cell_size;
const int grid_height = (image.rows + cell_size - 1) / cell_size;
std::vector< std::vector<Point2f> > grid(grid_width * grid_height);
std::vector< std::vector<Point2i> > grid(grid_width * grid_height);
for (int i = 0; i < total; ++i)
for (int i = 0; i < total ; ++i)
{
Point2f p = tmp[i];
DefCorner p = tmp[i];
if(p.eig<=0.0f)
break; // stop condition that is needed when the GPU bitonic sort is used.
bool good = true;
@@ -287,40 +307,42 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image,
{
for (int xx = x1; xx <= x2; xx++)
{
std::vector<Point2f>& m = grid[yy * grid_width + xx];
if (!m.empty())
std::vector<Point2i>& m = grid[yy * grid_width + xx];
if (m.empty())
continue;
for(size_t j = 0; j < m.size(); j++)
{
for(size_t j = 0; j < m.size(); j++)
{
float dx = p.x - m[j].x;
float dy = p.y - m[j].y;
int dx = p.x - m[j].x;
int dy = p.y - m[j].y;
if (dx * dx + dy * dy < minDistance * minDistance)
{
good = false;
goto break_out;
}
if (dx * dx + dy * dy < D2)
{
good = false;
goto break_out_;
}
}
}
}
break_out:
break_out_:
if(good)
{
grid[y_cell * grid_width + x_cell].push_back(p);
grid[y_cell * grid_width + x_cell].push_back(Point2i(p.x,p.y));
tmp2.push_back(p);
tmp2.push_back(Point2f(p.x,p.y));
if (maxCorners > 0 && tmp2.size() == static_cast<size_t>(maxCorners))
break;
}
}
corners.upload(Mat(1, static_cast<int>(tmp2.size()), CV_32FC2, &tmp2[0]));
}
int final_size = static_cast<int>(tmp2.size());
if(final_size>0)
corners.upload(Mat(1, final_size, CV_32FC2, &tmp2[0]));
else
corners.release();
}
void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, std::vector<Point2f> &points_v)
{

View File

@@ -866,16 +866,17 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
if(gcascade->is_stump_based && gsum.clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE))
{
//setup local group size
localThreads[0] = 8;
localThreads[1] = 16;
//setup local group size for "pixel step" = 1
localThreads[0] = 16;
localThreads[1] = 32;
localThreads[2] = 1;
//init maximal number of workgroups
//calc maximal number of workgroups
int WGNumX = 1+(sizev[0].width /(localThreads[0]));
int WGNumY = 1+(sizev[0].height/(localThreads[1]));
int WGNumZ = loopcount;
int WGNum = 0; //accurate number of non -empty workgroups
int WGNumTotal = 0; //accurate number of non-empty workgroups
int WGNumSampled = 0; //accurate number of workgroups that process only 1/4 of all pixels; used for large images with scale <= 2
oclMat oclWGInfo(1,sizeof(cl_int4) * WGNumX*WGNumY*WGNumZ,CV_8U);
{
cl_int4* pWGInfo = (cl_int4*)clEnqueueMapBuffer(getClCommandQueue(oclWGInfo.clCxt),(cl_mem)oclWGInfo.datastart,true,CL_MAP_WRITE, 0, oclWGInfo.step, 0,0,0,&status);
@@ -895,12 +896,16 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
if(gx>=(Width-cascade->orig_window_size.width))
continue; // no data to process
if(scaleinfo[z].factor<=2)
{
WGNumSampled++;
}
// save non-empty workgroup info into the array
pWGInfo[WGNum].s[0] = scaleinfo[z].width_height;
pWGInfo[WGNum].s[1] = (gx << 16) | gy;
pWGInfo[WGNum].s[2] = scaleinfo[z].imgoff;
memcpy(&(pWGInfo[WGNum].s[3]),&(scaleinfo[z].factor),sizeof(float));
WGNum++;
pWGInfo[WGNumTotal].s[0] = scaleinfo[z].width_height;
pWGInfo[WGNumTotal].s[1] = (gx << 16) | gy;
pWGInfo[WGNumTotal].s[2] = scaleinfo[z].imgoff;
memcpy(&(pWGInfo[WGNumTotal].s[3]),&(scaleinfo[z].factor),sizeof(float));
WGNumTotal++;
}
}
}
@@ -908,13 +913,8 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
pWGInfo = NULL;
}
// setup global sizes to have linear array of workgroups with WGNum size
globalThreads[0] = localThreads[0]*WGNum;
globalThreads[1] = localThreads[1];
globalThreads[2] = 1;
#define NODE_SIZE 12
// pack node info to have less memory loads
// pack node info to have less memory loads on the device side
oclMat oclNodesPK(1,sizeof(cl_int) * NODE_SIZE * nodenum,CV_8U);
{
cl_int status;
@@ -963,8 +963,6 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
options += format(" -D WND_SIZE_X=%d",cascade->orig_window_size.width);
options += format(" -D WND_SIZE_Y=%d",cascade->orig_window_size.height);
options += format(" -D STUMP_BASED=%d",gcascade->is_stump_based);
options += format(" -D LSx=%d",localThreads[0]);
options += format(" -D LSy=%d",localThreads[1]);
options += format(" -D SPLITNODE=%d",splitnode);
options += format(" -D SPLITSTAGE=%d",splitstage);
options += format(" -D OUTPUTSZ=%d",outputsz);
@@ -972,8 +970,39 @@ void OclCascadeClassifier::detectMultiScale(oclMat &gimg, CV_OUT std::vector<cv:
// init the candidate global count to 0
int pattern = 0;
openCLSafeCall(clEnqueueWriteBuffer(qu, candidatebuffer, 1, 0, 1 * sizeof(pattern),&pattern, 0, NULL, NULL));
// execute face detector
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, localThreads, args, -1, -1, options.c_str());
if(WGNumTotal>WGNumSampled)
{// small images: each pixel is processed
// setup global sizes to have linear array of workgroups with WGNum size
int pixelstep = 1;
size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1};
globalThreads[0] = LS[0]*(WGNumTotal-WGNumSampled);
globalThreads[1] = LS[1];
globalThreads[2] = 1;
String options1 = options;
options1 += format(" -D PIXEL_STEP=%d",pixelstep);
options1 += format(" -D WGSTART=%d",WGNumSampled);
options1 += format(" -D LSx=%d",LS[0]);
options1 += format(" -D LSy=%d",LS[1]);
// execute face detector
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options1.c_str());
}
if(WGNumSampled>0)
{// large images: every 4th pixel is processed
// setup global sizes to have linear array of workgroups with WGNum size
int pixelstep = 2;
size_t LS[3]={localThreads[0]/pixelstep,localThreads[1]/pixelstep,1};
globalThreads[0] = LS[0]*WGNumSampled;
globalThreads[1] = LS[1];
globalThreads[2] = 1;
String options2 = options;
options2 += format(" -D PIXEL_STEP=%d",pixelstep);
options2 += format(" -D WGSTART=%d",0);
options2 += format(" -D LSx=%d",LS[0]);
options2 += format(" -D LSy=%d",LS[1]);
// execute face detector
openCLExecuteKernel(gsum.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascadePacked", globalThreads, LS, args, -1, -1, options2.c_str());
}
//read candidate buffer back and put it into host list
openCLReadBuffer( gsum.clCxt, candidatebuffer, candidate, 4 * sizeof(int)*outputsz );
assert(candidate[0]<outputsz);

View File

@@ -76,6 +76,11 @@ namespace cv
int cdescr_width;
int cdescr_height;
// A shift value and type that allow qangle to have different
// sizes on different hardware
int qangle_step_shift;
int qangle_type;
void set_up_constants(int nbins, int block_stride_x, int block_stride_y,
int nblocks_win_x, int nblocks_win_y);
@@ -153,6 +158,7 @@ cv::ocl::HOGDescriptor::HOGDescriptor(Size win_size_, Size block_size_, Size blo
hog_device_cpu = true;
else
hog_device_cpu = false;
}
size_t cv::ocl::HOGDescriptor::getDescriptorSize() const
@@ -213,7 +219,7 @@ void cv::ocl::HOGDescriptor::init_buffer(const oclMat &img, Size win_stride)
effect_size = img.size();
grad.create(img.size(), CV_32FC2);
qangle.create(img.size(), CV_8UC2);
qangle.create(img.size(), hog::qangle_type);
const size_t block_hist_size = getBlockHistogramSize();
const Size blocks_per_img = numPartsWithin(img.size(), block_size, block_stride);
@@ -1606,6 +1612,16 @@ void cv::ocl::device::hog::set_up_constants(int nbins,
int descr_size = descr_width * nblocks_win_y;
cdescr_size = descr_size;
qangle_type = CV_8UC2;
qangle_step_shift = 0;
// Some Intel devices have low single-byte access performance,
// so we change the datatype here.
if (Context::getContext()->supportsFeature(FEATURE_CL_INTEL_DEVICE))
{
qangle_type = CV_32SC2;
qangle_step_shift = 2;
}
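// qangle_step_shift compensates for the wider element type: CV_32SC2 channels are
// 4 bytes instead of 1, so step values derived from qangle.step are shifted right
// by 2 extra bits wherever they are used.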
}
void cv::ocl::device::hog::compute_hists(int nbins,
@@ -1627,7 +1643,7 @@ void cv::ocl::device::hog::compute_hists(int nbins,
int blocks_total = img_block_width * img_block_height;
int grad_quadstep = grad.step >> 2;
int qangle_step = qangle.step;
int qangle_step = qangle.step >> qangle_step_shift;
int blocks_in_group = 4;
size_t localThreads[3] = { blocks_in_group * 24, 2, 1 };
@@ -1892,7 +1908,7 @@ void cv::ocl::device::hog::compute_gradients_8UC1(int height, int width,
char correctGamma = (correct_gamma) ? 1 : 0;
int img_step = img.step;
int grad_quadstep = grad.step >> 3;
int qangle_step = qangle.step >> 1;
int qangle_step = qangle.step >> (1 + qangle_step_shift);
args.push_back( std::make_pair( sizeof(cl_int), (void *)&height));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&width));
@@ -1927,7 +1943,7 @@ void cv::ocl::device::hog::compute_gradients_8UC4(int height, int width,
char correctGamma = (correct_gamma) ? 1 : 0;
int img_step = img.step >> 2;
int grad_quadstep = grad.step >> 3;
int qangle_step = qangle.step >> 1;
int qangle_step = qangle.step >> (1 + qangle_step_shift);
args.push_back( std::make_pair( sizeof(cl_int), (void *)&height));
args.push_back( std::make_pair( sizeof(cl_int), (void *)&width));

View File

@@ -1035,67 +1035,117 @@ namespace cv
else
scale = 1. / scale;
if (ksize > 0)
const int sobel_lsz = 16;
if((src.type() == CV_8UC1 || src.type() == CV_32FC1) &&
(ksize==3 || ksize==5 || ksize==7 || ksize==-1) &&
src.wholerows > sobel_lsz + (ksize>>1) &&
src.wholecols > sobel_lsz + (ksize>>1))
{
Context* clCxt = Context::getContext();
if(clCxt->supportsFeature(FEATURE_CL_INTEL_DEVICE) && src.type() == CV_8UC1 &&
src.cols % 8 == 0 && src.rows % 8 == 0 &&
ksize==3 &&
(borderType ==cv::BORDER_REFLECT ||
borderType == cv::BORDER_REPLICATE ||
borderType ==cv::BORDER_REFLECT101 ||
borderType ==cv::BORDER_WRAP))
Dx.create(src.size(), CV_32FC1);
Dy.create(src.size(), CV_32FC1);
CV_Assert(Dx.rows == Dy.rows && Dx.cols == Dy.cols);
size_t lt2[3] = {sobel_lsz, sobel_lsz, 1};
size_t gt2[3] = {lt2[0]*(1 + (src.cols-1) / lt2[0]), lt2[1]*(1 + (src.rows-1) / lt2[1]), 1};
unsigned int src_pitch = src.step;
unsigned int Dx_pitch = Dx.step;
unsigned int Dy_pitch = Dy.step;
int src_offset_x = (src.offset % src.step) / src.elemSize();
int src_offset_y = src.offset / src.step;
float _scale = scale;
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_x ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src_offset_y ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.offset ));
args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dx_pitch ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dy.offset ));
args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&Dy_pitch ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholecols ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.wholerows ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.cols ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&Dx.rows ));
args.push_back( std::make_pair( sizeof(cl_float), (void *)&_scale ));
String option = cv::format("-D BLK_X=%d -D BLK_Y=%d",(int)lt2[0],(int)lt2[1]);
switch(src.type())
{
Dx.create(src.size(), CV_32FC1);
Dy.create(src.size(), CV_32FC1);
const unsigned int block_x = 8;
const unsigned int block_y = 8;
unsigned int src_pitch = src.step;
unsigned int dst_pitch = Dx.cols;
float _scale = scale;
std::vector<std::pair<size_t , const void *> > args;
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&src.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dx.data ));
args.push_back( std::make_pair( sizeof(cl_mem) , (void *)&Dy.data ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.cols ));
args.push_back( std::make_pair( sizeof(cl_int) , (void *)&src.rows ));
args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&src_pitch ));
args.push_back( std::make_pair( sizeof(cl_uint) , (void *)&dst_pitch ));
args.push_back( std::make_pair( sizeof(cl_float) , (void *)&_scale ));
size_t gt2[3] = {src.cols, src.rows, 1}, lt2[3] = {block_x, block_y, 1};
String option = "-D BLK_X=8 -D BLK_Y=8";
switch(borderType)
{
case cv::BORDER_REPLICATE:
option += " -D BORDER_REPLICATE";
break;
case cv::BORDER_REFLECT:
option += " -D BORDER_REFLECT";
break;
case cv::BORDER_REFLECT101:
option += " -D BORDER_REFLECT101";
break;
case cv::BORDER_WRAP:
option += " -D BORDER_WRAP";
break;
}
openCLExecuteKernel(src.clCxt, &imgproc_sobel3, "sobel3", gt2, lt2, args, -1, -1, option.c_str() );
case CV_8UC1:
option += " -D SRCTYPE=uchar";
break;
case CV_32FC1:
option += " -D SRCTYPE=float";
break;
}
else
switch(borderType)
{
case cv::BORDER_CONSTANT:
option += " -D BORDER_CONSTANT";
break;
case cv::BORDER_REPLICATE:
option += " -D BORDER_REPLICATE";
break;
case cv::BORDER_REFLECT:
option += " -D BORDER_REFLECT";
break;
case cv::BORDER_REFLECT101:
option += " -D BORDER_REFLECT_101";
break;
case cv::BORDER_WRAP:
option += " -D BORDER_WRAP";
break;
default:
CV_Error(CV_StsBadFlag, "BORDER type is not supported!");
break;
}
String kernel_name;
switch(ksize)
{
case -1:
option += " -D SCHARR";
kernel_name = "sobel3";
break;
case 3:
kernel_name = "sobel3";
break;
case 5:
kernel_name = "sobel5";
break;
case 7:
kernel_name = "sobel7";
break;
default:
CV_Error(CV_StsBadFlag, "Kernel size is not supported!");
break;
}
openCLExecuteKernel(src.clCxt, &imgproc_sobel3, kernel_name, gt2, lt2, args, -1, -1, option.c_str() );
}
else
{
if (ksize > 0)
{
Sobel(src, Dx, CV_32F, 1, 0, ksize, scale, 0, borderType);
Sobel(src, Dy, CV_32F, 0, 1, ksize, scale, 0, borderType);
}
}
else
{
Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType);
Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType);
else
{
Scharr(src, Dx, CV_32F, 1, 0, scale, 0, borderType);
Scharr(src, Dy, CV_32F, 0, 1, scale, 0, borderType);
}
}
CV_Assert(Dx.offset == 0 && Dy.offset == 0);
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,185 @@
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2013, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef BORDER_CONSTANT
//CCCCCC|abcdefgh|CCCCCCC
#define EXTRAPOLATE(x, maxV)
#elif defined BORDER_REPLICATE
//aaaaaa|abcdefgh|hhhhhhh
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = max(min((x), (maxV) - 1), 0); \
}
#elif defined BORDER_WRAP
//cdefgh|abcdefgh|abcdefg
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = ( (x) + (maxV) ) % (maxV); \
}
#elif defined BORDER_REFLECT
//fedcba|abcdefgh|hgfedcb
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
}
#elif defined BORDER_REFLECT_101
//gfedcb|abcdefgh|gfedcba
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
}
#else
#error No extrapolation method
#endif
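// For example, with maxV = 8 (pixels a..h), BORDER_REFLECT_101 maps x = -1 to index 1 ('b')
// and x = 8 to index 6 ('g'), matching the gfedcb|abcdefgh|gfedcba diagram above.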
#define SRC(_x,_y) CONVERT_SRCTYPE(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x])
#ifdef BORDER_CONSTANT
//CCCCCC|abcdefgh|CCCCCCC
#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
#else
#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
#endif
#define DST(_x,_y) (((global DSTTYPE*)(Dst+DstOffset+(_y)*DstPitch))[_x])
//horizontal and vertical filter kernels
//should be defined on host during compile time to avoid overhead
__constant uint mat_kernelX[] = {KERNEL_MATRIX_X};
__constant uint mat_kernelY[] = {KERNEL_MATRIX_Y};
__kernel __attribute__((reqd_work_group_size(BLK_X,BLK_Y,1))) void sep_filter_singlepass
(
__global uchar* Src,
const uint SrcPitch,
const int srcOffsetX,
const int srcOffsetY,
__global uchar* Dst,
const int DstOffset,
const uint DstPitch,
int width,
int height,
int dstWidth,
int dstHeight
)
{
//RADIUSX, RADIUSY are the filter radii
//BLK_X, BLK_Y are the local workgroup sizes
//all of these should be defined on the host at compile time
//the first array, lsmem, holds source pixels used in the first pass;
//the second, lsmemDy, stores the first-pass results
__local WORKTYPE lsmem[BLK_Y+2*RADIUSY][BLK_X+2*RADIUSX];
__local WORKTYPE lsmemDy[BLK_Y][BLK_X+2*RADIUSX];
//get local and global ids - used as image and local memory array indexes
int lix = get_local_id(0);
int liy = get_local_id(1);
int x = (int)get_global_id(0);
int y = (int)get_global_id(1);
//calculate pixel position in source image taking image offset into account
int srcX = x + srcOffsetX - RADIUSX;
int srcY = y + srcOffsetY - RADIUSY;
int xb = srcX;
int yb = srcY;
//extrapolate coordinates if needed
//and read this work item's source pixels into local memory,
//including the extra border pixels, which are read by the first work items
int clocY = liy;
int cSrcY = srcY;
do
{
int yb = cSrcY;
EXTRAPOLATE(yb, (height));
int clocX = lix;
int cSrcX = srcX;
do
{
int xb = cSrcX;
EXTRAPOLATE(xb,(width));
lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );
clocX += BLK_X;
cSrcX += BLK_X;
}
while(clocX < BLK_X+(RADIUSX*2));
clocY += BLK_Y;
cSrcY += BLK_Y;
}
while(clocY < BLK_Y+(RADIUSY*2));
barrier(CLK_LOCAL_MEM_FENCE);
//do vertical filter pass
//and store intermediate results to second local memory array
int i;
WORKTYPE sum = 0.0f;
int clocX = lix;
do
{
sum = 0.0f;
for(i=0; i<=2*RADIUSY; i++)
sum = mad(lsmem[liy+i][clocX], as_float(mat_kernelY[i]), sum);
lsmemDy[liy][clocX] = sum;
clocX += BLK_X;
}
while(clocX < BLK_X+(RADIUSX*2));
barrier(CLK_LOCAL_MEM_FENCE);
//if this pixel happened to be out of image borders because of global size rounding,
//then just return
if( x >= dstWidth || y >=dstHeight ) return;
//do second horizontal filter pass
//and calculate final result
sum = 0.0f;
for(i=0; i<=2*RADIUSX; i++)
sum = mad(lsmemDy[liy][lix+i], as_float(mat_kernelX[i]), sum);
//store result into destination image
DST(x,y) = CONVERT_DSTTYPE(sum);
}
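sep_filter_singlepass stages a (BLK_X+2*RADIUSX) x (BLK_Y+2*RADIUSY) tile in lsmem, applies the vertical 1-D kernel into lsmemDy, then applies the horizontal 1-D kernel to produce the output pixel. A scalar C++ reference of the same two-pass separable convolution, assuming float data and an input already padded by the filter radii (the kernel handles padding via EXTRAPOLATE instead), might look like this:

#include <vector>

// Separable 2-D convolution: vertical pass into a temporary, then horizontal pass.
// 'src' has (height + 2*radiusY) rows of 'srcStride' floats, padded by radiusX
// columns on each side, so no border handling is needed in this sketch.
void sepFilterRef(const std::vector<float>& src, int srcStride,
                  std::vector<float>& dst, int width, int height,
                  const std::vector<float>& kx, const std::vector<float>& ky,
                  int radiusX, int radiusY)
{
    const int tmpWidth = width + 2 * radiusX;          // like lsmemDy: keeps horizontal padding
    std::vector<float> tmp(tmpWidth * height);

    for (int y = 0; y < height; ++y)                   // vertical pass
        for (int x = 0; x < tmpWidth; ++x)
        {
            float sum = 0.f;
            for (int i = 0; i <= 2 * radiusY; ++i)
                sum += src[(y + i) * srcStride + x] * ky[i];
            tmp[y * tmpWidth + x] = sum;
        }

    dst.assign(width * height, 0.f);
    for (int y = 0; y < height; ++y)                   // horizontal pass
        for (int x = 0; x < width; ++x)
        {
            float sum = 0.f;
            for (int i = 0; i <= 2 * radiusX; ++i)
                sum += tmp[y * tmpWidth + x + i] * kx[i];
            dst[y * width + x] = sum;
        }
}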

View File

@@ -126,13 +126,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
)
{
// this version uses information provided for each workgroup
// no empty WG
int gid = (int)get_group_id(0);
int lid_x = (int)get_local_id(0);
int lid_y = (int)get_local_id(1);
int lid = lid_y*LSx+lid_x;
int4 WGInfo = pWGInfo[gid];
int4 WGInfo = pWGInfo[WGSTART+gid];
int GroupX = (WGInfo.y >> 16)&0xFFFF;
int GroupY = (WGInfo.y >> 0 )& 0xFFFF;
int Width = (WGInfo.x >> 16)&0xFFFF;
@@ -140,8 +138,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int ImgOffset = WGInfo.z;
float ScaleFactor = as_float(WGInfo.w);
#define DATA_SIZE_X (LSx+WND_SIZE_X)
#define DATA_SIZE_Y (LSy+WND_SIZE_Y)
#define DATA_SIZE_X (PIXEL_STEP*LSx+WND_SIZE_X)
#define DATA_SIZE_Y (PIXEL_STEP*LSy+WND_SIZE_Y)
#define DATA_SIZE (DATA_SIZE_X*DATA_SIZE_Y)
local int SumL[DATA_SIZE];
@@ -165,9 +163,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int4 info1 = p;
int4 info2 = pq;
{
int xl = lid_x;
int yl = lid_y;
// calc processed ROI coordinate in local mem
int xl = lid_x*PIXEL_STEP;
int yl = lid_y*PIXEL_STEP;
{// calc variance_norm_factor for all stages
int OffsetLocal = yl * DATA_SIZE_X + xl;
int OffsetGlobal = (GroupY+yl)* pixelstep + (GroupX+xl);
@@ -194,13 +194,13 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int result = (1.0f>0.0f);
for(int stageloop = start_stage; (stageloop < end_stage) && result; stageloop++ )
{// iterate until candidate is exist
{// iterate until candidate is valid
float stage_sum = 0.0f;
__global GpuHidHaarStageClassifier* stageinfo = (__global GpuHidHaarStageClassifier*)
((__global uchar*)stagecascadeptr+stageloop*sizeof(GpuHidHaarStageClassifier));
int lcl_off = (yl*DATA_SIZE_X)+(xl);
int stagecount = stageinfo->count;
float stagethreshold = stageinfo->threshold;
int lcl_off = (lid_y*DATA_SIZE_X)+(lid_x);
for(int nodeloop = 0; nodeloop < stagecount; nodecounter++,nodeloop++ )
{
// simple macro to extract shorts from int
@@ -212,7 +212,7 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int4 n1 = pN[1];
int4 n2 = pN[2];
float nodethreshold = as_float(n2.y) * variance_norm_factor;
// calc sum of intensity pixels according to node information
// calc sum of intensity pixels according to classifier node information
float classsum =
(SumL[M0(n0.x)+lcl_off] - SumL[M1(n0.x)+lcl_off] - SumL[M0(n0.y)+lcl_off] + SumL[M1(n0.y)+lcl_off]) * as_float(n1.z) +
(SumL[M0(n0.z)+lcl_off] - SumL[M1(n0.z)+lcl_off] - SumL[M0(n0.w)+lcl_off] + SumL[M1(n0.w)+lcl_off]) * as_float(n1.w) +
@@ -228,8 +228,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int index = 1+atomic_inc((volatile global int*)candidate); //get index to write global data with face info
if(index<OUTPUTSZ)
{
int x = GroupX+lid_x;
int y = GroupY+lid_y;
int x = GroupX+xl;
int y = GroupY+yl;
int4 candidate_result;
candidate_result.x = convert_int_rtn(x*ScaleFactor);
candidate_result.y = convert_int_rtn(y*ScaleFactor);
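The PIXEL_STEP changes above make each work-item cover a PIXEL_STEP x PIXEL_STEP patch of candidate windows, so the cached integral-image tile in SumL grows accordingly. As a worked example (LSx, LSy and WND_SIZE_* are defined on the host; the values below are only assumptions):

// assumed LSx = LSy = 8, WND_SIZE_X = WND_SIZE_Y = 20
// PIXEL_STEP = 1: DATA_SIZE_X = 1*8 + 20 = 28,  DATA_SIZE = 28*28 =  784 ints in SumL
// PIXEL_STEP = 2: DATA_SIZE_X = 2*8 + 20 = 36,  DATA_SIZE = 36*36 = 1296 ints in SumL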

View File

@@ -46,33 +46,26 @@
#ifndef WITH_MASK
#define WITH_MASK 0
#endif
__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
inline float ELEM_INT2(image2d_t _eig, int _x, int _y)
{
return read_imagef(_eig, sampler, (int2)(_x, _y)).x;
}
inline float ELEM_FLT2(image2d_t _eig, float2 pt)
{
return read_imagef(_eig, sampler, pt).x;
}
//macro to read eigenvalue matrix
#define GET_SRC_32F(_x, _y) ((__global const float*)(eig + (_y)*eig_pitch))[_x]
__kernel
void findCorners
(
image2d_t eig,
__global const char * mask,
__global float2 * corners,
const int mask_strip,// in pixels
const float threshold,
const int rows,
const int cols,
const int max_count,
__global int * g_counter
__global const char* eig,
const int eig_pitch,
__global const char* mask,
__global float2* corners,
const int mask_strip,// in pixels
__global const float* pMinMax,
const float qualityLevel,
const int rows,
const int cols,
const int max_count,
__global int* g_counter
)
{
float threshold = qualityLevel*pMinMax[1];
const int j = get_global_id(0);
const int i = get_global_id(1);
@@ -82,39 +75,42 @@ __kernel
#endif
)
{
const float val = ELEM_INT2(eig, j, i);
const float val = GET_SRC_32F(j, i);
if (val > threshold)
{
float maxVal = val;
maxVal = fmax(GET_SRC_32F(j - 1, i - 1), maxVal);
maxVal = fmax(GET_SRC_32F(j , i - 1), maxVal);
maxVal = fmax(GET_SRC_32F(j + 1, i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal);
maxVal = fmax(GET_SRC_32F(j - 1, i), maxVal);
maxVal = fmax(GET_SRC_32F(j + 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal);
maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal);
maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal);
maxVal = fmax(GET_SRC_32F(j - 1, i + 1), maxVal);
maxVal = fmax(GET_SRC_32F(j , i + 1), maxVal);
maxVal = fmax(GET_SRC_32F(j + 1, i + 1), maxVal);
if (val == maxVal)
{
const int ind = atomic_inc(g_counter);
if (ind < max_count)
corners[ind] = (float2)(j, i);
{// pack and store eigenvalue and its coordinates
corners[ind].x = val;
corners[ind].y = as_float(j|(i<<16));
}
}
}
}
}
#undef GET_SRC_32F
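With this change findCorners no longer stores raw (x, y) coordinates: corners[ind].x carries the eigenvalue and corners[ind].y packs the two 16-bit coordinates, so the sort below can compare values without re-reading the eigenvalue image. A small host-side sketch of the same pack/unpack (plain C++, illustrative only):

#include <cstdint>
#include <cstring>
#include <cassert>

// Pack x into the low 16 bits and y into the high 16 bits, reinterpreted as float,
// matching "corners[ind].y = as_float(j|(i<<16))" in the kernel above.
static float packCoords(int x, int y)
{
    uint32_t bits = (uint32_t)(x & 0xFFFF) | ((uint32_t)(y & 0xFFFF) << 16);
    float f;
    std::memcpy(&f, &bits, sizeof(f));   // as_float equivalent
    return f;
}

static void unpackCoords(float f, int& x, int& y)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));
    x = (int)(bits & 0xFFFF);
    y = (int)(bits >> 16);
}

int main()
{
    int x = 0, y = 0;
    unpackCoords(packCoords(321, 123), x, y);
    assert(x == 321 && y == 123);
    return 0;
}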
//bitonic sort
__kernel
void sortCorners_bitonicSort
(
image2d_t eig,
__global float2 * corners,
const int count,
const int stage,
@@ -140,8 +136,8 @@ __kernel
const float2 leftPt = corners[leftId];
const float2 rightPt = corners[rightId];
const float leftVal = ELEM_FLT2(eig, leftPt);
const float rightVal = ELEM_FLT2(eig, rightPt);
const float leftVal = leftPt.x;
const float rightVal = rightPt.x;
const bool compareResult = leftVal > rightVal;
@@ -152,124 +148,22 @@ __kernel
corners[rightId] = sortOrder ? greater : lesser;
}
//selection sort for gfft
//kernel is ported from Bolt library:
//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl
// Local sort will firstly sort elements of each workgroup using selection sort
// its performance is O(n)
__kernel
void sortCorners_selectionSortLocal
(
image2d_t eig,
__global float2 * corners,
const int count,
__local float2 * scratch
)
// this is a simple, short serial kernel that does some final reduction and initialization work
// it performs this HOST-like work on the device to avoid an extra sync with the HOST for such a small step
// data - input/output buffer of floats
// input: several per-group (min, max) pairs
// output: one reduced (min, max) pair in data[0], data[1]
// g_counter - corner counter, reset here to 0 for the next findCorners call
__kernel void arithm_op_minMax_final(__global float * data, int groupnum,__global int * g_counter)
{
int i = get_local_id(0); // index in workgroup
int numOfGroups = get_num_groups(0); // index in workgroup
int groupID = get_group_id(0);
int wg = get_local_size(0); // workgroup size = block size
int n; // number of elements to be processed for this work group
int offset = groupID * wg;
int same = 0;
corners += offset;
n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg;
float2 pt1, pt2;
pt1 = corners[min(i, n)];
scratch[i] = pt1;
barrier(CLK_LOCAL_MEM_FENCE);
if(i >= n)
g_counter[0] = 0;
float minVal = data[0];
float maxVal = data[groupnum];
for(int i=1;i<groupnum;++i)
{
return;
minVal = min(minVal,data[i]);
maxVal = max(maxVal,data[i+groupnum]);
}
float val1 = ELEM_FLT2(eig, pt1);
float val2;
int pos = 0;
for (int j=0;j<n;++j)
{
pt2 = scratch[j];
val2 = ELEM_FLT2(eig, pt2);
if(val2 > val1)
pos++;//calculate the rank of this element in this work group
else
{
if(val1 > val2)
continue;
else
{
// val1 and val2 are same
same++;
}
}
}
for (int j=0; j< same; j++)
corners[pos + j] = pt1;
}
__kernel
void sortCorners_selectionSortFinal
(
image2d_t eig,
__global float2 * corners,
const int count
)
{
const int i = get_local_id(0); // index in workgroup
const int numOfGroups = get_num_groups(0); // index in workgroup
const int groupID = get_group_id(0);
const int wg = get_local_size(0); // workgroup size = block size
int pos = 0, same = 0;
const int offset = get_group_id(0) * wg;
const int remainder = count - wg*(numOfGroups-1);
if((offset + i ) >= count)
return;
float2 pt1, pt2;
pt1 = corners[groupID*wg + i];
float val1 = ELEM_FLT2(eig, pt1);
float val2;
for(int j=0; j<numOfGroups-1; j++ )
{
for(int k=0; k<wg; k++)
{
pt2 = corners[j*wg + k];
val2 = ELEM_FLT2(eig, pt2);
if(val1 > val2)
break;
else
{
//Increment only if the value is not the same.
if( val2 > val1 )
pos++;
else
same++;
}
}
}
for(int k=0; k<remainder; k++)
{
pt2 = corners[(numOfGroups-1)*wg + k];
val2 = ELEM_FLT2(eig, pt2);
if(val1 > val2)
break;
else
{
//Don't increment if the value is the same.
//Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false
if(val2 > val1)
pos++;
else
same++;
}
}
for (int j=0; j< same; j++)
corners[pos + j] = pt1;
}
data[0] = minVal;
data[1] = maxVal;
}
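arithm_op_minMax_final simply folds the per-group partial results (mins in data[0..groupnum-1], maxes in data[groupnum..2*groupnum-1]) into data[0]/data[1] and zeroes the corner counter, so the host never has to read the partials back; findCorners then derives its threshold as qualityLevel*pMinMax[1]. An equivalent host-side fold (C++ sketch, illustrative):

#include <algorithm>
#include <vector>

// data layout matches the kernel: first 'groupnum' mins, then 'groupnum' maxes.
void minMaxFinalRef(std::vector<float>& data, int groupnum, int& g_counter)
{
    g_counter = 0;                       // reset counter for the next findCorners launch
    float minVal = data[0];
    float maxVal = data[groupnum];
    for (int i = 1; i < groupnum; ++i)
    {
        minVal = std::min(minVal, data[i]);
        maxVal = std::max(maxVal, data[i + groupnum]);
    }
    data[0] = minVal;
    data[1] = maxVal;
}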

View File

@@ -1,45 +1,97 @@
///////////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////Macro for border type////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////
#ifdef BORDER_REPLICATE
//BORDER_REPLICATE: aaaaaa|abcdefgh|hhhhhhh
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (l_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (r_edge)-1 : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (t_edge) :(i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (b_edge)-1 :(addr))
#ifdef BORDER_CONSTANT
//CCCCCC|abcdefgh|CCCCCCC
#define EXTRAPOLATE(x, maxV)
#elif defined BORDER_REPLICATE
//aaaaaa|abcdefgh|hhhhhhh
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = max(min((x), (maxV) - 1), 0); \
}
#elif defined BORDER_WRAP
//cdefgh|abcdefgh|abcdefg
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = ( (x) + (maxV) ) % (maxV); \
}
#elif defined BORDER_REFLECT
//fedcba|abcdefgh|hgfedcb
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = min( mad24((maxV)-1,2,-(x))+1 , max((x),-(x)-1) ); \
}
#elif defined BORDER_REFLECT_101
//gfedcb|abcdefgh|gfedcba
#define EXTRAPOLATE(x, maxV) \
{ \
(x) = min( mad24((maxV)-1,2,-(x)), max((x),-(x)) ); \
}
#else
#error No extrapolation method
#endif
#ifdef BORDER_REFLECT
//BORDER_REFLECT: fedcba|abcdefgh|hgfedcb
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i)-1 : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-1+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i)-1 : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-1+((b_edge)<<1) : (addr))
#define SRC(_x,_y) convert_float(((global SRCTYPE*)(Src+(_y)*SrcPitch))[_x])
#ifdef BORDER_CONSTANT
//CCCCCC|abcdefgh|CCCCCCC
#define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
#else
#define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
#endif
#ifdef BORDER_REFLECT101
//BORDER_REFLECT101: gfedcb|abcdefgh|gfedcba
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? -(i) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? -(i)-2+((r_edge)<<1) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? -(i) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? -(i)-2+((b_edge)<<1) : (addr))
#endif
#define DSTX(_x,_y) (((global float*)(DstX+DstXOffset+(_y)*DstXPitch))[_x])
#define DSTY(_x,_y) (((global float*)(DstY+DstYOffset+(_y)*DstYPitch))[_x])
#ifdef BORDER_WRAP
//BORDER_WRAP: cdefgh|abcdefgh|abcdefg
#define ADDR_L(i, l_edge, r_edge) ((i) < (l_edge) ? (i)+(r_edge) : (i))
#define ADDR_R(i, r_edge, addr) ((i) >= (r_edge) ? (i)-(r_edge) : (addr))
#define ADDR_H(i, t_edge, b_edge) ((i) < (t_edge) ? (i)+(b_edge) : (i))
#define ADDR_B(i, b_edge, addr) ((i) >= (b_edge) ? (i)-(b_edge) : (addr))
#endif
#define INIT_AND_READ_LOCAL_SOURCE(width, height, fill_const, kernel_border) \
int srcX = x + srcOffsetX - (kernel_border); \
int srcY = y + srcOffsetY - (kernel_border); \
int xb = srcX; \
int yb = srcY; \
\
EXTRAPOLATE(xb, (width)); \
EXTRAPOLATE(yb, (height)); \
lsmem[liy][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \
\
if(lix < ((kernel_border)*2)) \
{ \
int xb = srcX+BLK_X; \
EXTRAPOLATE(xb,(width)); \
lsmem[liy][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \
} \
if(liy< ((kernel_border)*2)) \
{ \
int yb = srcY+BLK_Y; \
EXTRAPOLATE(yb, (height)); \
lsmem[liy+BLK_Y][lix] = ELEM(xb, yb, (width), (height), (fill_const) ); \
} \
if(lix<((kernel_border)*2) && liy<((kernel_border)*2)) \
{ \
int xb = srcX+BLK_X; \
int yb = srcY+BLK_Y; \
EXTRAPOLATE(xb,(width)); \
EXTRAPOLATE(yb,(height)); \
lsmem[liy+BLK_Y][lix+BLK_X] = ELEM(xb, yb, (width), (height), (fill_const) ); \
}
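In INIT_AND_READ_LOCAL_SOURCE every work-item loads its own pixel, and the first 2*kernel_border columns/rows of the workgroup additionally load the right/bottom apron, so the local tile ends up (BLK_X + 2*kernel_border) x (BLK_Y + 2*kernel_border), matching the lsmem declarations in the kernels below. With the assumed launch size below this gives, for example:

// assumed BLK_X = BLK_Y = 16
// sobel3 (kernel_border = 1): tile 18 x 18 = 324 floats
// sobel5 (kernel_border = 2): tile 20 x 20 = 400 floats
// sobel7 (kernel_border = 3): tile 22 x 22 = 484 floats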
__kernel void sobel3(
__global uchar* Src,
__global float* DstX,
__global float* DstY,
int width, int height,
uint srcStride, uint dstStride,
float scale
const uint SrcPitch,
const int srcOffsetX,
const int srcOffsetY,
__global uchar* DstX,
const int DstXOffset,
const uint DstXPitch,
__global uchar* DstY,
const int DstYOffset,
const uint DstYPitch,
int width,
int height,
int dstWidth,
int dstHeight,
float scale
)
{
__local float lsmem[BLK_Y+2][BLK_X+2];
@@ -47,62 +99,249 @@ __kernel void sobel3(
int lix = get_local_id(0);
int liy = get_local_id(1);
int gix = get_group_id(0);
int giy = get_group_id(1);
int id_x = get_global_id(0);
int id_y = get_global_id(1);
lsmem[liy+1][lix+1] = convert_float(Src[ id_y * srcStride + id_x ]);
int id_y_h = ADDR_H(id_y-1, 0,height);
int id_y_b = ADDR_B(id_y+1, height,id_y+1);
int id_x_l = ADDR_L(id_x-1, 0,width);
int id_x_r = ADDR_R(id_x+1, width,id_x+1);
if(liy==0)
{
lsmem[0][lix+1]=convert_float(Src[ id_y_h * srcStride + id_x ]);
if(lix==0)
lsmem[0][0]=convert_float(Src[ id_y_h * srcStride + id_x_l ]);
else if(lix==BLK_X-1)
lsmem[0][BLK_X+1]=convert_float(Src[ id_y_h * srcStride + id_x_r ]);
}
else if(liy==BLK_Y-1)
{
lsmem[BLK_Y+1][lix+1]=convert_float(Src[ id_y_b * srcStride + id_x ]);
if(lix==0)
lsmem[BLK_Y+1][0]=convert_float(Src[ id_y_b * srcStride + id_x_l ]);
else if(lix==BLK_X-1)
lsmem[BLK_Y+1][BLK_X+1]=convert_float(Src[ id_y_b * srcStride + id_x_r ]);
}
if(lix==0)
lsmem[liy+1][0] = convert_float(Src[ id_y * srcStride + id_x_l ]);
else if(lix==BLK_X-1)
lsmem[liy+1][BLK_X+1] = convert_float(Src[ id_y * srcStride + id_x_r ]);
int x = (int)get_global_id(0);
int y = (int)get_global_id(1);
INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 1)
barrier(CLK_LOCAL_MEM_FENCE);
if( x >= dstWidth || y >=dstHeight ) return;
float u1 = lsmem[liy][lix];
float u2 = lsmem[liy][lix+1];
float u3 = lsmem[liy][lix+2];
float m1 = lsmem[liy+1][lix];
float m2 = lsmem[liy+1][lix+1];
float m3 = lsmem[liy+1][lix+2];
float b1 = lsmem[liy+2][lix];
float b2 = lsmem[liy+2][lix+1];
float b3 = lsmem[liy+2][lix+2];
//m2 * scale;//
float dx = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1 );
DstX[ id_y * dstStride + id_x ] = dx * scale;
//calc and store dx and dy;//
#ifdef SCHARR
DSTX(x,y) = mad(10.0f, m3 - m1, 3.0f * (u3 - u1 + b3 - b1)) * scale;
DSTY(x,y) = mad(10.0f, b2 - u2, 3.0f * (b1 - u1 + b3 - u3)) * scale;
#else
DSTX(x,y) = mad(2.0f, m3 - m1, u3 - u1 + b3 - b1) * scale;
DSTY(x,y) = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3) * scale;
#endif
}
float dy = mad(2.0f, b2 - u2, b1 - u1 + b3 - u3);
DstY[ id_y * dstStride + id_x ] = dy * scale;
}
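The #ifdef SCHARR branch swaps the 3x3 Sobel derivative weights for Scharr weights; written out as matrices (dx shown, dy is the transpose), the mad() chains above compute:

// Sobel dx:              Scharr dx:
//  [ -1  0  +1 ]          [  -3  0  +3 ]
//  [ -2  0  +2 ]          [ -10  0 +10 ]
//  [ -1  0  +1 ]          [  -3  0  +3 ]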
__kernel void sobel5(
__global uchar* Src,
const uint SrcPitch,
const int srcOffsetX,
const int srcOffsetY,
__global uchar* DstX,
const int DstXOffset,
const uint DstXPitch,
__global uchar* DstY,
const int DstYOffset,
const uint DstYPitch,
int width,
int height,
int dstWidth,
int dstHeight,
float scale
)
{
__local float lsmem[BLK_Y+4][BLK_X+4];
int lix = get_local_id(0);
int liy = get_local_id(1);
int x = (int)get_global_id(0);
int y = (int)get_global_id(1);
INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 2)
barrier(CLK_LOCAL_MEM_FENCE);
if( x >= dstWidth || y >=dstHeight ) return;
float t1 = lsmem[liy][lix];
float t2 = lsmem[liy][lix+1];
float t3 = lsmem[liy][lix+2];
float t4 = lsmem[liy][lix+3];
float t5 = lsmem[liy][lix+4];
float u1 = lsmem[liy+1][lix];
float u2 = lsmem[liy+1][lix+1];
float u3 = lsmem[liy+1][lix+2];
float u4 = lsmem[liy+1][lix+3];
float u5 = lsmem[liy+1][lix+4];
float m1 = lsmem[liy+2][lix];
float m2 = lsmem[liy+2][lix+1];
float m4 = lsmem[liy+2][lix+3];
float m5 = lsmem[liy+2][lix+4];
float l1 = lsmem[liy+3][lix];
float l2 = lsmem[liy+3][lix+1];
float l3 = lsmem[liy+3][lix+2];
float l4 = lsmem[liy+3][lix+3];
float l5 = lsmem[liy+3][lix+4];
float b1 = lsmem[liy+4][lix];
float b2 = lsmem[liy+4][lix+1];
float b3 = lsmem[liy+4][lix+2];
float b4 = lsmem[liy+4][lix+3];
float b5 = lsmem[liy+4][lix+4];
//calc and store dx and dy;//
DSTX(x,y) = scale *
mad(12.0f, m4 - m2,
mad(6.0f, m5 - m1,
mad(8.0f, u4 - u2 + l4 - l2,
mad(4.0f, u5 - u1 + l5 - l1,
mad(2.0f, t4 - t2 + b4 - b2, t5 - t1 + b5 - b1 )
)
)
)
);
DSTY(x,y) = scale *
mad(12.0f, l3 - u3,
mad(6.0f, b3 - t3,
mad(8.0f, l2 - u2 + l4 - u4,
mad(4.0f, b2 - t2 + b4 - t4,
mad(2.0f, l1 - u1 + l5 - u5, b1 - t1 + b5 - t5 )
)
)
)
);
}
__kernel void sobel7(
__global uchar* Src,
const uint SrcPitch,
const int srcOffsetX,
const int srcOffsetY,
__global uchar* DstX,
const int DstXOffset,
const uint DstXPitch,
__global uchar* DstY,
const int DstYOffset,
const uint DstYPitch,
int width,
int height,
int dstWidth,
int dstHeight,
float scale
)
{
__local float lsmem[BLK_Y+6][BLK_X+6];
int lix = get_local_id(0);
int liy = get_local_id(1);
int x = (int)get_global_id(0);
int y = (int)get_global_id(1);
INIT_AND_READ_LOCAL_SOURCE(width, height, 0, 3)
barrier(CLK_LOCAL_MEM_FENCE);
if( x >= dstWidth || y >=dstHeight ) return;
float tt1 = lsmem[liy][lix];
float tt2 = lsmem[liy][lix+1];
float tt3 = lsmem[liy][lix+2];
float tt4 = lsmem[liy][lix+3];
float tt5 = lsmem[liy][lix+4];
float tt6 = lsmem[liy][lix+5];
float tt7 = lsmem[liy][lix+6];
float t1 = lsmem[liy+1][lix];
float t2 = lsmem[liy+1][lix+1];
float t3 = lsmem[liy+1][lix+2];
float t4 = lsmem[liy+1][lix+3];
float t5 = lsmem[liy+1][lix+4];
float t6 = lsmem[liy+1][lix+5];
float t7 = lsmem[liy+1][lix+6];
float u1 = lsmem[liy+2][lix];
float u2 = lsmem[liy+2][lix+1];
float u3 = lsmem[liy+2][lix+2];
float u4 = lsmem[liy+2][lix+3];
float u5 = lsmem[liy+2][lix+4];
float u6 = lsmem[liy+2][lix+5];
float u7 = lsmem[liy+2][lix+6];
float m1 = lsmem[liy+3][lix];
float m2 = lsmem[liy+3][lix+1];
float m3 = lsmem[liy+3][lix+2];
float m5 = lsmem[liy+3][lix+4];
float m6 = lsmem[liy+3][lix+5];
float m7 = lsmem[liy+3][lix+6];
float l1 = lsmem[liy+4][lix];
float l2 = lsmem[liy+4][lix+1];
float l3 = lsmem[liy+4][lix+2];
float l4 = lsmem[liy+4][lix+3];
float l5 = lsmem[liy+4][lix+4];
float l6 = lsmem[liy+4][lix+5];
float l7 = lsmem[liy+4][lix+6];
float b1 = lsmem[liy+5][lix];
float b2 = lsmem[liy+5][lix+1];
float b3 = lsmem[liy+5][lix+2];
float b4 = lsmem[liy+5][lix+3];
float b5 = lsmem[liy+5][lix+4];
float b6 = lsmem[liy+5][lix+5];
float b7 = lsmem[liy+5][lix+6];
float bb1 = lsmem[liy+6][lix];
float bb2 = lsmem[liy+6][lix+1];
float bb3 = lsmem[liy+6][lix+2];
float bb4 = lsmem[liy+6][lix+3];
float bb5 = lsmem[liy+6][lix+4];
float bb6 = lsmem[liy+6][lix+5];
float bb7 = lsmem[liy+6][lix+6];
//calc and store dx and dy
DSTX(x,y) = scale *
mad(100.0f, m5 - m3,
mad(80.0f, m6 - m2,
mad(20.0f, m7 - m1,
mad(75.0f, u5 - u3 + l5 - l3,
mad(60.0f, u6 - u2 + l6 - l2,
mad(15.0f, u7 - u1 + l7 - l1,
mad(30.0f, t5 - t3 + b5 - b3,
mad(24.0f, t6 - t2 + b6 - b2,
mad(6.0f, t7 - t1 + b7 - b1,
mad(5.0f, tt5 - tt3 + bb5 - bb3,
mad(4.0f, tt6 - tt2 + bb6 - bb2, tt7 - tt1 + bb7 - bb1 )
)
)
)
)
)
)
)
)
)
);
DSTY(x,y) = scale *
mad(100.0f, l4 - u4,
mad(80.0f, b4 - t4,
mad(20.0f, bb4 - tt4,
mad(75.0f, l5 - u5 + l3 - u3,
mad(60.0f, b5 - t5 + b3 - t3,
mad(15.0f, bb5 - tt5 + bb3 - tt3,
mad(30.0f, l6 - u6 + l2 - u2,
mad(24.0f, b6 - t6 + b2 - t2,
mad(6.0f, bb6 - tt6 + bb2 - tt2,
mad(5.0f, l7 - u7 + l1 - u1,
mad(4.0f, b7 - t7 + b1 - t1, bb7 - tt7 + bb1 - tt1 )
)
)
)
)
)
)
)
)
)
);
}

View File

@@ -50,6 +50,14 @@
#define NTHREADS 256
#define CV_PI_F 3.1415926535897932384626433832795f
#ifdef INTEL_DEVICE
#define QANGLE_TYPE int
#define QANGLE_TYPE2 int2
#else
#define QANGLE_TYPE uchar
#define QANGLE_TYPE2 uchar2
#endif
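The quantized-angle (bin index) buffer is stored as uchar by default but widened to int when the kernels are built for an Intel OpenCL device, presumably for performance there; whatever host code allocates the qangle buffer has to pick the matching element size. A hypothetical helper (not the actual hog.cpp code) that keeps the allocation in sync with this macro:

#include <cstddef>

// Hypothetical: the qangle buffer stores two bin indices per pixel, as uchar
// normally and as int when the kernels are built with -D INTEL_DEVICE.
inline size_t qangleRowBytes(int width, bool builtForIntelDevice)
{
    const size_t elemSize = builtForIntelDevice ? sizeof(int) : sizeof(unsigned char);
    return (size_t)width * 2 /* two bins per pixel */ * elemSize;
}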
//----------------------------------------------------------------------------
// Histogram computation
// 12 threads for a cell, 12x4 threads per block
@@ -59,7 +67,7 @@ __kernel void compute_hists_lut_kernel(
const int cnbins, const int cblock_hist_size, const int img_block_width,
const int blocks_in_group, const int blocks_total,
const int grad_quadstep, const int qangle_step,
__global const float* grad, __global const uchar* qangle,
__global const float* grad, __global const QANGLE_TYPE* qangle,
__global const float* gauss_w_lut,
__global float* block_hists, __local float* smem)
{
@@ -86,7 +94,7 @@ __kernel void compute_hists_lut_kernel(
__global const float* grad_ptr = (gid < blocks_total) ?
grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
__global const uchar* qangle_ptr = (gid < blocks_total) ?
__global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?
qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
__local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
@@ -101,7 +109,7 @@ __kernel void compute_hists_lut_kernel(
for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
{
float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
uchar2 bin = (uchar2) (qangle_ptr[0], qangle_ptr[1]);
QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);
grad_ptr += grad_quadstep;
qangle_ptr += qangle_step;
@@ -558,7 +566,7 @@ __kernel void extract_descrs_by_cols_kernel(
__kernel void compute_gradients_8UC4_kernel(
const int height, const int width,
const int img_step, const int grad_quadstep, const int qangle_step,
const __global uchar4 * img, __global float * grad, __global uchar * qangle,
const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);
@@ -660,7 +668,7 @@ __kernel void compute_gradients_8UC4_kernel(
__kernel void compute_gradients_8UC1_kernel(
const int height, const int width,
const int img_step, const int grad_quadstep, const int qangle_step,
__global const uchar * img, __global float * grad, __global uchar * qangle,
__global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,
const float angle_scale, const char correct_gamma, const int cnbins)
{
const int x = get_global_id(0);

View File

@@ -116,7 +116,7 @@ Mat randomMat(RNG& rng, Size size, int type, double minVal, double maxVal, bool
Mat m(size0, type);
rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal));
rng.fill(m, RNG::UNIFORM, minVal, maxVal);
if( size0 == size )
return m;
return m(Rect((size0.width-size.width)/2, (size0.height-size.height)/2, size.width, size.height));
@@ -142,7 +142,7 @@ Mat randomMat(RNG& rng, const vector<int>& size, int type, double minVal, double
Mat m(dims, &size0[0], type);
rng.fill(m, RNG::UNIFORM, Scalar::all(minVal), Scalar::all(maxVal));
rng.fill(m, RNG::UNIFORM, minVal, maxVal);
if( eqsize )
return m;
return m(&r[0]);
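For context, a hypothetical use of the randomMat helper fixed above, assuming it is in scope; the parameter names and the reading of the trailing bool as "return a ROI view into a larger buffer" are assumptions drawn from the size0 != size branch, not taken from the test suite itself:

#include <opencv2/core/core.hpp>

void example()
{
    cv::RNG rng(0x12345);
    // Uniformly filled 64x48 CV_8UC3 matrices with values in [0, 256); with the
    // flag set, the result has the same size but is a view centered inside a
    // slightly larger allocation.
    cv::Mat plain = randomMat(rng, cv::Size(64, 48), CV_8UC3, 0, 256, false);
    cv::Mat roi   = randomMat(rng, cv::Size(64, 48), CV_8UC3, 0, 256, true);
}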