Fast deep-copy-transpose implemented which attempts to not thrash the cache. Good first step for now, but no transpose at all would be preferrable. Started I/O.

This commit is contained in:
hbristow 2013-07-12 10:38:48 +10:00
parent d126263983
commit 346f7d0f3e
6 changed files with 261 additions and 34 deletions

View File

@ -118,7 +118,7 @@ if (NOT MEX_WORKS)
${CMAKE_CURRENT_SOURCE_DIR}/test/test_compiler.cpp
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/junk
ERROR_VARIABLE MEX_ERROR
#OUTPUT_QUIET
OUTPUT_QUIET
)
if (MEX_ERROR)

View File

@ -2,6 +2,7 @@
#define OPENCV_MXARRAY_HPP_
#include "mex.h"
#include "transpose.hpp"
#include <vector>
#include <string>
#include <opencv2/core.hpp>
@ -496,39 +497,6 @@ cv::Mat MxArray::toMat<Matlab::InheritType>() const {
// MATRIX TRANSPOSE
// ----------------------------------------------------------------------------
template <typename InputScalar, typename OutputScalar>
void gemt(const char major, const size_t M, const size_t N, const InputScalar* a, size_t lda, OutputScalar* b, size_t ldb) {
switch (major) {
case 'R':
for (size_t m = 0; m < M; ++m) {
InputScalar const * arow = a + m*lda;
InputScalar const * const aend = arow + N;
OutputScalar * bcol = b + m;
while (arow < aend) {
*bcol = *arow;
arow++;
bcol+=ldb;
}
}
return;
case 'C':
for (size_t n = 0; n < N; ++n) {
InputScalar const * acol = a + n*lda;
InputScalar const * const aend = acol + M;
OutputScalar * brow = b + n;
while (acol < aend) {
*brow = *acol;
acol++;
brow+=ldb;
}
}
return;
default:
error(std::string("Unknown ordering given: ").append(std::string(1,major)));
}
}
template <typename InputScalar, typename OutputScalar>
void deepCopyAndTranspose(const cv::Mat& in, MxArray& out) {

View File

@ -0,0 +1,98 @@
#ifndef OPENCV_TRANSPOSE_HPP_
#define OPENCV_TRANSPOSE_HPP_
template <typename InputScalar, typename OutputScalar>
void transposeBlock(const size_t M, const size_t N, const InputScalar* src, size_t lda, OutputScalar* dst, size_t ldb) {
InputScalar cache[16];
// copy the source into the cache contiguously
for (size_t n = 0; n < N; ++n)
for (size_t m = 0; m < M; ++m)
cache[m+n*4] = src[m+n*lda];
// copy the destination out of the cache contiguously
for (size_t m = 0; m < M; ++m)
for (size_t n = 0; n < N; ++n)
dst[n+m*ldb] = cache[m+n*4];
}
template <typename InputScalar, typename OutputScalar>
void transpose4x4(const InputScalar* src, size_t lda, OutputScalar* dst, size_t ldb) {
InputScalar cache[16];
// copy the source into the cache contiguously
cache[0] = src[0]; cache[1] = src[1]; cache[2] = src[2]; cache[3] = src[3]; src+=lda;
cache[4] = src[0]; cache[5] = src[1]; cache[6] = src[2]; cache[7] = src[3]; src+=lda;
cache[8] = src[0]; cache[9] = src[1]; cache[10] = src[2]; cache[11] = src[3]; src+=lda;
cache[12] = src[0]; cache[13] = src[1]; cache[14] = src[2]; cache[15] = src[3]; src+=lda;
// copy the destination out of the contiguously
dst[0] = cache[0]; dst[1] = cache[4]; dst[2] = cache[8]; dst[3] = cache[12]; dst+=ldb;
dst[0] = cache[1]; dst[1] = cache[5]; dst[2] = cache[9]; dst[3] = cache[13]; dst+=ldb;
dst[0] = cache[2]; dst[1] = cache[6]; dst[2] = cache[10]; dst[3] = cache[14]; dst+=ldb;
dst[0] = cache[3]; dst[1] = cache[7]; dst[2] = cache[11]; dst[3] = cache[15]; dst+=ldb;
}
/*
* Vanilla copy, transpose and cast
*/
template <typename InputScalar, typename OutputScalar>
void gemt(const char major, const size_t M, const size_t N, const InputScalar* a, size_t lda, OutputScalar* b, size_t ldb) {
// 1x1 transpose is just copy
if (M == 1 && N == 1) { *b = *a; return; }
// get the interior 4x4 blocks, and the extra skirting
const size_t Fblock = (major == 'R') ? N/4 : M/4;
const size_t Frem = (major == 'R') ? N%4 : M%4;
const size_t Sblock = (major == 'R') ? M/4 : N/4;
const size_t Srem = (major == 'R') ? M%4 : N%4;
// if less than 4x4, invoke the block transpose immediately
if (M < 4 && N < 4) { transposeBlock(Frem, Srem, a, lda, b, ldb); return; }
// transpose 4x4 blocks
const InputScalar* aptr = a;
OutputScalar* bptr = b;
for (size_t second = 0; second < Sblock; ++second) {
aptr = a + second*lda;
bptr = b + second;
for (size_t first = 0; first < Fblock; ++first) {
transposeBlock(4, 4, aptr, lda, bptr, ldb);
//transpose4x4(aptr, lda, bptr, ldb);
aptr+=4;
bptr+=4*ldb;
}
// transpose trailing blocks on primary dimension
transposeBlock(Frem, 4, aptr, lda, bptr, ldb);
}
// transpose trailing blocks on secondary dimension
aptr = a + 4*Sblock*lda;
bptr = b + 4*Sblock;
for (size_t first = 0; first < Fblock; ++first) {
transposeBlock(4, Srem, aptr, lda, bptr, ldb);
aptr+=4;
bptr+=4*ldb;
}
// transpose bottom right-hand corner
transposeBlock(Frem, Srem, aptr, lda, bptr, ldb);
}
#ifdef __SSE2__
/*
* SSE2 supported fast copy, transpose and cast
*/
#include <emmintrin.h>
template <>
void transpose4x4<float, float>(const float* src, size_t lda, float* dst, size_t ldb) {
__m128 row0, row1, row2, row3;
row0 = _mm_loadu_ps(src);
row1 = _mm_loadu_ps(src+lda);
row2 = _mm_loadu_ps(src+2*lda);
row3 = _mm_loadu_ps(src+3*lda);
_MM_TRANSPOSE4_PS(row0, row1, row2, row3);
_mm_storeu_ps(dst, row0);
_mm_storeu_ps(dst+ldb, row1);
_mm_storeu_ps(dst+2*ldb, row2);
_mm_storeu_ps(dst+3*ldb, row3);
}
#endif
#endif

View File

@ -0,0 +1,48 @@
#ifndef OPENCV_FILEBUFFER_HPP_
#define OPENCV_FILEBUFFER_HPP_
#include <vector>
#include <streambuf>
class EndianFileBuffer : public std::streambuf {
private:
const int fd_;
const size_t put_back_;
std::vector<char> buffer_;
// prevent copy construction
EndianFileBuffer(const EndianFileBuffer&);
EndianFileBuffer& operator=(const EndianFileBuffer&);
public:
explicit EndianFileBuffer(int fd, size_t buffer_sz, size_t put_back) :
fd_(fd), put_back_(max(put_back, 1)), buffer_(max(buffer_sz, put_back_) + put_back_) {
char *end = &buffer_.front() + buffer_.size();
setg(end, end, end);
}
std::streambuf::int_type underflow() {
if (gptr() < egptr()) // buffer not exhausted
return traits_type::to_int_type(*gptr());
char *base = &buffer_.front();
char *start = base;
if (eback() == base) { // true when this isn't the first fill
std::memmove(base, egptr() - put_back_, put_back_);
start += put_back_;
}
// start is now the start of the buffer
// refill from the file
read(fd_, start, buffer_.size() - (start - base));
if (n == 0) return traits_type::eof();
// set buffer pointers
setg(base, start, start + n);
return traits_type::to_int_type(*gptr());
}
};
#endif

View File

@ -0,0 +1,32 @@
#include <ctime>
#include <stringstream>
const char* day[] = { "Sun", "Mon", "Tue", "Wed", "Thurs", "Fri", "Sat" };
const char* month[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
const char* arch = "${MEX_ARCH}"
std::string formatCurrentTime() {
ostringstream oss;
time_t rawtime;
struct tm* timeinfo;
int dom, hour, min, sec, year;
// compute the current time
time(&rawtime);
timeinfo = localtime(&rawtime);
// extract the components of interest
dom = timeinfo->tm_mday;
hour = timeinfo->tm_hour;
min = timeinfo->tm_min;
sec = timeinfo->tm_sec;
year = timeinfo->year + 1900;
oss << day[timeinfo->tm_wday] << " " << month[timeinfo->tm_mon]
<< " " << dom << " " << hour << ":" << min << ":" << sec << " " << year;
return oss.str();
}
void MatlabIO::whos() {
std::cout << "-------------------- whos --------------------" << std::endl;
std::cout << "Filename: " << filename() << std::endl;
std::cout << "File size: " << filesize() << "MB" << std::endl << std::endl;
std::cout << "Name size bytes type" << std::endl;
std::cout << "----------------------------------------------" << std::endl;

View File

@ -0,0 +1,81 @@
#ifndef MATLAB_IO_HPP_
#define MATLAB_IO_HPP_
#include <opencv2/core.hpp>
#include "map.hpp"
namespace Matlab {
namespace IO {
static const int VERSION_5 = 5;
static const int VERSION_73 = 73;
}
class Index {
private:
//! the name of the field (if associative container)
std::string name_;
//! beginning of the data field in the file
size_t begin_;
//! address after the last data field
size_t end_;
//! Matlab stored-type
int stored_type_;
//! Matlab actual type (sometimes compression is used)
int type_;
//! is the field compressed?
bool compressed_;
//! are the descendents associative (mappings)
bool associative_;
//! the descendents of this node
union {
//! valid if the container is a sequence (list)
std::vector<Index> sequence_;
//! valid if the container is a mapping (associative)
Map<std::string, Index> mapping_;
};
};
class MatlabIONode {
};
class MatlabIO {
private:
// member variables
static const int HEADER_LENGTH = 116;
static const int SUBSYS_LENGTH = 8;
static const int ENDIAN_LENGTH = 2;
char header_[HEADER_LENGTH+1];
char subsys_[SUBSYS_LENGTH+1];
char endian_[ENDIAN_LENGTH+1];
int version_;
bool byte_swap_;
std::string filename_;
// uses a custom stream buffer for fast memory-mapped access and endian swapping
std::fstream stream_;
//! the main file index. The top-level index must be associative
Index index_;
// internal methods
void getFileHeader();
void setFileHeader();
void getHeader();
void setHeader();
public:
// construct/destruct
MatlabIO() {}
~MatlabIO {}
// global read and write routines
std::string filename(void);
bool open(const std::string& filename, const std::string& mode);
// index the contents of the file
void index();
// print all of the top-level variables in the file
}
#endif