optimized cv::UMat::dot
This commit is contained in:
@@ -158,10 +158,59 @@
|
||||
// Declares `src2_index`, the offset that is added to src2ptr for element `id`:
//   (id / cols) * src2_step + (id % cols) * srcTSIZE + src2_offset
// mad24(a, b, c) evaluates a * b + c. The names id, cols, src2_step, srcTSIZE
// and src2_offset must be visible where the macro is expanded.
#define SRC2_INDEX \
    int src2_index = mad24(id / cols, src2_step, \
                           mad24(id % cols, srcTSIZE, src2_offset))
|
||||
#endif
|
||||
|
||||
#if kercn == 1
|
||||
// REDUCE_GLOBAL for kercn == 1.
// Expands to three statements:
//   1. SRC2_INDEX declares src2_index;
//   2. one value is read from each source via loadpix() and passed through
//      convertToDT(), giving temp (from srcptr + src_index) and temp2
//      (from src2ptr + src2_index), both of type dstTK;
//   3. FUNC folds the pair into `accumulator`.
// srcptr, src2ptr, src_index and accumulator must be in scope at the expansion
// site. temp/temp2 are declared exactly once, with dstTK as in the other kercn
// variants of this macro.
#define REDUCE_GLOBAL \
    SRC2_INDEX; \
    dstTK temp = convertToDT(loadpix(srcptr + src_index)), \
          temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    FUNC(accumulator, temp, temp2)
|
||||
#elif kercn == 2
|
||||
// REDUCE_GLOBAL for kercn == 2.
// Expands to: SRC2_INDEX (declares src2_index), one dstTK load from each source
// via loadpix()/convertToDT() into temp and temp2, then one FUNC call per
// component (.s0, .s1), each folding the matching components of temp and temp2
// into `accumulator`.
// srcptr, src2ptr, src_index and accumulator must be in scope at the expansion
// site. dstTK is presumably a 2-component vector type here - confirm against
// the build options that define it.
#define REDUCE_GLOBAL \
    SRC2_INDEX; \
    dstTK temp = convertToDT(loadpix(srcptr + src_index)), \
          temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    FUNC(accumulator, temp.s0, temp2.s0); \
    FUNC(accumulator, temp.s1, temp2.s1)
|
||||
#elif kercn == 4
|
||||
// REDUCE_GLOBAL for kercn == 4.
// Expands to: SRC2_INDEX (declares src2_index), one dstTK load from each source
// via loadpix()/convertToDT() into temp and temp2, then one FUNC call per
// component (.s0 through .s3), each folding the matching components of temp
// and temp2 into `accumulator`.
// srcptr, src2ptr, src_index and accumulator must be in scope at the expansion
// site. dstTK is presumably a 4-component vector type here - confirm against
// the build options that define it.
#define REDUCE_GLOBAL \
    SRC2_INDEX; \
    dstTK temp = convertToDT(loadpix(srcptr + src_index)), \
          temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    FUNC(accumulator, temp.s0, temp2.s0); \
    FUNC(accumulator, temp.s1, temp2.s1); \
    FUNC(accumulator, temp.s2, temp2.s2); \
    FUNC(accumulator, temp.s3, temp2.s3)
|
||||
#elif kercn == 8
|
||||
// REDUCE_GLOBAL for kercn == 8.
// Expands to: SRC2_INDEX (declares src2_index), one dstTK load from each source
// via loadpix()/convertToDT() into temp and temp2, then one FUNC call per
// component (.s0 through .s7), each folding the matching components of temp
// and temp2 into `accumulator`.
// srcptr, src2ptr, src_index and accumulator must be in scope at the expansion
// site. dstTK is presumably an 8-component vector type here - confirm against
// the build options that define it.
#define REDUCE_GLOBAL \
    SRC2_INDEX; \
    dstTK temp = convertToDT(loadpix(srcptr + src_index)), \
          temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    FUNC(accumulator, temp.s0, temp2.s0); \
    FUNC(accumulator, temp.s1, temp2.s1); \
    FUNC(accumulator, temp.s2, temp2.s2); \
    FUNC(accumulator, temp.s3, temp2.s3); \
    FUNC(accumulator, temp.s4, temp2.s4); \
    FUNC(accumulator, temp.s5, temp2.s5); \
    FUNC(accumulator, temp.s6, temp2.s6); \
    FUNC(accumulator, temp.s7, temp2.s7)
|
||||
#elif kercn == 16
|
||||
// REDUCE_GLOBAL for kercn == 16.
// Expands to: SRC2_INDEX (declares src2_index), one dstTK load from each source
// via loadpix()/convertToDT() into temp and temp2, then one FUNC call per
// component (.s0 through .s9, then .sA through .sF), each folding the matching
// components of temp and temp2 into `accumulator`.
// srcptr, src2ptr, src_index and accumulator must be in scope at the expansion
// site. dstTK is presumably a 16-component vector type here - confirm against
// the build options that define it.
#define REDUCE_GLOBAL \
    SRC2_INDEX; \
    dstTK temp = convertToDT(loadpix(srcptr + src_index)), \
          temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    FUNC(accumulator, temp.s0, temp2.s0); \
    FUNC(accumulator, temp.s1, temp2.s1); \
    FUNC(accumulator, temp.s2, temp2.s2); \
    FUNC(accumulator, temp.s3, temp2.s3); \
    FUNC(accumulator, temp.s4, temp2.s4); \
    FUNC(accumulator, temp.s5, temp2.s5); \
    FUNC(accumulator, temp.s6, temp2.s6); \
    FUNC(accumulator, temp.s7, temp2.s7); \
    FUNC(accumulator, temp.s8, temp2.s8); \
    FUNC(accumulator, temp.s9, temp2.s9); \
    FUNC(accumulator, temp.sA, temp2.sA); \
    FUNC(accumulator, temp.sB, temp2.sB); \
    FUNC(accumulator, temp.sC, temp2.sC); \
    FUNC(accumulator, temp.sD, temp2.sD); \
    FUNC(accumulator, temp.sE, temp2.sE); \
    FUNC(accumulator, temp.sF, temp2.sF)
|
||||
#endif
|
||||
|
||||
#else
|
||||
#if kercn == 1
|
||||
#define REDUCE_GLOBAL \
|
||||
|
||||
Reference in New Issue
Block a user