*applied patch from NVIDIA (nppstTranspose bug)

*fixed some warnings
*finished porting the GPU tests to the gtest framework
This commit is contained in:
Anatoly Baksheev
2011-02-18 12:23:18 +00:00
parent 916690a674
commit 047c7e0fd6
12 changed files with 69 additions and 47 deletions

View File

@@ -1198,26 +1198,32 @@ __global__ void transpose(T *d_src, Ncv32u srcStride,
Ncv32u xIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.x;
Ncv32u yIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.y;
Ncv32u index_in = xIndex + yIndex * srcStride;
Ncv32u index_gmem = xIndex + yIndex * srcStride;
xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
Ncv32u index_out = xIndex + yIndex * dstStride;
for (Ncv32u i=0; i<TRANSPOSE_TILE_DIM; i+=TRANSPOSE_BLOCK_ROWS)
if (xIndex < srcRoi.width)
{
tile[threadIdx.y+i][threadIdx.x] = d_src[index_in+i*srcStride];
for (Ncv32u i=0; i<TRANSPOSE_TILE_DIM; i+=TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < srcRoi.height)
{
tile[threadIdx.y+i][threadIdx.x] = d_src[index_gmem+i*srcStride];
}
}
}
__syncthreads();
xIndex = blockIdx_y * TRANSPOSE_TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TRANSPOSE_TILE_DIM + threadIdx.y;
index_gmem = xIndex + yIndex * dstStride;
if (xIndex < srcRoi.height)
{
for (Ncv32u i=0; i<TRANSPOSE_TILE_DIM; i+=TRANSPOSE_BLOCK_ROWS)
{
if (yIndex + i < srcRoi.width)
{
d_dst[index_out+i*dstStride] = tile[threadIdx.x][threadIdx.y+i];
d_dst[index_gmem+i*dstStride] = tile[threadIdx.x][threadIdx.y+i];
}
}
}