Fix retina ocl on NV.
Previously we incorrectly assumed the data was aligned to 16 bytes; the correct alignment is 32 bytes.
This commit is contained in:
parent
e45f92a9e1
commit
a92721b569
@ -43,6 +43,9 @@
|
|||||||
//
|
//
|
||||||
//M*/
|
//M*/
|
||||||
|
|
||||||
|
//data (which is float) is aligned to 32 bytes
|
||||||
|
#define WIDTH_MULTIPLE (32 >> 2)
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////
|
/////////////////////////////////////////////////////////
|
||||||
//*******************************************************
|
//*******************************************************
|
||||||
// basicretinafilter
|
// basicretinafilter
|
||||||
@ -116,22 +119,18 @@ kernel void horizontalAnticausalFilter(
|
|||||||
|
|
||||||
float4 result_v4 = (float4)(0), out_v4;
|
float4 result_v4 = (float4)(0), out_v4;
|
||||||
float result = 0;
|
float result = 0;
|
||||||
// we assume elements_per_row is multple of 4
|
// we assume elements_per_row is a multiple of WIDTH_MULTIPLE
|
||||||
for(int i = 0; i < 4; ++ i, -- optr)
|
for(int i = 0; i < WIDTH_MULTIPLE; ++ i, -- optr)
|
||||||
{
|
{
|
||||||
if(i < elements_per_row - cols)
|
if(i >= elements_per_row - cols)
|
||||||
{
|
|
||||||
*optr = result;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
result = *optr + _a * result;
|
result = *optr + _a * result;
|
||||||
*optr = result;
|
|
||||||
}
|
}
|
||||||
|
*optr = result;
|
||||||
}
|
}
|
||||||
result_v4.x = result;
|
result_v4.x = result;
|
||||||
optr -= 3;
|
optr -= 3;
|
||||||
for(int i = 1; i < elements_per_row / 4; ++i, optr -= 4)
|
for(int i = WIDTH_MULTIPLE / 4; i < elements_per_row / 4; ++i, optr -= 4)
|
||||||
{
|
{
|
||||||
// shift left, `offset` is type `size_t` so it cannot be negative
|
// shift left, `offset` is type `size_t` so it cannot be negative
|
||||||
out_v4 = vload4(0, optr);
|
out_v4 = vload4(0, optr);
|
||||||
@ -223,23 +222,19 @@ kernel void horizontalAnticausalFilter_Irregular(
|
|||||||
|
|
||||||
float4 buf_v4, out_v4, res_v4 = (float4)(0);
|
float4 buf_v4, out_v4, res_v4 = (float4)(0);
|
||||||
float result = 0;
|
float result = 0;
|
||||||
// we assume elements_per_row is multple of 4
|
// we assume elements_per_row is a multiple of WIDTH_MULTIPLE
|
||||||
for(int i = 0; i < 4; ++ i, -- optr, -- bptr)
|
for(int i = 0; i < WIDTH_MULTIPLE; ++ i, -- optr, -- bptr)
|
||||||
{
|
{
|
||||||
if(i < elements_per_row - cols)
|
if(i >= elements_per_row - cols)
|
||||||
{
|
|
||||||
*optr = result;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
{
|
||||||
result = *optr + *bptr * result;
|
result = *optr + *bptr * result;
|
||||||
*optr = result;
|
|
||||||
}
|
}
|
||||||
|
*optr = result;
|
||||||
}
|
}
|
||||||
res_v4.x = result;
|
res_v4.x = result;
|
||||||
optr -= 3;
|
optr -= 3;
|
||||||
bptr -= 3;
|
bptr -= 3;
|
||||||
for(int i = 0; i < elements_per_row / 4 - 1; ++i, optr -= 4, bptr -= 4)
|
for(int i = WIDTH_MULTIPLE / 4; i < elements_per_row / 4; ++i, optr -= 4, bptr -= 4)
|
||||||
{
|
{
|
||||||
buf_v4 = vload4(0, bptr);
|
buf_v4 = vload4(0, bptr);
|
||||||
out_v4 = vload4(0, optr);
|
out_v4 = vload4(0, optr);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user