Fix retina ocl on NV.

Previously we incorrectly assumed the data was aligned to 16 bytes; the correct alignment is 32 bytes.
This commit is contained in:
pengxiao 2013-09-10 14:33:23 +08:00
parent e45f92a9e1
commit a92721b569

View File

@ -43,6 +43,9 @@
// //
//M*/ //M*/
// Data (which is float) is aligned to 32 bytes, so elements_per_row is assumed
// to be a multiple of 32 >> 2 = 8 elements.
#define WIDTH_MULTIPLE (32 >> 2)
///////////////////////////////////////////////////////// /////////////////////////////////////////////////////////
//******************************************************* //*******************************************************
// basicretinafilter // basicretinafilter
@ -116,22 +119,18 @@ kernel void horizontalAnticausalFilter(
float4 result_v4 = (float4)(0), out_v4; float4 result_v4 = (float4)(0), out_v4;
float result = 0; float result = 0;
// we assume elements_per_row is a multiple of 4 // we assume elements_per_row is a multiple of WIDTH_MULTIPLE
for(int i = 0; i < 4; ++ i, -- optr) for(int i = 0; i < WIDTH_MULTIPLE; ++ i, -- optr)
{ {
if(i < elements_per_row - cols) if(i >= elements_per_row - cols)
{
*optr = result;
}
else
{ {
result = *optr + _a * result; result = *optr + _a * result;
*optr = result;
} }
*optr = result;
} }
result_v4.x = result; result_v4.x = result;
optr -= 3; optr -= 3;
for(int i = 1; i < elements_per_row / 4; ++i, optr -= 4) for(int i = WIDTH_MULTIPLE / 4; i < elements_per_row / 4; ++i, optr -= 4)
{ {
// shift left, `offset` is type `size_t` so it cannot be negative // shift left, `offset` is type `size_t` so it cannot be negative
out_v4 = vload4(0, optr); out_v4 = vload4(0, optr);
@ -223,23 +222,19 @@ kernel void horizontalAnticausalFilter_Irregular(
float4 buf_v4, out_v4, res_v4 = (float4)(0); float4 buf_v4, out_v4, res_v4 = (float4)(0);
float result = 0; float result = 0;
// we assume elements_per_row is a multiple of 4 // we assume elements_per_row is a multiple of WIDTH_MULTIPLE
for(int i = 0; i < 4; ++ i, -- optr, -- bptr) for(int i = 0; i < WIDTH_MULTIPLE; ++ i, -- optr, -- bptr)
{ {
if(i < elements_per_row - cols) if(i >= elements_per_row - cols)
{
*optr = result;
}
else
{ {
result = *optr + *bptr * result; result = *optr + *bptr * result;
*optr = result;
} }
*optr = result;
} }
res_v4.x = result; res_v4.x = result;
optr -= 3; optr -= 3;
bptr -= 3; bptr -= 3;
for(int i = 0; i < elements_per_row / 4 - 1; ++i, optr -= 4, bptr -= 4) for(int i = WIDTH_MULTIPLE / 4; i < elements_per_row / 4; ++i, optr -= 4, bptr -= 4)
{ {
buf_v4 = vload4(0, bptr); buf_v4 = vload4(0, bptr);
out_v4 = vload4(0, optr); out_v4 = vload4(0, optr);