diff --git a/modules/ocl/include/opencv2/ocl/ocl.hpp b/modules/ocl/include/opencv2/ocl/ocl.hpp index d144a042e..ff35d14e7 100644 --- a/modules/ocl/include/opencv2/ocl/ocl.hpp +++ b/modules/ocl/include/opencv2/ocl/ocl.hpp @@ -1380,8 +1380,10 @@ namespace cv oclMat Dx_; oclMat Dy_; oclMat eig_; + oclMat eig_minmax_; oclMat minMaxbuf_; oclMat tmpCorners_; + oclMat counter_; }; inline GoodFeaturesToTrackDetector_OCL::GoodFeaturesToTrackDetector_OCL(int maxCorners_, double qualityLevel_, double minDistance_, diff --git a/modules/ocl/src/gftt.cpp b/modules/ocl/src/gftt.cpp index 541b1d6ef..a82196d78 100644 --- a/modules/ocl/src/gftt.cpp +++ b/modules/ocl/src/gftt.cpp @@ -48,154 +48,142 @@ using namespace cv; using namespace cv::ocl; +// currently sort procedure on the host is more efficient static bool use_cpu_sorter = true; -namespace +// compact structure for corners +struct DefCorner { -enum SortMethod + float eig; //eigenvalue of corner + short x; //x coordinate of corner point + short y; //y coordinate of corner point +} ; + +// compare procedure for corner +//it is used for sort on the host side +struct DefCornerCompare { - CPU_STL, - BITONIC, - SELECTION -}; - -const int GROUP_SIZE = 256; - -template -struct Sorter -{ - //typedef EigType; -}; - -//TODO(pengx): optimize GPU sorter's performance thus CPU sorter is removed. -template<> -struct Sorter -{ - typedef oclMat EigType; - static cv::Mutex cs; - static Mat mat_eig; - - //prototype - static int clfloat2Gt(cl_float2 pt1, cl_float2 pt2) + bool operator()(const DefCorner a, const DefCorner b) const { - float v1 = mat_eig.at(cvRound(pt1.s[1]), cvRound(pt1.s[0])); - float v2 = mat_eig.at(cvRound(pt2.s[1]), cvRound(pt2.s[0])); - return v1 > v2; - } - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) - { - cv::AutoLock lock(cs); - //temporarily use STL's sort function - Mat mat_corners = corners; - mat_eig = eig_tex; - std::sort(mat_corners.begin(), mat_corners.begin() + count, clfloat2Gt); - corners = mat_corners; + return a.eig > b.eig; } }; -cv::Mutex Sorter::cs; -cv::Mat Sorter::mat_eig; -template<> -struct Sorter +// sort corner point using opencl bitonicosrt implementation +static void sortCorners_caller(oclMat& corners, const int count) { - typedef TextureCL EigType; + Context * cxt = Context::getContext(); + int GS = count/2; + int LS = min(255,GS); + size_t globalThreads[3] = {GS, 1, 1}; + size_t localThreads[3] = {LS, 1, 1}; - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) + // 2^numStages should be equal to count or the output is invalid + int numStages = 0; + for(int i = count; i > 1; i >>= 1) { - Context * cxt = Context::getContext(); - size_t globalThreads[3] = {count / 2, 1, 1}; - size_t localThreads[3] = {GROUP_SIZE, 1, 1}; - - // 2^numStages should be equal to count or the output is invalid - int numStages = 0; - for(int i = count; i > 1; i >>= 1) + ++numStages; + } + const int argc = 4; + std::vector< std::pair > args(argc); + std::string kernelname = "sortCorners_bitonicSort"; + args[0] = std::make_pair(sizeof(cl_mem), (void *)&corners.data); + args[1] = std::make_pair(sizeof(cl_int), (void *)&count); + for(int stage = 0; stage < numStages; ++stage) + { + args[2] = std::make_pair(sizeof(cl_int), (void *)&stage); + for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) { - ++numStages; - } - const int argc = 5; - std::vector< std::pair > args(argc); - std::string kernelname = "sortCorners_bitonicSort"; - args[0] = std::make_pair(sizeof(cl_mem), (void *)&eig_tex); - args[1] = std::make_pair(sizeof(cl_mem), (void *)&corners.data); - args[2] = std::make_pair(sizeof(cl_int), (void *)&count); - for(int stage = 0; stage < numStages; ++stage) - { - args[3] = std::make_pair(sizeof(cl_int), (void *)&stage); - for(int passOfStage = 0; passOfStage < stage + 1; ++passOfStage) - { - args[4] = std::make_pair(sizeof(cl_int), (void *)&passOfStage); - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); - } + args[3] = std::make_pair(sizeof(cl_int), (void *)&passOfStage); + openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); } } -}; +} -template<> -struct Sorter -{ - typedef TextureCL EigType; - - static void sortCorners_caller(const EigType& eig_tex, oclMat& corners, const int count) - { - Context * cxt = Context::getContext(); - - size_t globalThreads[3] = {count, 1, 1}; - size_t localThreads[3] = {GROUP_SIZE, 1, 1}; - - std::vector< std::pair > args; - //local - std::string kernelname = "sortCorners_selectionSortLocal"; - int lds_size = GROUP_SIZE * sizeof(cl_float2); - args.push_back( std::make_pair( sizeof(cl_mem), (void*)&eig_tex) ); - args.push_back( std::make_pair( sizeof(cl_mem), (void*)&corners.data) ); - args.push_back( std::make_pair( sizeof(cl_int), (void*)&count) ); - args.push_back( std::make_pair( lds_size, (void*)NULL) ); - - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); - - //final - kernelname = "sortCorners_selectionSortFinal"; - args.pop_back(); - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1); - } -}; - -int findCorners_caller( - const TextureCL& eig, - const float threshold, - const oclMat& mask, - oclMat& corners, - const int max_count) +// find corners on matrix and put it into array +static void findCorners_caller( + const oclMat& eig_mat, //input matrix worth eigenvalues + oclMat& eigMinMax, //input with min and max values of eigenvalues + const float qualityLevel, + const oclMat& mask, + oclMat& corners, //output array with detected corners + oclMat& counter) //output value with number of detected corners, have to be 0 before call { + string opt; std::vector k; Context * cxt = Context::getContext(); std::vector< std::pair > args; - std::string kernelname = "findCorners"; const int mask_strip = mask.step / mask.elemSize1(); - oclMat g_counter(1, 1, CV_32SC1); - g_counter.setTo(0); + args.push_back(make_pair( sizeof(cl_mem), (void*)&(eig_mat.data))); - args.push_back(make_pair( sizeof(cl_mem), (void*)&eig )); + int src_pitch = (int)eig_mat.step; + args.push_back(make_pair( sizeof(cl_int), (void*)&src_pitch )); args.push_back(make_pair( sizeof(cl_mem), (void*)&mask.data )); args.push_back(make_pair( sizeof(cl_mem), (void*)&corners.data )); args.push_back(make_pair( sizeof(cl_int), (void*)&mask_strip)); - args.push_back(make_pair( sizeof(cl_float), (void*)&threshold )); - args.push_back(make_pair( sizeof(cl_int), (void*)&eig.rows )); - args.push_back(make_pair( sizeof(cl_int), (void*)&eig.cols )); - args.push_back(make_pair( sizeof(cl_int), (void*)&max_count )); - args.push_back(make_pair( sizeof(cl_mem), (void*)&g_counter.data )); + args.push_back(make_pair( sizeof(cl_mem), (void*)&eigMinMax.data )); + args.push_back(make_pair( sizeof(cl_float), (void*)&qualityLevel )); + args.push_back(make_pair( sizeof(cl_int), (void*)&eig_mat.rows )); + args.push_back(make_pair( sizeof(cl_int), (void*)&eig_mat.cols )); + args.push_back(make_pair( sizeof(cl_int), (void*)&corners.cols )); + args.push_back(make_pair( sizeof(cl_mem), (void*)&counter.data )); - size_t globalThreads[3] = {eig.cols, eig.rows, 1}; + size_t globalThreads[3] = {eig_mat.cols, eig_mat.rows, 1}; size_t localThreads[3] = {16, 16, 1}; + if(!mask.empty()) + opt += " -D WITH_MASK=1"; - const char * opt = mask.empty() ? "" : "-D WITH_MASK"; - openCLExecuteKernel(cxt, &imgproc_gftt, kernelname, globalThreads, localThreads, args, -1, -1, opt); - return std::min(Mat(g_counter).at(0), max_count); + openCLExecuteKernel(cxt, &imgproc_gftt, "findCorners", globalThreads, localThreads, args, -1, -1, opt.c_str()); +} + + +static void minMaxEig_caller(const oclMat &src, oclMat &dst, oclMat & tozero) +{ + size_t groupnum = src.clCxt->getDeviceInfo().maxComputeUnits; + CV_Assert(groupnum != 0); + + int dbsize = groupnum * 2 * src.elemSize(); + + ensureSizeIsEnough(1, dbsize, CV_8UC1, dst); + + cl_mem dst_data = reinterpret_cast(dst.data); + + int all_cols = src.step / src.elemSize(); + int pre_cols = (src.offset % src.step) / src.elemSize(); + int sec_cols = all_cols - (src.offset % src.step + src.cols * src.elemSize() - 1) / src.elemSize() - 1; + int invalid_cols = pre_cols + sec_cols; + int cols = all_cols - invalid_cols , elemnum = cols * src.rows; + int offset = src.offset / src.elemSize(); + + {// first parallel pass + vector > args; + args.push_back( make_pair( sizeof(cl_mem) , (void *)&src.data)); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&cols )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&invalid_cols )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&offset)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&elemnum)); + args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum)); + size_t globalThreads[3] = {groupnum * 256, 1, 1}; + size_t localThreads[3] = {256, 1, 1}; + openCLExecuteKernel(src.clCxt, &arithm_minMax, "arithm_op_minMax", globalThreads, localThreads, + args, -1, -1, "-D T=float -D DEPTH_5"); + } + + {// run final "serial" kernel to find accumulate results from threads and reset corner counter + vector > args; + args.push_back( make_pair( sizeof(cl_mem) , (void *)&dst_data )); + args.push_back( make_pair( sizeof(cl_int) , (void *)&groupnum )); + args.push_back( make_pair( sizeof(cl_mem) , (void *)&tozero.data )); + size_t globalThreads[3] = {1, 1, 1}; + size_t localThreads[3] = {1, 1, 1}; + openCLExecuteKernel(src.clCxt, &imgproc_gftt, "arithm_op_minMax_final", globalThreads, localThreads, + args, -1, -1); + } } -}//unnamed namespace void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, oclMat& corners, const oclMat& mask) { @@ -205,67 +193,99 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, ensureSizeIsEnough(image.size(), CV_32F, eig_); if (useHarrisDetector) - cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK); + cornerHarris_dxdy(image, eig_, Dx_, Dy_, blockSize, 3, harrisK); else cornerMinEigenVal_dxdy(image, eig_, Dx_, Dy_, blockSize, 3); - double maxVal = 0; - minMax(eig_, NULL, &maxVal); + ensureSizeIsEnough(1,1, CV_32SC1, counter_); - ensureSizeIsEnough(1, std::max(1000, static_cast(image.size().area() * 0.05)), CV_32FC2, tmpCorners_); + // find max eigenvalue and reset detected counters + minMaxEig_caller(eig_,eig_minmax_,counter_); - Ptr eig_tex = bindTexturePtr(eig_); - int total = findCorners_caller( - *eig_tex, - static_cast(maxVal * qualityLevel), + // allocate buffer for kernels + int corner_array_size = std::max(1024, static_cast(image.size().area() * 0.05)); + + if(!use_cpu_sorter) + { // round to 2^n + unsigned int n=1; + for(n=1;n<(unsigned int)corner_array_size;n<<=1); + corner_array_size = (int)n; + + ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_); + + // set to 0 to be able use bitonic sort on whole 2^n array + tmpCorners_.setTo(0); + } + else + { + ensureSizeIsEnough(1, corner_array_size , CV_32FC2, tmpCorners_); + } + + int total = tmpCorners_.cols; // by default the number of corner is full array + vector tmp(tmpCorners_.cols); // input buffer with corner for HOST part of algorithm + + //find points with high eigenvalue and put it into the output array + findCorners_caller( + eig_, + eig_minmax_, + static_cast(qualityLevel), mask, tmpCorners_, - tmpCorners_.cols); + counter_); + + if(!use_cpu_sorter) + {// sort detected corners on deivce side + sortCorners_caller(tmpCorners_, corner_array_size); + } + else + {// send non-blocking request to read real non-zero number of corners to sort it on the HOST side + openCLVerifyCall(clEnqueueReadBuffer(getClCommandQueue(counter_.clCxt), (cl_mem)counter_.data, CL_FALSE, 0,sizeof(int), &total, 0, NULL, NULL)); + } + + //blocking read whole corners array (sorted or not sorted) + openCLReadBuffer(tmpCorners_.clCxt,(cl_mem)tmpCorners_.data,&tmp[0],tmpCorners_.cols*sizeof(DefCorner)); if (total == 0) - { + {// check for trivial case corners.release(); return; } + if(use_cpu_sorter) - { - Sorter::sortCorners_caller(eig_, tmpCorners_, total); - } - else - { - //if total is power of 2 - if(((total - 1) & (total)) == 0) - { - Sorter::sortCorners_caller(*eig_tex, tmpCorners_, total); - } - else - { - Sorter::sortCorners_caller(*eig_tex, tmpCorners_, total); - } + {// sort detected corners on cpu side. + tmp.resize(total); + cv::sort(tmp,DefCornerCompare()); } + //estimate maximal size of final output array + int total_max = maxCorners > 0 ? std::min(maxCorners, total) : total; + int D2 = (int)ceil(minDistance * minDistance); + // allocate output buffer + vector tmp2; + tmp2.reserve(total_max); + + if (minDistance < 1) - { - Rect roi_range(0, 0, maxCorners > 0 ? std::min(maxCorners, total) : total, 1); - tmpCorners_(roi_range).copyTo(corners); + {// we have not distance restriction. then just copy with conversion maximal allowed points into output array + for(int i=0;i0.0f;++i) + { + tmp2.push_back(Point2f(tmp[i].x,tmp[i].y)); + } } else - { - vector tmp(total); - downloadPoints(tmpCorners_, tmp); - - vector tmp2; - tmp2.reserve(total); - + {// we have distance restriction. then start coping to output array from the first element and check distance for each next one const int cell_size = cvRound(minDistance); const int grid_width = (image.cols + cell_size - 1) / cell_size; const int grid_height = (image.rows + cell_size - 1) / cell_size; - std::vector< std::vector > grid(grid_width * grid_height); + std::vector< std::vector > grid(grid_width * grid_height); - for (int i = 0; i < total; ++i) + for (int i = 0; i < total ; ++i) { - Point2f p = tmp[i]; + DefCorner p = tmp[i]; + + if(p.eig<=0.0f) + break; // condition to stop that is needed for GPU bitonic sort usage. bool good = true; @@ -287,40 +307,42 @@ void cv::ocl::GoodFeaturesToTrackDetector_OCL::operator ()(const oclMat& image, { for (int xx = x1; xx <= x2; xx++) { - vector& m = grid[yy * grid_width + xx]; - - if (!m.empty()) + vector& m = grid[yy * grid_width + xx]; + if (m.empty()) + continue; + for(size_t j = 0; j < m.size(); j++) { - for(size_t j = 0; j < m.size(); j++) - { - float dx = p.x - m[j].x; - float dy = p.y - m[j].y; + int dx = p.x - m[j].x; + int dy = p.y - m[j].y; - if (dx * dx + dy * dy < minDistance * minDistance) - { - good = false; - goto break_out; - } + if (dx * dx + dy * dy < D2) + { + good = false; + goto break_out_; } } } } - break_out: + break_out_: if(good) { - grid[y_cell * grid_width + x_cell].push_back(p); + grid[y_cell * grid_width + x_cell].push_back(Point2i(p.x,p.y)); - tmp2.push_back(p); + tmp2.push_back(Point2f(p.x,p.y)); if (maxCorners > 0 && tmp2.size() == static_cast(maxCorners)) break; } } - corners.upload(Mat(1, static_cast(tmp2.size()), CV_32FC2, &tmp2[0])); } + int final_size = static_cast(tmp2.size()); + if(final_size>0) + corners.upload(Mat(1, final_size, CV_32FC2, &tmp2[0])); + else + corners.release(); } void cv::ocl::GoodFeaturesToTrackDetector_OCL::downloadPoints(const oclMat &points, vector &points_v) { diff --git a/modules/ocl/src/opencl/imgproc_gftt.cl b/modules/ocl/src/opencl/imgproc_gftt.cl index 80bdec08f..4d5356cfb 100644 --- a/modules/ocl/src/opencl/imgproc_gftt.cl +++ b/modules/ocl/src/opencl/imgproc_gftt.cl @@ -46,33 +46,26 @@ #ifndef WITH_MASK #define WITH_MASK 0 #endif - -__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST; - -inline float ELEM_INT2(image2d_t _eig, int _x, int _y) -{ - return read_imagef(_eig, sampler, (int2)(_x, _y)).x; -} - -inline float ELEM_FLT2(image2d_t _eig, float2 pt) -{ - return read_imagef(_eig, sampler, pt).x; -} +//macro to read eigenvalue matrix +#define GET_SRC_32F(_x, _y) ((__global const float*)(eig + (_y)*eig_pitch))[_x] __kernel void findCorners ( - image2d_t eig, - __global const char * mask, - __global float2 * corners, - const int mask_strip,// in pixels - const float threshold, - const int rows, - const int cols, - const int max_count, - __global int * g_counter + __global const char* eig, + const int eig_pitch, + __global const char* mask, + __global float2* corners, + const int mask_strip,// in pixels + __global const float* pMinMax, + const float qualityLevel, + const int rows, + const int cols, + const int max_count, + __global int* g_counter ) { + float threshold = qualityLevel*pMinMax[1]; const int j = get_global_id(0); const int i = get_global_id(1); @@ -82,39 +75,42 @@ __kernel #endif ) { - const float val = ELEM_INT2(eig, j, i); + const float val = GET_SRC_32F(j, i); if (val > threshold) { float maxVal = val; + maxVal = fmax(GET_SRC_32F(j - 1, i - 1), maxVal); + maxVal = fmax(GET_SRC_32F(j , i - 1), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j - 1, i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j , i - 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i - 1), maxVal); + maxVal = fmax(GET_SRC_32F(j - 1, i), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i), maxVal); - maxVal = fmax(ELEM_INT2(eig, j - 1, i), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i), maxVal); - - maxVal = fmax(ELEM_INT2(eig, j - 1, i + 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j , i + 1), maxVal); - maxVal = fmax(ELEM_INT2(eig, j + 1, i + 1), maxVal); + maxVal = fmax(GET_SRC_32F(j - 1, i + 1), maxVal); + maxVal = fmax(GET_SRC_32F(j , i + 1), maxVal); + maxVal = fmax(GET_SRC_32F(j + 1, i + 1), maxVal); if (val == maxVal) { const int ind = atomic_inc(g_counter); if (ind < max_count) - corners[ind] = (float2)(j, i); + {// pack and store eigenvalue and its coordinates + corners[ind].x = val; + corners[ind].y = as_float(j|(i<<16)); + } } } } } +#undef GET_SRC_32F + //bitonic sort __kernel void sortCorners_bitonicSort ( - image2d_t eig, __global float2 * corners, const int count, const int stage, @@ -140,8 +136,8 @@ __kernel const float2 leftPt = corners[leftId]; const float2 rightPt = corners[rightId]; - const float leftVal = ELEM_FLT2(eig, leftPt); - const float rightVal = ELEM_FLT2(eig, rightPt); + const float leftVal = leftPt.x; + const float rightVal = rightPt.x; const bool compareResult = leftVal > rightVal; @@ -152,124 +148,22 @@ __kernel corners[rightId] = sortOrder ? greater : lesser; } -//selection sort for gfft -//kernel is ported from Bolt library: -//https://github.com/HSA-Libraries/Bolt/blob/master/include/bolt/cl/sort_kernels.cl -// Local sort will firstly sort elements of each workgroup using selection sort -// its performance is O(n) -__kernel - void sortCorners_selectionSortLocal - ( - image2d_t eig, - __global float2 * corners, - const int count, - __local float2 * scratch - ) +// this is simple short serial kernel that makes some short reduction and initialization work +// it makes HOST like work to avoid additional sync with HOST to do this short work +// data - input/output float2. +// input data are sevral (min,max) pairs +// output data is one reduced (min,max) pair +// g_counter - counter that have to be initialized by 0 for next findCorner call. +__kernel void arithm_op_minMax_final(__global float * data, int groupnum,__global int * g_counter) { - int i = get_local_id(0); // index in workgroup - int numOfGroups = get_num_groups(0); // index in workgroup - int groupID = get_group_id(0); - int wg = get_local_size(0); // workgroup size = block size - int n; // number of elements to be processed for this work group - - int offset = groupID * wg; - int same = 0; - corners += offset; - n = (groupID == (numOfGroups-1))? (count - wg*(numOfGroups-1)) : wg; - float2 pt1, pt2; - - pt1 = corners[min(i, n)]; - scratch[i] = pt1; - barrier(CLK_LOCAL_MEM_FENCE); - - if(i >= n) + g_counter[0] = 0; + float minVal = data[0]; + float maxVal = data[groupnum]; + for(int i=1;i val1) - pos++;//calculate the rank of this element in this work group - else - { - if(val1 > val2) - continue; - else - { - // val1 and val2 are same - same++; - } - } - } - for (int j=0; j< same; j++) - corners[pos + j] = pt1; -} -__kernel - void sortCorners_selectionSortFinal - ( - image2d_t eig, - __global float2 * corners, - const int count - ) -{ - const int i = get_local_id(0); // index in workgroup - const int numOfGroups = get_num_groups(0); // index in workgroup - const int groupID = get_group_id(0); - const int wg = get_local_size(0); // workgroup size = block size - int pos = 0, same = 0; - const int offset = get_group_id(0) * wg; - const int remainder = count - wg*(numOfGroups-1); - - if((offset + i ) >= count) - return; - float2 pt1, pt2; - pt1 = corners[groupID*wg + i]; - - float val1 = ELEM_FLT2(eig, pt1); - float val2; - - for(int j=0; j val2) - break; - else - { - //Increment only if the value is not the same. - if( val2 > val1 ) - pos++; - else - same++; - } - } - } - - for(int k=0; k val2) - break; - else - { - //Don't increment if the value is the same. - //Two elements are same if (*userComp)(jData, iData) and (*userComp)(iData, jData) are both false - if(val2 > val1) - pos++; - else - same++; - } - } - for (int j=0; j< same; j++) - corners[pos + j] = pt1; -} + data[0] = minVal; + data[1] = maxVal; +} \ No newline at end of file