///VectorAdd sample, from the NVidia JumpStart Guide ///http://developer.download.nvidia.com/OpenCL/NVIDIA_OpenCL_JumpStart_Guide.pdf ///Instead of #include we include ///Apart from this include file, all other code should compile and work on OpenCL compliant implementation //#define LOAD_FROM_FILE #ifdef USE_MINICL #include "MiniCL/cl.h" #else //USE_MINICL #ifdef __APPLE__ #include #else #include #endif //__APPLE__ #endif//USE_MINICL #include #include #include #include #include "LinearMath/btMinMax.h" #define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); } size_t wgSize; #ifndef USE_MINICL #define MSTRINGIFY(A) #A const char* stringifiedSourceCL = #include "VectorAddKernels.cl" #else const char* stringifiedSourceCL = ""; #endif char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength) { // locals FILE* pFileStream = NULL; size_t szSourceLength; // open the OpenCL source code file pFileStream = fopen(cFilename, "rb"); if(pFileStream == 0) { return NULL; } size_t szPreambleLength = strlen(cPreamble); // get the length of the source code fseek(pFileStream, 0, SEEK_END); szSourceLength = ftell(pFileStream); fseek(pFileStream, 0, SEEK_SET); // allocate a buffer for the source code string and read it in char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); memcpy(cSourceString, cPreamble, szPreambleLength); fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream); // close the file and return the total length of the combined (preamble + source) string fclose(pFileStream); if(szFinalLength != 0) { *szFinalLength = szSourceLength + szPreambleLength; } cSourceString[szSourceLength + szPreambleLength] = '\0'; return cSourceString; } size_t workitem_size[3]; void printDevInfo(cl_device_id device) { char device_string[1024]; clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL); printf( " Device %s:\n", device_string); // CL_DEVICE_INFO cl_device_type type; clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL); if( type & CL_DEVICE_TYPE_CPU ) printf(" CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_CPU"); if( type & CL_DEVICE_TYPE_GPU ) printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_GPU"); if( type & CL_DEVICE_TYPE_ACCELERATOR ) printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR"); if( type & CL_DEVICE_TYPE_DEFAULT ) printf( " CL_DEVICE_TYPE:\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT"); // CL_DEVICE_MAX_COMPUTE_UNITS cl_uint compute_units; clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); printf( " CL_DEVICE_MAX_COMPUTE_UNITS:\t%d\n", compute_units); // CL_DEVICE_MAX_WORK_GROUP_SIZE clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL); printf( " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%zu / %zu / %zu \n", workitem_size[0], workitem_size[1], workitem_size[2]); } // Main function // ********************************************************************* int main(int argc, char **argv) { void *srcA, *srcB, *dst; // Host buffers for OpenCL test cl_context cxGPUContext; // OpenCL context cl_command_queue cqCommandQue; // OpenCL command que cl_device_id* cdDevices; // OpenCL device list cl_program cpProgram; // OpenCL program cl_kernel ckKernel; // OpenCL kernel cl_mem cmMemObjs[3]; // OpenCL memory buffer objects: 3 for device size_t szGlobalWorkSize[1]; // 1D var for Total # of work items size_t szLocalWorkSize[1]; // 1D var for # of work items in the work group size_t szParmDataBytes; // Byte size of context information cl_int ciErr1, ciErr2; // Error code var int iTestN = 100000 * 8; // Size of Vectors to process int actualGlobalSize = iTestN>>3; // set Global and Local work size dimensions szGlobalWorkSize[0] = iTestN >> 3; // do 8 computations per work item szLocalWorkSize[0]= iTestN>>3; // Allocate and initialize host arrays srcA = (void *)malloc (sizeof(cl_float) * iTestN); srcB = (void *)malloc (sizeof(cl_float) * iTestN); dst = (void *)malloc (sizeof(cl_float) * iTestN); int i; // Initialize arrays with some values for (i=0;i processing outside of the buffer //make sure to check kernel } size_t globalThreads[] = {num_t * workgroupSize}; size_t localThreads[] = {workgroupSize}; localWorkSize[0] = workgroupSize; globalWorkSize[0] = num_t * workgroupSize; localWorkSize[1] = 1; globalWorkSize[1] = 1; // Copy input data from host to GPU and launch kernel ciErr1 |= clEnqueueNDRangeKernel(cqCommandQue, ckKernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); } if (ciErrNum != CL_SUCCESS) { printf("cannot clEnqueueNDRangeKernel\n"); exit(0); } clFinish(cqCommandQue); // Read back results and check accumulated errors ciErr1 |= clEnqueueReadBuffer(cqCommandQue, cmMemObjs[2], CL_TRUE, 0, sizeof(cl_float8) * szGlobalWorkSize[0], dst, 0, NULL, NULL); // Release kernel, program, and memory objects // NOTE: Most properly this should be done at any of the exit points above, but it is omitted elsewhere for clarity. free(cdDevices); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQue); clReleaseContext(cxGPUContext); // print the results int iErrorCount = 0; for (i = 0; i < iTestN; i++) { if (((float*)dst)[i] != ((float*)srcA)[i]+((float*)srcB)[i]) iErrorCount++; } if (iErrorCount) { printf("MiniCL validation FAILED\n"); } else { printf("MiniCL validation SUCCESSFULL\n"); } // Free host memory, close log and return success for (i = 0; i < 3; i++) { clReleaseMemObject(cmMemObjs[i]); } free(srcA); free(srcB); free (dst); printf("Press ENTER to quit\n"); getchar(); } #ifdef USE_MINICL #include "MiniCL/cl_MiniCL_Defs.h" extern "C" { ///GUID_ARG is only used by MiniCL to pass in the guid used by its get_global_id implementation #define MSTRINGIFY(A) A #include "VectorAddKernels.cl" #undef MSTRINGIFY } MINICL_REGISTER(VectorAdd) #endif//USE_MINICL