I am running a simple program on my Mali T628 to test the GPU performance versus CPU.
I developed a code to add two integer arrays and it works fine. However, when I start increasing the array size up to 16384 my kernel doesn't execute right and I get and error when I apply clWaitForEvents(). and clGetEventInfo which says CL_INVALID_VALUE.
My global work size is 16384,
My local work-group size is 256 (know by calling clGetKernelWorkGroupInfo())
I check other specifications like memory and cache and it should work. The compiler is supposed to split the global work size and execute it in several steps.
Any idea of the problem?
Thanks
Thanks for your reply,
I attached the full code. Here I put the part that concerns the error:
err = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &event); if (err != CL_SUCCESS) { printf("Error queuing kernel for execution.\n"); //do cleaning up cout << "do clean up"<<err<<endl; return 1; }
cl_int status; err = clWaitForEvents( 1, &event); if( err != CL_SUCCESS ) // 1 of the events in the queue failed { err = clGetEventInfo(event, CL_COMMAND_NDRANGE_KERNEL, sizeof(status), &status, NULL); cout<< "event "<<err<<endl; if(status != CL_COMPLETE ) printf("ERROR executing the kernel %d\n", status);}
I am using the Mali T628 on the Odroid UX4 for linux and using openCL to run my application. As for the driver, I didnt download any driver, I read that the driver was already within the image of the OS when it was shipped.
Best regards
Miguel
#include <iostream> #include <errno.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <CL/cl.h> using namespace std; int main(void) { unsigned int i, j; //iterator variables for loops cl_platform_id platforms[32]; //an array to hold the IDs of all the platforms, hopefuly there won't be more than 32 cl_uint num_platforms; //this number will hold the number of platforms on this machine char vendor[1024]; //this strirng will hold a platforms vendor cl_device_id devices[32]; //this variable holds the number of devices for each platform, hopefully it won't be more than 32 per platform cl_uint num_devices; //this number will hold the number of devices on this machine char deviceName[1024]; //this string will hold the devices name cl_uint numberOfCores; //this variable holds the number of cores of on a device cl_long amountOfMemory; //this variable holds the amount of memory on a device cl_uint clockFreq; //this variable holds the clock frequency of a device cl_ulong maxAlocatableMem; //this variable holds the maximum allocatable memory cl_ulong localMem; //this variable holds local memory for a device cl_bool available; //this variable holds if the device is available size_t size_group, size_dimension; cl_uint integerVectorWidth, addr_bits; cl_uint cacheLineSize; cl_ulong globalCacheSize; cl_context context = 0; cl_int err; cl_command_queue commandQueue = 0; cl_program program = 0; cl_kernel kernel = 0; cl_mem memoryObjects[3] = {0, 0, 0}; size_t source_size; char *source_str; FILE *fp; const char fileName[]="hello_world_opencl.cl"; fp= fopen(fileName,"r"); if(!fp) {return -1; } source_str = (char *)malloc(0x100000); //improve this parameter source_size = fread(source_str, 1, 0x100000, fp); fclose(fp); //get the number of platforms clGetPlatformIDs (32, platforms, &num_platforms); printf("\nNumber of platforms:\t%u\n\n", num_platforms); //this is a loop for platforms for(i = 0; i < num_platforms; i++) { printf("Platform:\t\t%u\n\n", i); clGetPlatformInfo (platforms[i], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL); printf("\tPlatform Vendor:\t%s\n", vendor); clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_ALL, sizeof(devices), devices, &num_devices); printf("\tNumber of devices:\t%u\n\n", num_devices); //this is a loop for devices for(j = 0; j < num_devices; j++) { //scan in device information clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numberOfCores), &numberOfCores, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(amountOfMemory), &amountOfMemory, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clockFreq), &clockFreq, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxAlocatableMem), &maxAlocatableMem, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(localMem), &localMem, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_AVAILABLE, sizeof(available), &available, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_dimension), &size_dimension, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_group), &size_group, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &integerVectorWidth, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_ADDRESS_BITS, sizeof(cl_uint), &addr_bits, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(globalCacheSize), &globalCacheSize, NULL); clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheLineSize), &cacheLineSize, NULL); //print out device information printf("\tDevice: %u\n", j); printf("\t\tName:\t\t\t\t%s\n", deviceName); printf("\t\tVendor:\t\t\t\t%s\n", vendor); printf("\t\tAvailable:\t\t\t%s\n", available ? "Yes" : "No"); printf("\t\tCompute Units:\t\t\t%u\n", numberOfCores); printf("\t\tClock Frequency:\t\t%u mHz\n", clockFreq); printf("\t\tGlobal Memory:\t\t\t%0.00f mb\n", (double)amountOfMemory/1048576); printf("\t\tMax Allocateable Memory:\t%0.00f mb\n", (double)maxAlocatableMem/1048576); printf("\t\tLocal Memory:\t\t\t%u kb\n", (unsigned int)localMem); printf("\t\tWork item per dimension:\t%u\n", (unsigned int)size_dimension); printf("\t\tWork item per group:\t\t%u\n", (unsigned int)size_group); printf("\t\tVector width for integers\t%u\n", integerVectorWidth); printf("\t\tCache line:\t\t\t%u\n", cacheLineSize); printf("\t\tCache Size:\t\t\t%u\n", (unsigned int)globalCacheSize); } } // Create an OpenCL context context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &err); if (err != CL_SUCCESS) { //do cleaning up cout << "do clean up"; return 1; } // Create a command-queue on the first device available // on the created context commandQueue = clCreateCommandQueue(context, devices[0], 0, &err); if (commandQueue == NULL) { //do cleaning up cout << "do clean up"; return 1; } // Create program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &err); if (program == NULL) { //do cleaning up cout << "do clean up"; return 1; } // Build program err = clBuildProgram(program, 1, &devices[0], "", NULL, NULL); if (err != CL_SUCCESS) { //do cleaning up cout << "do clean up"; return 1; } // Create OpenCL kernel kernel = clCreateKernel(program, "hello_world_opencl", NULL); //try clCreateKernelsInProgram if (kernel == NULL) { //do cleaning up cout << "do clean up"; return 1; } /* [Setup memory] */ /* Number of elements in the arrays of input and output data. */ cl_int arraySize = 16384; /* The buffers are the size of the arrays. */ size_t bufferSize = arraySize * sizeof(cl_int); /* * Ask the OpenCL implementation to allocate buffers for the data. * We ask the OpenCL implemenation to allocate memory rather than allocating * it on the CPU to avoid having to copy the data later. * The read/write flags relate to accesses to the memory from within the kernel. */ // Create Memory objects memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err); if (err != CL_SUCCESS) { //do cleaning up cout << "do clean up"; return 1; } memoryObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err); if (err != CL_SUCCESS) { //do cleaning up cout << "do clean up"; return 1; } memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err); if (err != CL_SUCCESS) { //do cleaning up cout << "do clean up"; return 1; } /* [Map the buffers to pointers] */ /* Map the memory buffers created by the OpenCL implementation to pointers so we can access them on the CPU. */ cl_int* inputA = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[0], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &err); if (err != CL_SUCCESS) { //do cleaning up cout << "do clean up"; return 1; } cl_int* inputB = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[1], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &err); if (err != CL_SUCCESS) { //do cleaning up cout << "do clean up"; return 1; } /* [Map the buffers to pointers] */ /* [Initialize the input data] */ for (int i = 0; i < arraySize; i++) { inputA[i] = i; inputB[i] = i; } /* [Initialize the input data] */ /* [Un-map the buffers] */ /* * Unmap the memory objects as we have finished using them from the CPU side. * We unmap the memory because otherwise: * - reads and writes to that memory from inside a kernel on the OpenCL side are undefined. * - the OpenCL implementation cannot free the memory when it is finished. */ err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[0], inputA, 0, NULL, NULL); if (err != CL_SUCCESS) { //do cleaning up cout << "do clean up"; return 1; } err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[1], inputB, 0, NULL, NULL); if (err != CL_SUCCESS) { //do cleaning up cout << "do clean up"; return 1; } // Get size of workgroup size_t workgroup_size; err = clGetKernelWorkGroupInfo(kernel, devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL); if (err != CL_SUCCESS) { //do cleaning up cout << "do clean up"; return 1; } // Queue the kernel up for execution across the array size_t globalWorkSize[1] = { bufferSize/4}; size_t localWorkSize[1] = { size_group}; cl_event event = 0; // Set kernel arguments err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0]); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1]); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2]); if (err != CL_SUCCESS) { printf("Error setting kernel arguments.\n"); //do cleaning up cout << "do clean up"<<err<<endl; return 1; } err = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &event); if (err != CL_SUCCESS) { printf("Error queuing kernel for execution.\n"); //do cleaning up cout << "do clean up"<<err<<endl; return 1; } /* Wait for kernel execution completion. */ err = clFinish(commandQueue); if (err != CL_SUCCESS) { printf("Error waiting for execution.\n"); //do cleaning up cout << "do clean up"<<err<<endl; return 1; } cl_int status; err = clWaitForEvents( 1, &event); if( err != CL_SUCCESS ) // 1 of the events in the queue failed { // Optional: Check if it's the kernel err = clGetEventInfo(event, CL_COMMAND_NDRANGE_KERNEL, sizeof(status),&status, NULL); cout<< "event "<<err<<endl; if(status != CL_COMPLETE ) printf("ERROR executing the kernel %d\n", status); } /* Get a pointer to the output data. */ cl_int* output = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, bufferSize, 0, NULL, NULL, &err); if (err != CL_SUCCESS) { printf("Error reading result.\n"); //do cleaning up cout << "do clean up"<<err<<endl; return 1; } /* [Output the results] */ //for (int i = 0; i < arraySize; i++) //{ // cout << "i = " << i << ", output = " << output[i] << "\n"; //} /* Unmap the memory object as we are finished using them from the CPU side. */ err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[2], output, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error unmappping.\n"); //do cleaning up cout << "do clean up"<<err<<endl; return 1; } clReleaseMemObject(memoryObjects[0]); clReleaseMemObject(memoryObjects[1]); clReleaseMemObject(memoryObjects[2]); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(commandQueue); clReleaseContext(context); return 0; }