This discussion has been locked.

You can no longer post new replies to this discussion. If you have a question you can start a new discussion

CL_INVALID_VALUE when increasing Global WorkSize

I am running a simple program on my Mali T628 to test the GPU performance versus CPU.

I developed a code to add two integer arrays and it works fine. However, when I start increasing the array size up to 16384 my kernel doesn't execute right and I get and error when I apply clWaitForEvents(). and clGetEventInfo which says CL_INVALID_VALUE.

My global work size is 16384,

My local work-group size is 256 (know by calling clGetKernelWorkGroupInfo())

I check other specifications like memory and cache and it should work. The compiler is supposed to split the global work size and execute it in several steps.

Any idea of the problem?

Thanks

Parents

Peter Harris over 7 years ago

(1) What parameters you passing into the clGetEventInfo() structure which is returning the error?
(2) Do you have a reproducer you can share?
(3) What device and driver version are you running on?

If you are running on Android you can get the driver release information by opening a Chrome browser tab and navigating to "chrome://gpu/"; the GL_VENDOR, GL_RENDERER, and GL_VERSION strings are in the Driver Information table.

Regards,
Pete
Cancel
Vote up 0 Vote down

Cancel

Reply

Peter Harris over 7 years ago

(1) What parameters you passing into the clGetEventInfo() structure which is returning the error?
(2) Do you have a reproducer you can share?
(3) What device and driver version are you running on?

If you are running on Android you can get the driver release information by opening a Chrome browser tab and navigating to "chrome://gpu/"; the GL_VENDOR, GL_RENDERER, and GL_VERSION strings are in the Driver Information table.

Regards,
Pete
Cancel
Vote up 0 Vote down

Cancel

Children

miguel.deprado over 7 years ago in reply to Peter Harris

Thanks for your reply,

I attached the full code. Here I put the part that concerns the error:

err = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &event);
if (err != CL_SUCCESS)
{
printf("Error queuing kernel for execution.\n");
//do cleaning up
cout << "do clean up"<<err<<endl;
return 1;
}

cl_int status;
err = clWaitForEvents( 1, &event);
if( err != CL_SUCCESS ) // 1 of the events in the queue failed
{
err = clGetEventInfo(event, CL_COMMAND_NDRANGE_KERNEL, sizeof(status), &status, NULL);
cout<< "event "<<err<<endl;
if(status != CL_COMPLETE )
printf("ERROR executing the kernel %d\n", status);
}

I am using the Mali T628 on the Odroid UX4 for linux and using openCL to run my application. As for the driver, I didnt download any driver, I read that the driver was already within the image of the OS when it was shipped.

Best regards

Miguel

#include <iostream>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <CL/cl.h>


using namespace std;                                               

int main(void)
{

unsigned int i, j;				//iterator variables for loops
	

	cl_platform_id platforms[32];			//an array to hold the IDs of all the platforms, hopefuly there won't be more than 32
	cl_uint num_platforms;				//this number will hold the number of platforms on this machine
	char vendor[1024];				//this strirng will hold a platforms vendor	

	cl_device_id devices[32];			//this variable holds the number of devices for each platform, hopefully it won't be more than 32 per platform
	cl_uint num_devices;				//this number will hold the number of devices on this machine
	char deviceName[1024];				//this string will hold the devices name
	cl_uint numberOfCores;				//this variable holds the number of cores of on a device
	cl_long amountOfMemory;				//this variable holds the amount of memory on a device
	cl_uint clockFreq;				//this variable holds the clock frequency of a device
	cl_ulong maxAlocatableMem;			//this variable holds the maximum allocatable memory
	cl_ulong localMem;				//this variable holds local memory for a device
	cl_bool	available;				//this variable holds if the device is available
	size_t size_group, size_dimension;
	cl_uint integerVectorWidth, addr_bits;
	cl_uint cacheLineSize;
        cl_ulong globalCacheSize;


	cl_context context = 0;
	cl_int err;
  	cl_command_queue commandQueue = 0;
 	cl_program program = 0;
	cl_kernel kernel = 0;
	cl_mem memoryObjects[3] = {0, 0, 0};

    
    	size_t source_size;	
	char *source_str;
	FILE *fp;	
	const char fileName[]="hello_world_opencl.cl";
	
	fp= fopen(fileName,"r");
	if(!fp)
	{return -1; }
	source_str = (char *)malloc(0x100000);   //improve this parameter
	source_size = fread(source_str, 1, 0x100000, fp);
	fclose(fp);
	

	//get the number of platforms

	clGetPlatformIDs (32, platforms, &num_platforms);
	printf("\nNumber of platforms:\t%u\n\n", num_platforms);	

	//this is a loop for platforms

	for(i = 0; i < num_platforms; i++)
	{
		printf("Platform:\t\t%u\n\n", i);
		clGetPlatformInfo (platforms[i], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL);
		printf("\tPlatform Vendor:\t%s\n", vendor);		

		clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_ALL, sizeof(devices), devices, &num_devices);
		printf("\tNumber of devices:\t%u\n\n", num_devices);

		//this is a loop for devices

		for(j = 0; j < num_devices; j++)
		{
			//scan in device information
			clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numberOfCores), &numberOfCores, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(amountOfMemory), &amountOfMemory, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clockFreq), &clockFreq, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxAlocatableMem), &maxAlocatableMem, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(localMem), &localMem, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_AVAILABLE, sizeof(available), &available, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_dimension), &size_dimension, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_group), &size_group, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &integerVectorWidth, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_ADDRESS_BITS, sizeof(cl_uint), &addr_bits, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(globalCacheSize), &globalCacheSize, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheLineSize), &cacheLineSize, NULL);

			//print out device information
			printf("\tDevice: %u\n", j);
			printf("\t\tName:\t\t\t\t%s\n", deviceName);
			printf("\t\tVendor:\t\t\t\t%s\n", vendor);
			printf("\t\tAvailable:\t\t\t%s\n", available ? "Yes" : "No");
			printf("\t\tCompute Units:\t\t\t%u\n", numberOfCores);
			printf("\t\tClock Frequency:\t\t%u mHz\n", clockFreq);
			printf("\t\tGlobal Memory:\t\t\t%0.00f mb\n", (double)amountOfMemory/1048576);
			printf("\t\tMax Allocateable Memory:\t%0.00f mb\n", (double)maxAlocatableMem/1048576);
			printf("\t\tLocal Memory:\t\t\t%u kb\n", (unsigned int)localMem);
			printf("\t\tWork item per dimension:\t%u\n", (unsigned int)size_dimension);
			printf("\t\tWork item per group:\t\t%u\n", (unsigned int)size_group);
			printf("\t\tVector width for integers\t%u\n", integerVectorWidth);
			printf("\t\tCache line:\t\t\t%u\n", cacheLineSize);
			printf("\t\tCache Size:\t\t\t%u\n", (unsigned int)globalCacheSize);
		}
	}

	// Create an OpenCL context
	context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &err);
	if (err != CL_SUCCESS)
	{
	      //do cleaning up
		cout << "do clean up";
 	      return 1;
	}
	
	// Create a command-queue on the first device available
	// on the created context
	commandQueue = clCreateCommandQueue(context, devices[0], 0, &err);
	if (commandQueue == NULL)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}
	
	// Create program	
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &err);
	if (program == NULL)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	// Build program
	err = clBuildProgram(program, 1, &devices[0], "", NULL, NULL);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	// Create OpenCL kernel
	kernel = clCreateKernel(program, "hello_world_opencl", NULL);	//try clCreateKernelsInProgram
	if (kernel == NULL)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	/* [Setup memory] */
	/* Number of elements in the arrays of input and output data. */
	cl_int arraySize = 16384;
	   
	/* The buffers are the size of the arrays. */
	size_t bufferSize = arraySize * sizeof(cl_int);
	/*
	   * Ask the OpenCL implementation to allocate buffers for the data.
	   * We ask the OpenCL implemenation to allocate memory rather than allocating
  	   * it on the CPU to avoid having to copy the data later.
	   * The read/write flags relate to accesses to the memory from within the kernel.
	   */
	
	// Create Memory objects
	memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	memoryObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err);	
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err);	
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	/* [Map the buffers to pointers] */
	/* Map the memory buffers created by the OpenCL implementation to pointers so we can access them on the CPU. */

	cl_int* inputA = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[0], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &err);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}
	  
	cl_int* inputB = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[1], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &err);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}
	/* [Map the buffers to pointers] */

	/* [Initialize the input data] */
	for (int i = 0; i < arraySize; i++)
	{
		inputA[i] = i;
		inputB[i] = i;
	}
	/* [Initialize the input data] */

	/* [Un-map the buffers] */
	/*
	  * Unmap the memory objects as we have finished using them from the CPU side.
	  * We unmap the memory because otherwise:
	  * - reads and writes to that memory from inside a kernel on the OpenCL side are undefined.
	  * - the OpenCL implementation cannot free the memory when it is finished.
        */
	err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[0], inputA, 0, NULL, NULL);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}
	  
	err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[1], inputB, 0, NULL, NULL);
 	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	// Get size of workgroup
	size_t workgroup_size;
	err = clGetKernelWorkGroupInfo(kernel, devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}	

	//   Queue the kernel up for execution across the array
	size_t globalWorkSize[1] = { bufferSize/4};
	size_t localWorkSize[1] = { size_group};
	cl_event event = 0;	
	
	//   Set kernel arguments		
	err =  clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0]);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1]);
	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2]);
	if (err != CL_SUCCESS)
	{
	       printf("Error setting kernel arguments.\n");
	       //do cleaning up
		cout << "do clean up"<<err<<endl;
	       return 1;
	}	

	err = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &event);
	if (err != CL_SUCCESS)
	{
	       printf("Error queuing kernel for execution.\n");
	       //do cleaning up
		cout << "do clean up"<<err<<endl;
	       return 1;
	}

	/* Wait for kernel execution completion. */
	err = clFinish(commandQueue);	
	if (err != CL_SUCCESS)
	{
	       printf("Error waiting for execution.\n");
	       //do cleaning up
		cout << "do clean up"<<err<<endl;
	       return 1;
	}

	cl_int status;
	err = clWaitForEvents( 1, &event);
	if( err != CL_SUCCESS ) // 1 of the events in the queue failed
	{
	// Optional: Check if it's the kernel
  	err = clGetEventInfo(event, CL_COMMAND_NDRANGE_KERNEL, sizeof(status),&status, NULL);
  	cout<< "event "<<err<<endl;
 	if(status != CL_COMPLETE )
	printf("ERROR executing the kernel %d\n", status);
}

	/* Get a pointer to the output data. */
	cl_int* output = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, bufferSize, 0, NULL, NULL, &err);
	if (err != CL_SUCCESS)
	{
	       printf("Error reading result.\n");
	       //do cleaning up
		cout << "do clean up"<<err<<endl;
	       return 1;
	}

	/* [Output the results] */
	//for (int i = 0; i < arraySize; i++)
	//{
	//	cout << "i = " << i << ", output = " <<  output[i] << "\n";
	//}
	  
	/* Unmap the memory object as we are finished using them from the CPU side. */
	err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[2], output, 0, NULL, NULL);
	if (err != CL_SUCCESS)
	{
	       printf("Error unmappping.\n");
	       //do cleaning up
		cout << "do clean up"<<err<<endl;
	       return 1;
	}
	
	clReleaseMemObject(memoryObjects[0]);
	clReleaseMemObject(memoryObjects[1]);
	clReleaseMemObject(memoryObjects[2]);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(commandQueue);
	clReleaseContext(context);
return 0;
	


}