This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

CL_INVALID_VALUE when increasing Global WorkSize

I am running a simple program on my Mali T628 to test the GPU performance versus CPU.

I developed a code to add two integer arrays and  it works fine. However, when I start increasing the array size up to 16384 my kernel doesn't  execute right and I get and error when I apply clWaitForEvents(). and clGetEventInfo which says CL_INVALID_VALUE.

 

My global work size is 16384,

My local work-group size is 256 (know by calling clGetKernelWorkGroupInfo())

I check other specifications like memory and cache and it should work. The compiler is supposed to split the global work size and execute it in several steps.

 

Any idea of the problem?

Thanks

Parents
  • (1) What parameters you passing into the clGetEventInfo() structure which is returning the error?
    (2) Do you have a reproducer you can share?
    (3) What device and driver version are you running on?


    If you are running on Android you can get the driver release information by opening a Chrome browser tab and navigating to "chrome://gpu/"; the GL_VENDOR, GL_RENDERER, and GL_VERSION strings are in the Driver Information table.

    Regards,
    Pete

Reply
  • (1) What parameters you passing into the clGetEventInfo() structure which is returning the error?
    (2) Do you have a reproducer you can share?
    (3) What device and driver version are you running on?


    If you are running on Android you can get the driver release information by opening a Chrome browser tab and navigating to "chrome://gpu/"; the GL_VENDOR, GL_RENDERER, and GL_VERSION strings are in the Driver Information table.

    Regards,
    Pete

Children
  • Thanks for your reply,

    I attached the full code. Here I put the part that concerns the error:

    err = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &event);
    if (err != CL_SUCCESS)
    {
        printf("Error queuing kernel for execution.\n");
        //do cleaning up
        cout << "do clean up"<<err<<endl;
        return 1;
    }

    cl_int status;
    err = clWaitForEvents( 1, &event);
    if( err != CL_SUCCESS ) // 1 of the events in the queue failed
    {
        err = clGetEventInfo(event, CL_COMMAND_NDRANGE_KERNEL, sizeof(status), &status, NULL);
        cout<< "event "<<err<<endl;
        if(status != CL_COMPLETE )
             printf("ERROR executing the kernel %d\n", status);
    }

     

    I am using the Mali T628 on the Odroid UX4 for linux and using openCL to run my application. As for the driver, I didnt download any driver, I read that the driver was already within the image of the OS when it was shipped.

     

    Best regards

     

    Miguel

     

     

    #include <iostream>
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    
    #include <CL/cl.h>
    
    
    using namespace std;                                               
    
    int main(void)
    {
    
    unsigned int i, j;				//iterator variables for loops
    	
    
    	cl_platform_id platforms[32];			//an array to hold the IDs of all the platforms, hopefuly there won't be more than 32
    	cl_uint num_platforms;				//this number will hold the number of platforms on this machine
    	char vendor[1024];				//this strirng will hold a platforms vendor	
    
    	cl_device_id devices[32];			//this variable holds the number of devices for each platform, hopefully it won't be more than 32 per platform
    	cl_uint num_devices;				//this number will hold the number of devices on this machine
    	char deviceName[1024];				//this string will hold the devices name
    	cl_uint numberOfCores;				//this variable holds the number of cores of on a device
    	cl_long amountOfMemory;				//this variable holds the amount of memory on a device
    	cl_uint clockFreq;				//this variable holds the clock frequency of a device
    	cl_ulong maxAlocatableMem;			//this variable holds the maximum allocatable memory
    	cl_ulong localMem;				//this variable holds local memory for a device
    	cl_bool	available;				//this variable holds if the device is available
    	size_t size_group, size_dimension;
    	cl_uint integerVectorWidth, addr_bits;
    	cl_uint cacheLineSize;
            cl_ulong globalCacheSize;
    
    
    	cl_context context = 0;
    	cl_int err;
      	cl_command_queue commandQueue = 0;
     	cl_program program = 0;
    	cl_kernel kernel = 0;
    	cl_mem memoryObjects[3] = {0, 0, 0};
    
        
        	size_t source_size;	
    	char *source_str;
    	FILE *fp;	
    	const char fileName[]="hello_world_opencl.cl";
    	
    	fp= fopen(fileName,"r");
    	if(!fp)
    	{return -1; }
    	source_str = (char *)malloc(0x100000);   //improve this parameter
    	source_size = fread(source_str, 1, 0x100000, fp);
    	fclose(fp);
    	
    
    	//get the number of platforms
    
    	clGetPlatformIDs (32, platforms, &num_platforms);
    	printf("\nNumber of platforms:\t%u\n\n", num_platforms);	
    
    	//this is a loop for platforms
    
    	for(i = 0; i < num_platforms; i++)
    	{
    		printf("Platform:\t\t%u\n\n", i);
    		clGetPlatformInfo (platforms[i], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL);
    		printf("\tPlatform Vendor:\t%s\n", vendor);		
    
    		clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_ALL, sizeof(devices), devices, &num_devices);
    		printf("\tNumber of devices:\t%u\n\n", num_devices);
    
    		//this is a loop for devices
    
    		for(j = 0; j < num_devices; j++)
    		{
    			//scan in device information
    			clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numberOfCores), &numberOfCores, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(amountOfMemory), &amountOfMemory, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clockFreq), &clockFreq, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxAlocatableMem), &maxAlocatableMem, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(localMem), &localMem, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_AVAILABLE, sizeof(available), &available, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_dimension), &size_dimension, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_group), &size_group, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &integerVectorWidth, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_ADDRESS_BITS, sizeof(cl_uint), &addr_bits, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(globalCacheSize), &globalCacheSize, NULL);
        			clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheLineSize), &cacheLineSize, NULL);
    
    			//print out device information
    			printf("\tDevice: %u\n", j);
    			printf("\t\tName:\t\t\t\t%s\n", deviceName);
    			printf("\t\tVendor:\t\t\t\t%s\n", vendor);
    			printf("\t\tAvailable:\t\t\t%s\n", available ? "Yes" : "No");
    			printf("\t\tCompute Units:\t\t\t%u\n", numberOfCores);
    			printf("\t\tClock Frequency:\t\t%u mHz\n", clockFreq);
    			printf("\t\tGlobal Memory:\t\t\t%0.00f mb\n", (double)amountOfMemory/1048576);
    			printf("\t\tMax Allocateable Memory:\t%0.00f mb\n", (double)maxAlocatableMem/1048576);
    			printf("\t\tLocal Memory:\t\t\t%u kb\n", (unsigned int)localMem);
    			printf("\t\tWork item per dimension:\t%u\n", (unsigned int)size_dimension);
    			printf("\t\tWork item per group:\t\t%u\n", (unsigned int)size_group);
    			printf("\t\tVector width for integers\t%u\n", integerVectorWidth);
    			printf("\t\tCache line:\t\t\t%u\n", cacheLineSize);
    			printf("\t\tCache Size:\t\t\t%u\n", (unsigned int)globalCacheSize);
    		}
    	}
    
    	// Create an OpenCL context
    	context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &err);
    	if (err != CL_SUCCESS)
    	{
    	      //do cleaning up
    		cout << "do clean up";
     	      return 1;
    	}
    	
    	// Create a command-queue on the first device available
    	// on the created context
    	commandQueue = clCreateCommandQueue(context, devices[0], 0, &err);
    	if (commandQueue == NULL)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}
    	
    	// Create program	
    	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &err);
    	if (program == NULL)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}
    
    	// Build program
    	err = clBuildProgram(program, 1, &devices[0], "", NULL, NULL);
    	if (err != CL_SUCCESS)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}
    
    	// Create OpenCL kernel
    	kernel = clCreateKernel(program, "hello_world_opencl", NULL);	//try clCreateKernelsInProgram
    	if (kernel == NULL)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}
    
    	/* [Setup memory] */
    	/* Number of elements in the arrays of input and output data. */
    	cl_int arraySize = 16384;
    	   
    	/* The buffers are the size of the arrays. */
    	size_t bufferSize = arraySize * sizeof(cl_int);
    	/*
    	   * Ask the OpenCL implementation to allocate buffers for the data.
    	   * We ask the OpenCL implemenation to allocate memory rather than allocating
      	   * it on the CPU to avoid having to copy the data later.
    	   * The read/write flags relate to accesses to the memory from within the kernel.
    	   */
    	
    	// Create Memory objects
    	memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err);
    	if (err != CL_SUCCESS)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}
    
    	memoryObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err);	
    	if (err != CL_SUCCESS)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}
    
    	memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err);	
    	if (err != CL_SUCCESS)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}
    
    	/* [Map the buffers to pointers] */
    	/* Map the memory buffers created by the OpenCL implementation to pointers so we can access them on the CPU. */
    
    	cl_int* inputA = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[0], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &err);
    	if (err != CL_SUCCESS)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}
    	  
    	cl_int* inputB = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[1], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &err);
    	if (err != CL_SUCCESS)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}
    	/* [Map the buffers to pointers] */
    
    	/* [Initialize the input data] */
    	for (int i = 0; i < arraySize; i++)
    	{
    		inputA[i] = i;
    		inputB[i] = i;
    	}
    	/* [Initialize the input data] */
    
    	/* [Un-map the buffers] */
    	/*
    	  * Unmap the memory objects as we have finished using them from the CPU side.
    	  * We unmap the memory because otherwise:
    	  * - reads and writes to that memory from inside a kernel on the OpenCL side are undefined.
    	  * - the OpenCL implementation cannot free the memory when it is finished.
            */
    	err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[0], inputA, 0, NULL, NULL);
    	if (err != CL_SUCCESS)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}
    	  
    	err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[1], inputB, 0, NULL, NULL);
     	if (err != CL_SUCCESS)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}
    
    	// Get size of workgroup
    	size_t workgroup_size;
    	err = clGetKernelWorkGroupInfo(kernel, devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
    	if (err != CL_SUCCESS)
    	{
    	       //do cleaning up
    		cout << "do clean up";
    	       return 1;
    	}	
    
    	//   Queue the kernel up for execution across the array
    	size_t globalWorkSize[1] = { bufferSize/4};
    	size_t localWorkSize[1] = { size_group};
    	cl_event event = 0;	
    	
    	//   Set kernel arguments		
    	err =  clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0]);
    	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1]);
    	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2]);
    	if (err != CL_SUCCESS)
    	{
    	       printf("Error setting kernel arguments.\n");
    	       //do cleaning up
    		cout << "do clean up"<<err<<endl;
    	       return 1;
    	}	
    
    	err = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &event);
    	if (err != CL_SUCCESS)
    	{
    	       printf("Error queuing kernel for execution.\n");
    	       //do cleaning up
    		cout << "do clean up"<<err<<endl;
    	       return 1;
    	}
    
    	/* Wait for kernel execution completion. */
    	err = clFinish(commandQueue);	
    	if (err != CL_SUCCESS)
    	{
    	       printf("Error waiting for execution.\n");
    	       //do cleaning up
    		cout << "do clean up"<<err<<endl;
    	       return 1;
    	}
    
    	cl_int status;
    	err = clWaitForEvents( 1, &event);
    	if( err != CL_SUCCESS ) // 1 of the events in the queue failed
    	{
    	// Optional: Check if it's the kernel
      	err = clGetEventInfo(event, CL_COMMAND_NDRANGE_KERNEL, sizeof(status),&status, NULL);
      	cout<< "event "<<err<<endl;
     	if(status != CL_COMPLETE )
    	printf("ERROR executing the kernel %d\n", status);
    }
    
    	/* Get a pointer to the output data. */
    	cl_int* output = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, bufferSize, 0, NULL, NULL, &err);
    	if (err != CL_SUCCESS)
    	{
    	       printf("Error reading result.\n");
    	       //do cleaning up
    		cout << "do clean up"<<err<<endl;
    	       return 1;
    	}
    
    	/* [Output the results] */
    	//for (int i = 0; i < arraySize; i++)
    	//{
    	//	cout << "i = " << i << ", output = " <<  output[i] << "\n";
    	//}
    	  
    	/* Unmap the memory object as we are finished using them from the CPU side. */
    	err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[2], output, 0, NULL, NULL);
    	if (err != CL_SUCCESS)
    	{
    	       printf("Error unmappping.\n");
    	       //do cleaning up
    		cout << "do clean up"<<err<<endl;
    	       return 1;
    	}
    	
    	clReleaseMemObject(memoryObjects[0]);
    	clReleaseMemObject(memoryObjects[1]);
    	clReleaseMemObject(memoryObjects[2]);
    	clReleaseKernel(kernel);
    	clReleaseProgram(program);
    	clReleaseCommandQueue(commandQueue);
    	clReleaseContext(context);
    return 0;
    	
    
    
    }