CL_INVALID_VALUE when increasing Global WorkSize - Graphics, Gaming, and VR forum - Support forums

Thanks for your reply,
I attached the full code. Here I put the part that concerns the error:
err = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &event);
if (err != CL_SUCCESS)
{
printf("Error queuing kernel for execution.\n");
//do cleaning up
cout << "do clean up"<<err<<endl;
return 1;
}
cl_int status;
err = clWaitForEvents( 1, &event);
if( err != CL_SUCCESS ) // 1 of the events in the queue failed
{
err = clGetEventInfo(event, CL_COMMAND_NDRANGE_KERNEL, sizeof(status), &status, NULL);
cout<< "event "<<err<<endl;
if(status != CL_COMPLETE )
printf("ERROR executing the kernel %d\n", status);
}
I am using the Mali T628 on the Odroid UX4 for linux and using openCL to run my application. As for the driver, I didnt download any driver, I read that the driver was already within the image of the OS when it was shipped.
Best regards
Miguel
#include <iostream>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <CL/cl.h>


using namespace std;                                               

int main(void)
{

unsigned int i, j;				//iterator variables for loops
	

	cl_platform_id platforms[32];			//an array to hold the IDs of all the platforms, hopefuly there won't be more than 32
	cl_uint num_platforms;				//this number will hold the number of platforms on this machine
	char vendor[1024];				//this strirng will hold a platforms vendor	

	cl_device_id devices[32];			//this variable holds the number of devices for each platform, hopefully it won't be more than 32 per platform
	cl_uint num_devices;				//this number will hold the number of devices on this machine
	char deviceName[1024];				//this string will hold the devices name
	cl_uint numberOfCores;				//this variable holds the number of cores of on a device
	cl_long amountOfMemory;				//this variable holds the amount of memory on a device
	cl_uint clockFreq;				//this variable holds the clock frequency of a device
	cl_ulong maxAlocatableMem;			//this variable holds the maximum allocatable memory
	cl_ulong localMem;				//this variable holds local memory for a device
	cl_bool	available;				//this variable holds if the device is available
	size_t size_group, size_dimension;
	cl_uint integerVectorWidth, addr_bits;
	cl_uint cacheLineSize;
        cl_ulong globalCacheSize;


	cl_context context = 0;
	cl_int err;
  	cl_command_queue commandQueue = 0;
 	cl_program program = 0;
	cl_kernel kernel = 0;
	cl_mem memoryObjects[3] = {0, 0, 0};

    
    	size_t source_size;	
	char *source_str;
	FILE *fp;	
	const char fileName[]="hello_world_opencl.cl";
	
	fp= fopen(fileName,"r");
	if(!fp)
	{return -1; }
	source_str = (char *)malloc(0x100000);   //improve this parameter
	source_size = fread(source_str, 1, 0x100000, fp);
	fclose(fp);
	

	//get the number of platforms

	clGetPlatformIDs (32, platforms, &num_platforms);
	printf("\nNumber of platforms:\t%u\n\n", num_platforms);	

	//this is a loop for platforms

	for(i = 0; i < num_platforms; i++)
	{
		printf("Platform:\t\t%u\n\n", i);
		clGetPlatformInfo (platforms[i], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL);
		printf("\tPlatform Vendor:\t%s\n", vendor);		

		clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_ALL, sizeof(devices), devices, &num_devices);
		printf("\tNumber of devices:\t%u\n\n", num_devices);

		//this is a loop for devices

		for(j = 0; j < num_devices; j++)
		{
			//scan in device information
			clGetDeviceInfo(devices[j], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_VENDOR, sizeof(vendor), vendor, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numberOfCores), &numberOfCores, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(amountOfMemory), &amountOfMemory, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clockFreq), &clockFreq, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxAlocatableMem), &maxAlocatableMem, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(localMem), &localMem, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_AVAILABLE, sizeof(available), &available, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_dimension), &size_dimension, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_group), &size_group, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &integerVectorWidth, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_ADDRESS_BITS, sizeof(cl_uint), &addr_bits, NULL);
			clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(globalCacheSize), &globalCacheSize, NULL);
    			clGetDeviceInfo(devices[j], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cacheLineSize), &cacheLineSize, NULL);

			//print out device information
			printf("\tDevice: %u\n", j);
			printf("\t\tName:\t\t\t\t%s\n", deviceName);
			printf("\t\tVendor:\t\t\t\t%s\n", vendor);
			printf("\t\tAvailable:\t\t\t%s\n", available ? "Yes" : "No");
			printf("\t\tCompute Units:\t\t\t%u\n", numberOfCores);
			printf("\t\tClock Frequency:\t\t%u mHz\n", clockFreq);
			printf("\t\tGlobal Memory:\t\t\t%0.00f mb\n", (double)amountOfMemory/1048576);
			printf("\t\tMax Allocateable Memory:\t%0.00f mb\n", (double)maxAlocatableMem/1048576);
			printf("\t\tLocal Memory:\t\t\t%u kb\n", (unsigned int)localMem);
			printf("\t\tWork item per dimension:\t%u\n", (unsigned int)size_dimension);
			printf("\t\tWork item per group:\t\t%u\n", (unsigned int)size_group);
			printf("\t\tVector width for integers\t%u\n", integerVectorWidth);
			printf("\t\tCache line:\t\t\t%u\n", cacheLineSize);
			printf("\t\tCache Size:\t\t\t%u\n", (unsigned int)globalCacheSize);
		}
	}

	// Create an OpenCL context
	context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &err);
	if (err != CL_SUCCESS)
	{
	      //do cleaning up
		cout << "do clean up";
 	      return 1;
	}
	
	// Create a command-queue on the first device available
	// on the created context
	commandQueue = clCreateCommandQueue(context, devices[0], 0, &err);
	if (commandQueue == NULL)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}
	
	// Create program	
	program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &err);
	if (program == NULL)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	// Build program
	err = clBuildProgram(program, 1, &devices[0], "", NULL, NULL);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	// Create OpenCL kernel
	kernel = clCreateKernel(program, "hello_world_opencl", NULL);	//try clCreateKernelsInProgram
	if (kernel == NULL)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	/* [Setup memory] */
	/* Number of elements in the arrays of input and output data. */
	cl_int arraySize = 16384;
	   
	/* The buffers are the size of the arrays. */
	size_t bufferSize = arraySize * sizeof(cl_int);
	/*
	   * Ask the OpenCL implementation to allocate buffers for the data.
	   * We ask the OpenCL implemenation to allocate memory rather than allocating
  	   * it on the CPU to avoid having to copy the data later.
	   * The read/write flags relate to accesses to the memory from within the kernel.
	   */
	
	// Create Memory objects
	memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	memoryObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err);	
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &err);	
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	/* [Map the buffers to pointers] */
	/* Map the memory buffers created by the OpenCL implementation to pointers so we can access them on the CPU. */

	cl_int* inputA = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[0], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &err);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}
	  
	cl_int* inputB = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[1], CL_TRUE, CL_MAP_WRITE, 0, bufferSize, 0, NULL, NULL, &err);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}
	/* [Map the buffers to pointers] */

	/* [Initialize the input data] */
	for (int i = 0; i < arraySize; i++)
	{
		inputA[i] = i;
		inputB[i] = i;
	}
	/* [Initialize the input data] */

	/* [Un-map the buffers] */
	/*
	  * Unmap the memory objects as we have finished using them from the CPU side.
	  * We unmap the memory because otherwise:
	  * - reads and writes to that memory from inside a kernel on the OpenCL side are undefined.
	  * - the OpenCL implementation cannot free the memory when it is finished.
        */
	err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[0], inputA, 0, NULL, NULL);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}
	  
	err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[1], inputB, 0, NULL, NULL);
 	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}

	// Get size of workgroup
	size_t workgroup_size;
	err = clGetKernelWorkGroupInfo(kernel, devices[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
	if (err != CL_SUCCESS)
	{
	       //do cleaning up
		cout << "do clean up";
	       return 1;
	}	

	//   Queue the kernel up for execution across the array
	size_t globalWorkSize[1] = { bufferSize/4};
	size_t localWorkSize[1] = { size_group};
	cl_event event = 0;	
	
	//   Set kernel arguments		
	err =  clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0]);
	err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1]);
	err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2]);
	if (err != CL_SUCCESS)
	{
	       printf("Error setting kernel arguments.\n");
	       //do cleaning up
		cout << "do clean up"<<err<<endl;
	       return 1;
	}	

	err = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &event);
	if (err != CL_SUCCESS)
	{
	       printf("Error queuing kernel for execution.\n");
	       //do cleaning up
		cout << "do clean up"<<err<<endl;
	       return 1;
	}

	/* Wait for kernel execution completion. */
	err = clFinish(commandQueue);	
	if (err != CL_SUCCESS)
	{
	       printf("Error waiting for execution.\n");
	       //do cleaning up
		cout << "do clean up"<<err<<endl;
	       return 1;
	}

	cl_int status;
	err = clWaitForEvents( 1, &event);
	if( err != CL_SUCCESS ) // 1 of the events in the queue failed
	{
	// Optional: Check if it's the kernel
  	err = clGetEventInfo(event, CL_COMMAND_NDRANGE_KERNEL, sizeof(status),&status, NULL);
  	cout<< "event "<<err<<endl;
 	if(status != CL_COMPLETE )
	printf("ERROR executing the kernel %d\n", status);
}

	/* Get a pointer to the output data. */
	cl_int* output = (cl_int*)clEnqueueMapBuffer(commandQueue, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, bufferSize, 0, NULL, NULL, &err);
	if (err != CL_SUCCESS)
	{
	       printf("Error reading result.\n");
	       //do cleaning up
		cout << "do clean up"<<err<<endl;
	       return 1;
	}

	/* [Output the results] */
	//for (int i = 0; i < arraySize; i++)
	//{
	//	cout << "i = " << i << ", output = " <<  output[i] << "\n";
	//}
	  
	/* Unmap the memory object as we are finished using them from the CPU side. */
	err = clEnqueueUnmapMemObject(commandQueue, memoryObjects[2], output, 0, NULL, NULL);
	if (err != CL_SUCCESS)
	{
	       printf("Error unmappping.\n");
	       //do cleaning up
		cout << "do clean up"<<err<<endl;
	       return 1;
	}
	
	clReleaseMemObject(memoryObjects[0]);
	clReleaseMemObject(memoryObjects[1]);
	clReleaseMemObject(memoryObjects[2]);
	clReleaseKernel(kernel);
	clReleaseProgram(program);
	clReleaseCommandQueue(commandQueue);
	clReleaseContext(context);
return 0;
	


}