This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

OpenCL : Can the two GPU devices in Mali-T628 work in parallel

Hi,

I have an Odroid XU3 board, and I am trying to program the Mali-T628 GPU on this board with OpenCL.

With the devices example that comes with the Mali SDK, I came to understand that there are two GPU devices on the Mali-T628. One device with 4 compute units and another device with 2 compute units. I was able to use these devices separately and found that the device with 2 compute units is slower than the device with 4 compute units. But I could not get them to run in parallel.

I created separate command queues for these devices and enqueued the kernels (assigning double the work to the larger device). Though the two kernels seem to be put in their queues immediately, the second kernel seems to start execution only after the first completes. From the profiling information, it seems that the kernels are being executed sequentially. The profiling information is given below. Note the queued time for the second kernel.

Profiling information:

Queued time: 0.334ms

Wait time: 21.751ms

Run time: 12246.8ms

Profiling information:

Queued time: 12269.4ms

Wait time: 0.183916ms

Run time: 12494.5ms

Is this sequential execution expected?


Thanks in advance,
--Kiran

Parents
  • Hello Anthony,

    Good to know that we can use the devices in parallel.

    Code is given below.

    I create the OpenCL context, then get the platforms, and then get the devices. For each device I call a function, initOpenCL, which creates the OpenCL program and kernel and ties the command queue to the device. I use slightly modified versions of the functions given in the Mali SDK.


    --Kiran

    /*
     * Creates a profiling-enabled command queue for *device inside context
     * and stores the resulting handle in *commandQueue.
     *
     * Returns true when checkSuccess() accepts the status reported by
     * clCreateCommandQueue(); otherwise logs a message to cerr and returns
     * false.
     */
    bool createCommandQueue(cl_context context, cl_command_queue* commandQueue, cl_device_id* device)
    {
        cl_int status = 0;

        /* Bind the new queue to the device the caller selected. */
        *commandQueue = clCreateCommandQueue(context, *device, CL_QUEUE_PROFILING_ENABLE, &status);

        if (checkSuccess(status))
        {
            return true;
        }

        cerr << "Failed to create the OpenCL command queue. " << __FILE__ << ":"<< __LINE__ << endl;
        return false;
    }

    int initOpenCL(cl_context& context, cl_device_id& device, cl_command_queue& commandQueue, cl_program& program, cl_kernel& kernel, cl_mem* memoryObjects)

    {

        const unsigned int numberOfMemoryObjects = 3;

        cl_int errorNumber;

        if (!createCommandQueue(context, &commandQueue, &device))

        {

            cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);

            cerr << "Failed to create the OpenCL command queue. " << __FILE__ << ":"<< __LINE__ << endl;

            return 1;

        } else {

            cout<<"Successfully created commandQ"<<endl ;

        }

        if (!createProgram(context, device, "assets/matmul.cl", &program))

        {

            cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);

            cerr << "Failed to create OpenCL program." << __FILE__ << ":"<< __LINE__ << endl;

            return 1;

        } else {

            cout<<"Successfully created program"<<endl ;

        }

        kernel = clCreateKernel(program, "matmul_simple_loop_switched", &errorNumber);

        if (!checkSuccess(errorNumber))

        {

            cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);

            cerr << "Failed to create OpenCL kernel. " << __FILE__ << ":"<< __LINE__ << endl;

            return 1;

        } else {

            cout<<"Successfully created kernel"<<endl ;

        }

        return 0 ;

    }

    int main()

    {

        cl_uint num_platforms;       

        cl_platform_id platforms[32];

        cl_uint num_devices;            

        cl_device_id devices[32];        

        cl_command_queue commandQueue[32] ;

        cl_program program[32] ;

        cl_kernel kernel[32] ;

        cl_context context = 0;

        createContext(&context)

        clGetPlatformIDs (32, platforms, &num_platforms);

        int i, j ;

        for(i = 0; i < num_platforms; i++) {

                clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_ALL, sizeof(devices), devices, &num_devices);

                for(j = 0; j < num_devices; j++)

                {

                        initOpenCL(context,devices[j],commandQueue[j],program[j],kernel[j],memoryObjects) ;

                }

        }

    }

    --Kiran

Reply
  • Hello Anthony,

    Good to know that we can use the devices in parallel.

    Code is given below.

    I create the OpenCL context, then get the platforms, and then get the devices. For each device I call a function, initOpenCL, which creates the OpenCL program and kernel and ties the command queue to the device. I use slightly modified versions of the functions given in the Mali SDK.


    --Kiran

    /*
     * Creates a profiling-enabled command queue for *device inside context
     * and stores the resulting handle in *commandQueue.
     *
     * Returns true when checkSuccess() accepts the status reported by
     * clCreateCommandQueue(); otherwise logs a message to cerr and returns
     * false.
     */
    bool createCommandQueue(cl_context context, cl_command_queue* commandQueue, cl_device_id* device)
    {
        cl_int status = 0;

        /* Bind the new queue to the device the caller selected. */
        *commandQueue = clCreateCommandQueue(context, *device, CL_QUEUE_PROFILING_ENABLE, &status);

        if (checkSuccess(status))
        {
            return true;
        }

        cerr << "Failed to create the OpenCL command queue. " << __FILE__ << ":"<< __LINE__ << endl;
        return false;
    }

    int initOpenCL(cl_context& context, cl_device_id& device, cl_command_queue& commandQueue, cl_program& program, cl_kernel& kernel, cl_mem* memoryObjects)

    {

        const unsigned int numberOfMemoryObjects = 3;

        cl_int errorNumber;

        if (!createCommandQueue(context, &commandQueue, &device))

        {

            cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);

            cerr << "Failed to create the OpenCL command queue. " << __FILE__ << ":"<< __LINE__ << endl;

            return 1;

        } else {

            cout<<"Successfully created commandQ"<<endl ;

        }

        if (!createProgram(context, device, "assets/matmul.cl", &program))

        {

            cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);

            cerr << "Failed to create OpenCL program." << __FILE__ << ":"<< __LINE__ << endl;

            return 1;

        } else {

            cout<<"Successfully created program"<<endl ;

        }

        kernel = clCreateKernel(program, "matmul_simple_loop_switched", &errorNumber);

        if (!checkSuccess(errorNumber))

        {

            cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);

            cerr << "Failed to create OpenCL kernel. " << __FILE__ << ":"<< __LINE__ << endl;

            return 1;

        } else {

            cout<<"Successfully created kernel"<<endl ;

        }

        return 0 ;

    }

    int main()

    {

        cl_uint num_platforms;       

        cl_platform_id platforms[32];

        cl_uint num_devices;            

        cl_device_id devices[32];        

        cl_command_queue commandQueue[32] ;

        cl_program program[32] ;

        cl_kernel kernel[32] ;

        cl_context context = 0;

        createContext(&context)

        clGetPlatformIDs (32, platforms, &num_platforms);

        int i, j ;

        for(i = 0; i < num_platforms; i++) {

                clGetDeviceIDs (platforms[i], CL_DEVICE_TYPE_ALL, sizeof(devices), devices, &num_devices);

                for(j = 0; j < num_devices; j++)

                {

                        initOpenCL(context,devices[j],commandQueue[j],program[j],kernel[j],memoryObjects) ;

                }

        }

    }

    --Kiran

Children
No data