This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question you can start a new discussion

mali_offline_compiler question

1、I found a strange problem, I tested the following two kernels , The main function and the two kernels are as follows.Test platform is Mali -T864.GlobalWorkSize=10000000(10M),The first takes 20ms and the second takes 15ms.

main.cpp

/*
 * This confidential and proprietary software may be used only as
 * authorised by a licensing agreement from ARM Limited
 *    (C) COPYRIGHT 2013 ARM Limited
 *        ALL RIGHTS RESERVED
 * The entire notice above must be reproduced on all authorised
 * copies and copies may only be made to the extent permitted
 * by a licensing agreement from ARM Limited.
 */
#define CL_TARGET_OPENCL_VERSION 120

#include "common.h"
#include "image.h"
#include <stdlib.h>
#include <CL/cl.h>
#include <iostream>

using namespace std;

/**
 * \brief Basic integer array addition implemented in OpenCL.
 * \details A sample which shows how to add two integer arrays and store the result in a third array.
 *          The main calculation code is in an OpenCL kernel which is executed on a GPU device.
 * \return The exit code of the application, non-zero if a problem occurred.
 */
int main(int argc, char *argv[])
{
    cl_int errorNumber;
    cl_device_id device = 0;
  
    //TINIT
    cl_context context = 0;
    cl_command_queue commandQueue = 0;
    cl_program program = 0;
    
    cl_kernel kernel = 0;
    int numberOfMemoryObjects = 3;
    cl_mem memoryObjects[3] = {0, 0, 0};
    

    if (!createContext(&context))
    {
        cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
        cerr << "Failed to create an OpenCL context. " << __FILE__ << ":"<< __LINE__ << endl;
        return 1;
    }

    if (!createCommandQueue(context, &commandQueue, &device))
    {
        cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
        cerr << "Failed to create the OpenCL command queue. " << __FILE__ << ":"<< __LINE__ << endl;
        return 1;
    }

    if (!createProgram(context, device, "assets/hello_world_opencl.cl", &program))
    {
        cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
        cerr << "Failed to create OpenCL program." << __FILE__ << ":"<< __LINE__ << endl;
        return 1;
    }

    kernel = clCreateKernel(program, "hello_world_opencl", &errorNumber);
    if (!checkSuccess(errorNumber))
    {
        cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
        cerr << "Failed to create OpenCL kernel. " << __FILE__ << ":"<< __LINE__ << endl;
        return 1;
    }
     
    /* [Setup memory] */
    /* Number of elements in the arrays of input and output data. */
    cl_int arraySize = 10000000;

    /* The buffers are the size of the arrays. */
    size_t bufferSize = arraySize * sizeof(cl_int);
    float *tempA=new float[arraySize];
    float *tempB=new float[arraySize];
    
    for(int i=0;i<arraySize;i++)
    {
       tempA[i]=i;
       tempB[i]=i;
    } 

    /*
     * Ask the OpenCL implementation to allocate buffers for the data.
     * We ask the OpenCL implemenation to allocate memory rather than allocating
     * it on the CPU to avoid having to copy the data later.
     * The read/write flags relate to accesses to the memory from within the kernel.
     */
    bool createMemoryObjectsSuccess = true;
    
    memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY , bufferSize, NULL, &errorNumber);
    createMemoryObjectsSuccess &= checkSuccess(errorNumber);
    if (!checkSuccess(clFinish(commandQueue)))
    {
        cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
        cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << endl;
        return 1;
    }
    //TIC;
    //memoryObjects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &errorNumber);
    clEnqueueWriteBuffer(commandQueue, memoryObjects[0], CL_TRUE, 0, arraySize*sizeof(int), tempA, 0, NULL, NULL);
    if (!checkSuccess(clFinish(commandQueue)))
    {
        cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
        cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << endl;
        return 1;
    }
    //TOC("mem1 ok");

    
    memoryObjects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY , bufferSize, NULL, &errorNumber);
    createMemoryObjectsSuccess &= checkSuccess(errorNumber);
    if (!checkSuccess(clFinish(commandQueue)))
    {
        cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
        cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << endl;
        return 1;
    }

    clEnqueueWriteBuffer(commandQueue, memoryObjects[1], CL_TRUE, 0, arraySize*sizeof(int), tempB, 0, NULL, NULL);
    if (!checkSuccess(clFinish(commandQueue)))
    {
        cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
        cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << endl;
        return 1;
    }

    memoryObjects[2] = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSize, NULL, &errorNumber);
    createMemoryObjectsSuccess &= checkSuccess(errorNumber);

    if (!createMemoryObjectsSuccess)
    {
        cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
        cerr << "Failed to create OpenCL buffer. " << __FILE__ << ":"<< __LINE__ << endl;
        return 1;
    }
    if (!checkSuccess(clFinish(commandQueue)))
    {
        cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
        cerr << "Failed waiting for kernel execution to finish. " << __FILE__ << ":"<< __LINE__ << endl;
        return 1;
    }

    
    /* [Set the kernel arguments] */
    bool setKernelArgumentsSuccess = true;
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjects[0]));
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[1]));
    setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjects[2]));
    //TOC("set arguments");
   
    /* [Set the kernel arguments] */

    /* An event to associate with the Kernel. Allows us to retrieve profiling information later. */
    cl_event event = 0;

    size_t globalWorksize[1] = {arraySize};
    /* Enqueue the kernel */
    if (!checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorksize, NULL, 0, NULL, &event)))
    {
        cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
        cerr << "Failed enqueuing the kernel. " << __FILE__ << ":"<< __LINE__ << endl;
        return 1;
    }
    /* [Global work size] */

    /* Wait for kernel execution completion. */

    clWaitForEvents(1,&event);
    //TOC("kernel ok");
    /* Print the profiling information for the event. */
    printProfilingInfo(event);
    /* Release the event object. */
    if (!checkSuccess(clReleaseEvent(event)))
    {
       cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
       cerr << "Failed releasing the event object. " << __FILE__ << ":"<< __LINE__ << endl;
       return 1;
    }

    /* Get a pointer to the output data. */
    cl_float* output = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0, bufferSize, 0, NULL, NULL, &errorNumber);
    if (!checkSuccess(errorNumber))
    {
       cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
       cerr << "Failed to map buffer. " << __FILE__ << ":"<< __LINE__ << endl;
       return 1;
    }

    /* [Output the results] */
    /* Uncomment the following block to print results. */
    
    for (int i = 0; i < 5; i++)
    {
        cout << "i = " << i << ", output = " <<  output[i] << "\n";
    }
    
    /* [Output the results] */

    /* Unmap the memory object as we are finished using them from the CPU side. */
    if (!checkSuccess(clEnqueueUnmapMemObject(commandQueue, memoryObjects[2], output, 0, NULL, NULL)))
    {
       cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
       cerr << "Unmapping memory objects failed " << __FILE__ << ":"<< __LINE__ << endl;
       return 1;
    }

    /* Release OpenCL objects. */
    cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjects, numberOfMemoryObjects);
}

kernel1.cl

__kernel void hello_world_opencl(__global float* restrict inputA,
                                 __global float* restrict inputB,
                                 __global float* restrict output)
{
    int i = get_global_id(0);
    output[i] = inputA[i] + inputB[i];
}

kernel2.cl

__kernel void hello_world_opencl(__global float* restrict inputA,
                                 __global float* restrict inputB,
                                 __global float* restrict output)
{
    int i = get_global_id(0);
    output[i] = inputA[i]*2 + inputB[i];
}

2、I use mali_offline_compiler to profile them,the two are same shows below ,how to get Instructions Emmited and  Path Cycles?Why Instructions Emmited is twice than Longest Path Cycles ?And in my opinion, the L/S operation should be 3 times,Why four times here?

Mali Offline Compiler v7.0.0 (Build c38421)
Copyright 2007-2019 Arm Limited, all rights reserved
Configuration
=============
Hardware: Mali-T860 r2p0
Driver: Midgard r23p0-00rel0
Shader type: OpenCL Kernel (inferred)
Main shader
===========
Work registers: 1
Uniform registers: 2
Stack spilling: False
   A L/S T Bound
Instructions Emitted: 2.0 4.0 0.0 L/S
Shortest Path Cycles: 1.0 4.0 0.0 L/S
Longest Path Cycles: 1.0 4.0 0.0 L/S
A = Arithmetic, L/S = Load/Store, T = Texture
Shader properties
=================
Uniform computation: False

Parents
  • > Why Instructions Emmited is twice than Longest Path Cycles ?

    Mali-T860 has two arithmetic pipelines, so two instructions emitted can complete in a single cycle.

    > And 
    in my opinion, the L/S operation should be 3 times,Why four times here?

    Because the hardware needs 4 instructions =)

    In terms of making this shader faster, the best advice I can give is to make vector loads and stores, and have each work item operate on a vector of inputs and outputs. 

Reply
  • > Why Instructions Emmited is twice than Longest Path Cycles ?

    Mali-T860 has two arithmetic pipelines, so two instructions emitted can complete in a single cycle.

    > And 
    in my opinion, the L/S operation should be 3 times,Why four times here?

    Because the hardware needs 4 instructions =)

    In terms of making this shader faster, the best advice I can give is to make vector loads and stores, and have each work item operate on a vector of inputs and outputs. 

Children