* CONTEXT: Hi! I wrote a CNN inference model (MobileNetV3-Small) in bare-metal C and verified its correctness (the outputs match PyTorch). I did this on my local machine in Visual Studio Code. I am now trying to simulate this C program on an ARM Cortex-M4 with FPU, and I am using Keil uVision to compile it. I am doing this as part of my thesis project, in which I will characterize the performance and energy before and after adding a specialized custom hardware unit.
I am using my group's RTL simulation infrastructure to run the compiled program (obtained from Keil) on the ARM Cortex-M4 core and peripherals, and to visualize the cycle-by-cycle information in a waveform viewer for validation and debugging.
#include "bneck_config.h"
// #include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

// NOTE: To avoid issues with malloc, all buffers to be used are declared as
// statically sized arrays with the maximum size that will be needed.
// ifmap_buf is a float[3072] array in the .h file (32x32x3: 3 channels of
// 32x32 each).

// Buffer capacities, counted in float elements.
enum {
    // The layer run by main() produces 16 output channels of 32x32 each.
    // The previous capacity of 3072 only fits 3 channels, so every channel
    // after the third was written past the end of the array.
    OFMAP_BUF_LEN = 16 * 32 * 32,
    // Scratch space holding the per-input-channel partial results that
    // Conv2D_layer sums into one output channel.
    CONV_SCRATCH_LEN = 90112
};

float ofmap_buf[OFMAP_BUF_LEN];
float conv_to_sum_buf[CONV_SCRATCH_LEN];

// Single-channel 2D convolution with implicit zero padding of
// (kernelSize - 1) / 2 on every side.
//
//   channel_input  : inputSize x inputSize plane, row-major
//   kernel         : kernelSize x kernelSize weights, row-major
//   stride         : step between successive kernel positions (must be > 0)
//   channel_output : outputSize x outputSize plane, row-major, where
//                    outputSize = (inputSize + 2*padding - kernelSize) / stride + 1
//
// Nothing is written when inputSize, kernelSize or stride is not positive.
void convolution2D(float* channel_input, int inputSize, int kernelSize, float* kernel, int stride, float* channel_output) {
    if (inputSize <= 0 || kernelSize <= 0 || stride <= 0) {
        return;
    }

    int padding = (kernelSize - 1) / 2;
    int outputSize = (inputSize + 2 * padding - kernelSize) / stride + 1;

    // Iterate over OUTPUT coordinates so that exactly outputSize x outputSize
    // elements are written. Walking the input range instead yields more
    // positions than outputSize when kernelSize is even, which made the
    // stores wrap into the next row / run past the end of the plane.
    for (int out_i = 0; out_i < outputSize; out_i++) {
        int i = out_i * stride - padding;   // top row of the kernel window
        for (int out_j = 0; out_j < outputSize; out_j++) {
            int j = out_j * stride - padding;   // left column of the window
            float sum = 0.0f;
            for (int ki = 0; ki < kernelSize; ki++) {
                int row = i + ki;
                if (row < 0 || row >= inputSize) {
                    continue;   // whole kernel row lies in the zero padding
                }
                for (int kj = 0; kj < kernelSize; kj++) {
                    int col = j + kj;
                    // Taps that fall in the padding contribute zero.
                    if (col >= 0 && col < inputSize) {
                        sum += channel_input[row * inputSize + col] * kernel[ki * kernelSize + kj];
                    }
                }
            }
            channel_output[out_i * outputSize + out_j] = sum;
        }
    }
}

// Depthwise convolution: one kernel with depth = nb_channels.
// Channel c of ifmap is convolved with slice c of kernels and stored as
// channel c of ofmaps. Planes are stored back to back: inputSize*inputSize
// floats per input channel, outputSize*outputSize floats per output channel.
//
// Nothing is written when inputSize, kernelSize or stride is not positive.
void Depthwise(float* ifmap, int nb_channels, int inputSize, int kernelSize, float* kernels, int stride, float* ofmaps) {
    if (inputSize <= 0 || kernelSize <= 0 || stride <= 0) {
        return;   // also avoids dividing by a zero stride below
    }

    // These do not depend on the channel index, so compute them once.
    int padding = (kernelSize - 1) / 2;
    int outputSize = (inputSize + 2 * padding - kernelSize) / stride + 1;
    int in_plane = inputSize * inputSize;
    int k_plane = kernelSize * kernelSize;
    int out_plane = outputSize * outputSize;

    for (int c = 0; c < nb_channels; c++) {
        convolution2D(&ifmap[c * in_plane], inputSize, kernelSize,
                      &kernels[c * k_plane], stride, &ofmaps[c * out_plane]);
    }
}

// Conv2D layer (number of output channels = nb_kernels).
// For each kernel, every input channel is convolved with the matching kernel
// slice into conv_to_sum_buf, then the per-channel results are summed
// element-wise into one output channel of ofmap.
//
//   kernels : nb_kernels blocks of nb_channels * kernelSize * kernelSize floats
//   ofmap   : must hold nb_kernels * dw_size * dw_size floats
//
// The layer is skipped entirely (ofmap untouched) when inputSize, kernelSize
// or stride is not positive, or when the intermediate results would not fit
// in conv_to_sum_buf.
void Conv2D_layer(float* ifmap, int nb_channels, int inputSize, int kernelSize, float* kernels, int nb_kernels, int stride, float* ofmap) {
    if (inputSize <= 0 || kernelSize <= 0 || stride <= 0) {
        return;   // also avoids dividing by a zero stride below
    }

    int padding = (kernelSize - 1) / 2;
    int dw_size = (inputSize + 2 * padding - kernelSize) / stride + 1;
    if (dw_size <= 0) {
        return;   // empty output plane: nothing to compute
    }

    int plane = dw_size * dw_size;
    int kernel_len = nb_channels * kernelSize * kernelSize;

    // Refuse to run rather than let Depthwise write past the scratch buffer.
    if (nb_channels > 0 && (size_t)nb_channels * (size_t)plane > (size_t)CONV_SCRATCH_LEN) {
        return;
    }

    for (int output_channel = 0; output_channel < nb_kernels; output_channel++) {
        Depthwise(ifmap, nb_channels, inputSize, kernelSize,
                  &kernels[output_channel * kernel_len], stride, conv_to_sum_buf);

        // Reduce across input channels, one output pixel at a time.
        // Channels are added in ascending order, starting from 0.
        float* out = &ofmap[output_channel * plane];
        for (int pixel = 0; pixel < plane; pixel++) {
            float sum = 0.0f;
            for (int channel = 0; channel < nb_channels; channel++) {
                sum += conv_to_sum_buf[channel * plane + pixel];
            }
            out[pixel] = sum;
        }
    }
}

// Runs the first convolution layer (3 input channels of 32x32, 3x3 kernels,
// 16 output channels, stride 1), then spins forever so the bare-metal
// program never returns from main.
int main(void) {
    Conv2D_layer((float*)ifmap_buf, 3, 32, 3, (float*)conv0_kernels, 16, 1, (float*)ofmap_buf);

    while (1) {
    }
}