We are running a survey to help us improve the experience for all of our members. If you see the survey appear, please take the time to tell us about your experience if you can.
* CONTEXT: Hi! I wrote a CNN inference model (MobileNetV3-Small) in bare-metal C and verified its correctness (outputs match PyTorch). I did this on my local machine in Visual Studio Code. I am now trying to simulate this C program on an ARM Cortex-M4 with FPU, and I am using Keil uVision to compile it. I am doing this as part of my thesis project, in which I will characterize the performance and energy before and after adding a specialized custom hardware unit.
I am using my group's RTL simulation infrastructure to run the compiled program (obtained from Keil) on the ARM Cortex-M4 core and peripherals, and to visualize the cycle-by-cycle information in a waveform viewer for validation and debugging.
#include "bneck_config.h"
// #include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

// NOTE: To avoid issues with malloc, all buffers are declared as static arrays
// with the maximum size that will be needed.
// ifmap_buf is a float[3072] array in the .h file.

// Output feature map of the layer run from main(): 16 kernels on a 32x32 input
// at stride 1 produce 16 planes of 32x32 = 16384 floats. The previous size of
// 3072 only covered the 3-channel input, so Conv2D_layer wrote past the end.
float ofmap_buf[16 * 32 * 32];

// Scratch area holding the per-input-channel partial results of one output
// channel before they are summed in Conv2D_layer.
float conv_to_sum_buf[90112];

// Returns the side length of the square output plane for a "same"-style
// padding of (kernelSize - 1) / 2, or 0 when the arguments cannot describe a
// valid convolution (non-positive size/stride, or a window that does not fit).
static int conv_output_size(int inputSize, int kernelSize, int stride) {
    if (inputSize <= 0 || kernelSize <= 0 || stride <= 0) {
        return 0;
    }
    int padding = (kernelSize - 1) / 2;
    int span = inputSize + 2 * padding - kernelSize;
    if (span < 0) {
        return 0;
    }
    return span / stride + 1;
}

// Single-channel 2D convolution with implicit zero padding.
//
//   channel_input  : inputSize x inputSize plane, row-major
//   kernel         : kernelSize x kernelSize weights, row-major
//   channel_output : outputSize x outputSize plane, row-major, where
//                    outputSize = (inputSize + 2*padding - kernelSize)/stride + 1
//                    and padding = (kernelSize - 1) / 2
//
// Does nothing when the arguments do not describe a valid convolution.
void convolution2D(float* channel_input, int inputSize, int kernelSize, float* kernel, int stride, float* channel_output) {
    int outputSize = conv_output_size(inputSize, kernelSize, stride);
    if (outputSize <= 0) {
        return;
    }
    int padding = (kernelSize - 1) / 2;

    // Iterate over OUTPUT coordinates so that exactly outputSize x outputSize
    // elements are written. Walking input coordinates instead produces extra
    // iterations (and out-of-bounds writes) for even kernel sizes.
    for (int out_i = 0; out_i < outputSize; out_i++) {
        int i = out_i * stride - padding;  // top row of the window in the input
        for (int out_j = 0; out_j < outputSize; out_j++) {
            int j = out_j * stride - padding;  // left column of the window
            float sum = 0.0f;
            for (int ki = 0; ki < kernelSize; ki++) {
                int row = i + ki;
                if (row < 0 || row >= inputSize) {
                    continue;  // row lies in the zero padding
                }
                for (int kj = 0; kj < kernelSize; kj++) {
                    int col = j + kj;
                    if (col < 0 || col >= inputSize) {
                        continue;  // column lies in the zero padding
                    }
                    sum += channel_input[row * inputSize + col] * kernel[ki * kernelSize + kj];
                }
            }
            channel_output[out_i * outputSize + out_j] = sum;
        }
    }
}

// Depthwise convolution
// 1 kernel with depth = nb_channels: channel i of ifmap is convolved with
// slice i of kernels and written to plane i of ofmaps. Planes are stored
// back to back (channel-major) in all three buffers.
// Does nothing when the arguments do not describe a valid convolution.
void Depthwise(float* ifmap, int nb_channels, int inputSize, int kernelSize, float* kernels, int stride, float* ofmaps) {
    int outputSize = conv_output_size(inputSize, kernelSize, stride);
    if (outputSize <= 0) {
        return;
    }

    // Per-channel strides are loop-invariant, so compute them once.
    int in_plane = inputSize * inputSize;
    int kernel_plane = kernelSize * kernelSize;
    int out_plane = outputSize * outputSize;

    for (int i = 0; i < nb_channels; i++) {
        convolution2D(&ifmap[i * in_plane], inputSize, kernelSize,
                      &kernels[i * kernel_plane], stride,
                      &ofmaps[i * out_plane]);
    }
}

// Conv2D layer
// nb output channels = nb kernels. Each kernel has nb_channels slices of
// kernelSize x kernelSize weights. For every output channel the per-channel
// convolutions are computed into conv_to_sum_buf and then summed across input
// channels into ofmap.
//
// ofmap must hold nb_kernels * dw_size * dw_size floats, where dw_size is the
// output side length. Returns without writing anything when the arguments are
// invalid or the partial results would not fit into conv_to_sum_buf.
void Conv2D_layer(float* ifmap, int nb_channels, int inputSize, int kernelSize, float* kernels, int nb_kernels, int stride, float* ofmap) {
    int dw_size = conv_output_size(inputSize, kernelSize, stride);
    if (dw_size <= 0 || nb_channels <= 0 || nb_kernels <= 0) {
        return;
    }

    size_t plane = (size_t)dw_size * (size_t)dw_size;
    size_t scratch_len = sizeof conv_to_sum_buf / sizeof conv_to_sum_buf[0];
    if (plane > scratch_len / (size_t)nb_channels) {
        return;  // nb_channels planes would overflow the scratch buffer
    }

    int kernel_len = nb_channels * kernelSize * kernelSize;  // weights per output channel

    for (int output_channel = 0; output_channel < nb_kernels; output_channel++) {
        Depthwise(ifmap, nb_channels, inputSize, kernelSize,
                  &kernels[output_channel * kernel_len], stride,
                  (float*)conv_to_sum_buf);

        float* out_plane = &ofmap[(size_t)output_channel * plane];
        for (size_t px = 0; px < plane; px++) {
            // Accumulate in channel order so results match the reference model.
            float sum = 0.0f;
            for (int channel = 0; channel < nb_channels; channel++) {
                sum = sum + conv_to_sum_buf[(size_t)channel * plane + px];
            }
            out_plane[px] = sum;
        }
    }
}

// Runs the first convolution (3 input channels, 32x32, 3x3 kernels, 16 output
// channels, stride 1) and then parks the core so the simulation can be inspected.
int main (void) {
    Conv2D_layer((float*) ifmap_buf, 3, 32, 3, (float*)conv0_kernels, 16, 1, (float*)ofmap_buf);

    while (1) {
    }
}