We are running a survey to help us improve the experience for all of our members. If you see the survey appear, please take the time to tell us about your experience if you can.
This is most likely more of a beginner question. I'm struggling to benchmark this MVE-vectorizable function, taken from the CMSIS-NN library:
#include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <arm_acle.h> #include <ARMCM55.h> #include <cachel1_armv7.h> #include <cmsis_gcc.h> #include <cmsis_compiler.h> #include <cmsis_version.h> #include <core_cm55.h> #include <mpu_armv8.h> #include <pmu_armv8.h> #include <arm_mve.h> #include "arm_nnsupportfunctions.h" #include "ref_values.h" int8_t rows[COL_LEN * 4] __attribute__((aligned(16))); int32_t results[4] __attribute__((aligned(16))); const int8_t column[] __attribute__((aligned(16))) = REF_VALUES; void arm_nn_mat_mul_core_4x_s8(const int32_t row_elements, const int32_t offset, const int8_t *row_base, const int8_t *col_base, int32_t *const sum_col, int32_t *const output) { int32_t acc_n0 = 0; int32_t acc_n1 = 0; int32_t acc_n2 = 0; int32_t acc_n3 = 0; const int8_t *ip_row_0 = row_base; const int8_t *ip_row_1 = row_base + offset; const int8_t *ip_row_2 = row_base + (2 * offset); const int8_t *ip_row_3 = row_base + (3 * offset); int32_t sum_tmp = 0; #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) __ASM volatile(" vldrb.8 q0, [%[col]], 16 \n" " wlstp.8 lr, %[cnt], 1f \n" "2: \n" " vaddva.s8 %[sum], q0 \n" " vldrb.8 q1, [%[row0]], 16 \n" " vmladava.s8 %[out0], q0, q1 \n" " vldrb.8 q2, [%[row1]], 16 \n" " vmladava.s8 %[out1], q0, q2 \n" " vldrb.8 q3, [%[row2]], 16 \n" " vmladava.s8 %[out2], q0, q3 \n" " vldrb.8 q4, [%[row3]], 16 \n" " vmladava.s8 %[out3], q0, q4 \n" " vldrb.8 q0, [%[col]], 16 \n" " letp lr, 2b \n" "1: \n" : [col] "+r"(col_base), [sum] "+Te"(sum_tmp), [row0] "+r"(ip_row_0), [row1] "+r"(ip_row_1), [row2] "+r"(ip_row_2), [row3] "+r"(ip_row_3), [out0] "+Te"(acc_n0), [out1] "+Te"(acc_n1), [out2] "+Te"(acc_n2), [out3] "+Te"(acc_n3) : [cnt] "r"(row_elements) : "q0", "q1", "q2", "q3", "q4", "memory", "r14"); #else for (int i = 0; i < row_elements; i++) { int32_t col = col_base[i]; sum_tmp += col; acc_n0 += ip_row_0[i] * col; acc_n1 += ip_row_1[i] * col; acc_n2 += ip_row_2[i] * col; acc_n3 += ip_row_3[i] * col; } #endif 
output[0] = acc_n0; output[1] = acc_n1; output[2] = acc_n2; output[3] = acc_n3; *sum_col = sum_tmp; } int main(void) { ARM_PMU_Enable(); ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); int cycle_count_before, cycle_count_after, cycle_count; for (size_t i = 0; i < ROW_LEN; i++) { int8_t val = (int8_t) i % 128; rows[i] = val; rows[ROW_LEN + i] = val; rows[ROW_LEN*2 + i] = val; rows[ROW_LEN*3 + i] = val; } int32_t col_sum; cycle_count_before = ARM_PMU_Get_CCNTR(); arm_nn_mat_mul_core_4x_s8(ROW_LEN, ROW_LEN, rows, column, &col_sum, results); cycle_count_after = ARM_PMU_Get_CCNTR(); cycle_count = cycle_count_after - cycle_count_before; printf("Cycles = %d\n", cycle_count); printf("Result = %d %d %d %d Sum: %d\n", results[0], results[1], results[2], results[3], col_sum); while (1); }
where `REF_VALUES` is an array of 1280 random values.
Compiler version: arm-none-eabi-gcc (GNU Arm Embedded Toolchain 10-2020-q4-major) 10.2.1 20201103 (release)
The compiler flags are: -DARMCM55 -mcpu=cortex-m55 -mthumb -mfloat-abi=hard -Os -std=c99 -ffunction-sections -fdata-sections
When run on the Corstone300 MPS2 FVP, this reports 1018 cycles. When I change the optimization level to -O3, the reported cycles rise to 2519. Here is a list of reported cycles for other optimization levels:
-Os: 1018, -O1: 1031, -O2: 2505, -O3: 2519, -Ofast: 2519
I have checked the generated assembly and the inner loop looks identical between all versions to me. I would be very interested in what could cause this steep drop in performance at higher optimization levels, because it seems very counterintuitive to me.