This is most likely a beginner question. I'm struggling to benchmark this MVE-vectorizable function, taken from the CMSIS-NN library:
```c
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#include <arm_acle.h>
#include <ARMCM55.h>
#include <cachel1_armv7.h>
#include <cmsis_gcc.h>
#include <cmsis_compiler.h>
#include <cmsis_version.h>
#include <core_cm55.h>
#include <mpu_armv8.h>
#include <pmu_armv8.h>
#include <arm_mve.h>

#include "arm_nnsupportfunctions.h"
#include "ref_values.h"

int8_t rows[COL_LEN * 4] __attribute__((aligned(16)));
int32_t results[4] __attribute__((aligned(16)));
const int8_t column[] __attribute__((aligned(16))) = REF_VALUES;

void arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
                               const int32_t offset,
                               const int8_t *row_base,
                               const int8_t *col_base,
                               int32_t *const sum_col,
                               int32_t *const output)
{
    int32_t acc_n0 = 0;
    int32_t acc_n1 = 0;
    int32_t acc_n2 = 0;
    int32_t acc_n3 = 0;

    const int8_t *ip_row_0 = row_base;
    const int8_t *ip_row_1 = row_base + offset;
    const int8_t *ip_row_2 = row_base + (2 * offset);
    const int8_t *ip_row_3 = row_base + (3 * offset);
    int32_t sum_tmp = 0;

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    __ASM volatile("   vldrb.8     q0, [%[col]], 16  \n"
                   "   wlstp.8     lr, %[cnt], 1f    \n"
                   "2:                               \n"
                   "   vaddva.s8   %[sum], q0        \n"
                   "   vldrb.8     q1, [%[row0]], 16 \n"
                   "   vmladava.s8 %[out0], q0, q1   \n"
                   "   vldrb.8     q2, [%[row1]], 16 \n"
                   "   vmladava.s8 %[out1], q0, q2   \n"
                   "   vldrb.8     q3, [%[row2]], 16 \n"
                   "   vmladava.s8 %[out2], q0, q3   \n"
                   "   vldrb.8     q4, [%[row3]], 16 \n"
                   "   vmladava.s8 %[out3], q0, q4   \n"
                   "   vldrb.8     q0, [%[col]], 16  \n"
                   "   letp        lr, 2b            \n"
                   "1:                               \n"
                   : [col] "+r"(col_base),
                     [sum] "+Te"(sum_tmp),
                     [row0] "+r"(ip_row_0),
                     [row1] "+r"(ip_row_1),
                     [row2] "+r"(ip_row_2),
                     [row3] "+r"(ip_row_3),
                     [out0] "+Te"(acc_n0),
                     [out1] "+Te"(acc_n1),
                     [out2] "+Te"(acc_n2),
                     [out3] "+Te"(acc_n3)
                   : [cnt] "r"(row_elements)
                   : "q0", "q1", "q2", "q3", "q4", "memory", "r14");
#else
    for (int i = 0; i < row_elements; i++)
    {
        int32_t col = col_base[i];
        sum_tmp += col;
        acc_n0 += ip_row_0[i] * col;
        acc_n1 += ip_row_1[i] * col;
        acc_n2 += ip_row_2[i] * col;
        acc_n3 += ip_row_3[i] * col;
    }
#endif

    output[0] = acc_n0;
    output[1] = acc_n1;
    output[2] = acc_n2;
    output[3] = acc_n3;
    *sum_col = sum_tmp;
}

int main(void)
{
    ARM_PMU_Enable();
    ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk);

    int cycle_count_before, cycle_count_after, cycle_count;

    for (size_t i = 0; i < ROW_LEN; i++)
    {
        int8_t val = (int8_t) i % 128;
        rows[i] = val;
        rows[ROW_LEN + i] = val;
        rows[ROW_LEN * 2 + i] = val;
        rows[ROW_LEN * 3 + i] = val;
    }

    int32_t col_sum;

    cycle_count_before = ARM_PMU_Get_CCNTR();
    arm_nn_mat_mul_core_4x_s8(ROW_LEN, ROW_LEN, rows, column, &col_sum, results);
    cycle_count_after = ARM_PMU_Get_CCNTR();
    cycle_count = cycle_count_after - cycle_count_before;

    printf("Cycles = %d\n", cycle_count);
    printf("Result = %d %d %d %d Sum: %d\n",
           results[0], results[1], results[2], results[3], col_sum);

    while (1)
        ;
}
```
where `REF_VALUES` is an array of 1280 random values.
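For reference, `ref_values.h` is a generated header of roughly the following shape (a sketch only: the actual random entries are elided, and the macro values shown are illustrative, inferred from the 1280 entries):

```c
/* ref_values.h -- sketch of the generated header. The real file
 * defines the length macros and 1280 random int8_t constants;
 * the values below are illustrative placeholders. */
#ifndef REF_VALUES_H
#define REF_VALUES_H

#define COL_LEN 1280 /* illustrative */
#define ROW_LEN 1280 /* illustrative */

/* The real initializer lists all 1280 random values. */
#define REF_VALUES { 12, -87, 53, /* ... */ -104 }

#endif /* REF_VALUES_H */
```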
Compiler version: arm-none-eabi-gcc (GNU Arm Embedded Toolchain 10-2020-q4-major) 10.2.1 20201103 (release)
The compiler flags are: -DARMCM55 -mcpu=cortex-m55 -mthumb -mfloat-abi=hard -Os -std=c99 -ffunction-sections -fdata-sections
When run on the Corstone-300 MPS2 FVP, this reports 1018 cycles. When I change the optimization level to -O3, the reported cycle count rises to 2519. Here are the reported cycles for the other optimization levels:
- -Os: 1018
- -O1: 1031
- -O2: 2505
- -O3: 2519
- -Ofast: 2519
I have checked the generated assembly, and the inner loop looks identical across all versions to me. I would be very interested in what could cause this steep drop in performance at higher optimization levels, because it seems very counterintuitive.
I have since found that the expected speedup from higher optimization levels does occur when the function is run in a tight loop rather than called just once. I assume that this is at least part of the answer.
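The loop-based measurement looks roughly like this (a minimal sketch replacing the single timed call in `main`; `NUM_RUNS` and the untimed warm-up call are my additions):

```c
/* Loop-based measurement sketch: one untimed warm-up call pulls code
 * and data into the caches, then the timed iterations measure the
 * warm steady state. NUM_RUNS is an arbitrary repeat count. */
#define NUM_RUNS 100

int32_t col_sum;
uint32_t total_cycles = 0;

/* Untimed warm-up call. */
arm_nn_mat_mul_core_4x_s8(ROW_LEN, ROW_LEN, rows, column, &col_sum, results);

for (int run = 0; run < NUM_RUNS; run++)
{
    uint32_t before = ARM_PMU_Get_CCNTR();
    arm_nn_mat_mul_core_4x_s8(ROW_LEN, ROW_LEN, rows, column, &col_sum, results);
    total_cycles += ARM_PMU_Get_CCNTR() - before;
}

printf("Average cycles = %u\n", (unsigned)(total_cycles / NUM_RUNS));
```

The untimed warm-up is the only substantive difference from the single-shot measurement above.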