Code executes significantly faster when optimized with -Os than with -O3/-Ofast

This is most likely a beginner question. I'm struggling to make sense of benchmark results for this MVE-vectorized function, taken from the CMSIS-NN library:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdlib.h>
#include <arm_acle.h>
#include <ARMCM55.h>
#include <cachel1_armv7.h>
#include <cmsis_gcc.h>
#include <cmsis_compiler.h>
#include <cmsis_version.h>
#include <core_cm55.h>
#include <mpu_armv8.h>
#include <pmu_armv8.h>

#include <arm_mve.h>
#include "arm_nnsupportfunctions.h"

#include "ref_values.h"

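/* 16-byte alignment matches the 128-bit (16-byte) MVE vector loads used below. */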
int8_t rows[COL_LEN * 4] __attribute__((aligned(16)));
int32_t results[4] __attribute__((aligned(16)));

const int8_t column[] __attribute__((aligned(16))) = REF_VALUES;

void arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
                               const int32_t offset,
                               const int8_t *row_base,
                               const int8_t *col_base,
                               int32_t *const sum_col,
                               int32_t *const output)
{
    int32_t acc_n0 = 0;
    int32_t acc_n1 = 0;
    int32_t acc_n2 = 0;
    int32_t acc_n3 = 0;

    const int8_t *ip_row_0 = row_base;
    const int8_t *ip_row_1 = row_base + offset;
    const int8_t *ip_row_2 = row_base + (2 * offset);
    const int8_t *ip_row_3 = row_base + (3 * offset);
    int32_t sum_tmp = 0;

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

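    /* Tail-predicated MVE loop (wlstp/letp): each iteration loads 16 column
       bytes, adds them into the running column sum (vaddva.s8), and
       accumulates four 8-bit dot products against the row pointers
       (vmladava.s8). The first column load is pipelined ahead of the loop. */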
    __ASM volatile("   vldrb.8         q0, [%[col]], 16     \n"
                   "   wlstp.8         lr, %[cnt], 1f       \n"
                   "2:                                      \n"
                   "   vaddva.s8      %[sum], q0            \n"
                   "   vldrb.8         q1, [%[row0]], 16    \n"
                   "   vmladava.s8    %[out0], q0, q1       \n"
                   "   vldrb.8         q2, [%[row1]], 16    \n"
                   "   vmladava.s8     %[out1], q0, q2      \n"
                   "   vldrb.8         q3, [%[row2]], 16    \n"
                   "   vmladava.s8     %[out2], q0, q3      \n"
                   "   vldrb.8         q4, [%[row3]], 16    \n"
                   "   vmladava.s8     %[out3], q0, q4      \n"
                   "   vldrb.8         q0, [%[col]], 16     \n"
                   "   letp            lr, 2b               \n"
                   "1:                                      \n"
                   : [col] "+r"(col_base),
                     [sum] "+Te"(sum_tmp),
                     [row0] "+r"(ip_row_0),
                     [row1] "+r"(ip_row_1),
                     [row2] "+r"(ip_row_2),
                     [row3] "+r"(ip_row_3),
                     [out0] "+Te"(acc_n0),
                     [out1] "+Te"(acc_n1),
                     [out2] "+Te"(acc_n2),
                     [out3] "+Te"(acc_n3)
                   : [cnt] "r"(row_elements)
                   : "q0", "q1", "q2", "q3", "q4", "memory", "r14");
#else
    for (int i = 0; i < row_elements; i++)
    {
        int32_t col = col_base[i];
        sum_tmp += col;
        acc_n0 += ip_row_0[i] * col;
        acc_n1 += ip_row_1[i] * col;
        acc_n2 += ip_row_2[i] * col;
        acc_n3 += ip_row_3[i] * col;
    }
#endif
    output[0] = acc_n0;
    output[1] = acc_n1;
    output[2] = acc_n2;
    output[3] = acc_n3;

    *sum_col = sum_tmp;
}

int main(void) {
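    /* Enable the PMU and its cycle counter so ARM_PMU_Get_CCNTR() returns
       a running cycle count. */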
    ARM_PMU_Enable();
    ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk);

    uint32_t cycle_count_before, cycle_count_after, cycle_count;

    for (size_t i = 0; i < ROW_LEN; i++) {
        int8_t val = (int8_t)(i % 128); /* cast after the modulo so values stay in 0..127 */
        rows[i] = val;
        rows[ROW_LEN + i] = val;
        rows[ROW_LEN * 2 + i] = val;
        rows[ROW_LEN * 3 + i] = val;
    }
    int32_t col_sum;

    cycle_count_before = ARM_PMU_Get_CCNTR();
    arm_nn_mat_mul_core_4x_s8(ROW_LEN, ROW_LEN, rows, column, &col_sum, results);
    cycle_count_after = ARM_PMU_Get_CCNTR();

    cycle_count = cycle_count_after - cycle_count_before;

    printf("Cycles = %d\n", cycle_count);
    printf("Result = %d %d %d %d Sum: %d\n", results[0], results[1], results[2], results[3], col_sum);
    while (1);
}

where `REF_VALUES` is an array of 1280 random int8_t values.
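For context, ref_values.h is shaped roughly as below; the actual values are random, and the exact ROW_LEN/COL_LEN definitions shown here are an assumption of this sketch:

#define COL_LEN 1280
#define ROW_LEN 1280 /* assumed equal to COL_LEN in this sketch */
#define REF_VALUES { 42, -17, 103, /* ... 1277 more random int8_t values ... */ }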

Compiler version: arm-none-eabi-gcc (GNU Arm Embedded Toolchain 10-2020-q4-major) 10.2.1 20201103 (release)

The compiler flags are: -DARMCM55 -mcpu=cortex-m55 -mthumb -mfloat-abi=hard -Os -std=c99 -ffunction-sections -fdata-sections

When run on the Corstone-300 MPS3 FVP, this reports 1018 cycles. When I change the optimization level to -O3, the reported cycle count rises to 2519. Here are the reported cycle counts for all optimization levels:

-Os: 1018
-O1: 1031
-O2: 2505
-O3: 2519
-Ofast: 2519

I have checked the generated assembly (see the disassembly comparison below) and the inner loop looks identical across all versions to me. I would be very interested in what could cause this steep drop in performance at higher optimization levels, because it seems very counterintuitive to me.
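For reference, one way to compare the two builds is to diff their disassemblies (the file names here are just placeholders):

arm-none-eabi-objdump -d build-Os.elf > Os.lst
arm-none-eabi-objdump -d build-O3.elf > O3.lst
diff Os.lst O3.lst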
