Code executes significantly faster when optimized with -Os than with -O3/-Ofast

This is most likely more of a beginner question. I'm struggling to benchmark this MVE-vectorizable function, taken from the CMSIS-NN library:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <arm_acle.h>
#include <ARMCM55.h>
#include <cachel1_armv7.h>
#include <cmsis_gcc.h>
#include <cmsis_compiler.h>
#include <cmsis_version.h>
#include <core_cm55.h>
#include <mpu_armv8.h>
#include <pmu_armv8.h>

#include <arm_mve.h>
#include "arm_nnsupportfunctions.h"

#include "ref_values.h"

/* Four input rows stored back-to-back; 16-byte aligned for MVE vector loads.
   NOTE(review): sized with COL_LEN but filled/indexed with ROW_LEN in main —
   presumably COL_LEN == ROW_LEN; confirm in ref_values.h. */
int8_t rows[COL_LEN * 4] __attribute__((aligned(16)));
/* One dot-product accumulator per row, written by arm_nn_mat_mul_core_4x_s8. */
int32_t results[4] __attribute__((aligned(16)));

/* Constant column operand, generated offline (see ref_values.h). */
const int8_t column[] __attribute__((aligned(16))) = REF_VALUES;

/**
 * Compute the dot product of one int8 column vector against four int8 rows
 * at once, plus the sum of the column elements (CMSIS-NN kernel).
 *
 * @param row_elements  number of int8 elements per row / in the column
 * @param offset        stride in bytes between consecutive rows in row_base
 * @param row_base      base of the four rows (rows at 0, offset, 2*offset, 3*offset)
 * @param col_base      the column vector
 * @param sum_col       out: sum of all column elements
 * @param output        out: output[0..3] = dot(row_n, column)
 *
 * On MVE targets the hand-written low-overhead-loop asm is used; otherwise a
 * scalar reference loop computes the same result. The two paths are mutually
 * exclusive (#if/#else) — running both would double-count the accumulators.
 */
void arm_nn_mat_mul_core_4x_s8(const int32_t row_elements,
                               const int32_t offset,
                               const int8_t *row_base,
                               const int8_t *col_base,
                               int32_t *const sum_col,
                               int32_t *const output)
{
    int32_t acc_n0 = 0;
    int32_t acc_n1 = 0;
    int32_t acc_n2 = 0;
    int32_t acc_n3 = 0;

    const int8_t *ip_row_0 = row_base;
    const int8_t *ip_row_1 = row_base + offset;
    const int8_t *ip_row_2 = row_base + (2 * offset);
    const int8_t *ip_row_3 = row_base + (3 * offset);
    int32_t sum_tmp = 0;

#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* Tail-predicated (wlstp/letp) loop: 16 int8 lanes per iteration, the
       predication hardware handles the row_elements % 16 remainder. */
    __ASM volatile("   vldrb.8         q0, [%[col]], 16     \n"
                   "   wlstp.8         lr, %[cnt], 1f       \n"
                   "2:                                      \n"
                   "   vaddva.s8      %[sum], q0            \n"
                   "   vldrb.8         q1, [%[row0]], 16    \n"
                   "   vmladava.s8    %[out0], q0, q1       \n"
                   "   vldrb.8         q2, [%[row1]], 16    \n"
                   "   vmladava.s8     %[out1], q0, q2      \n"
                   "   vldrb.8         q3, [%[row2]], 16    \n"
                   "   vmladava.s8     %[out2], q0, q3      \n"
                   "   vldrb.8         q4, [%[row3]], 16    \n"
                   "   vmladava.s8     %[out3], q0, q4      \n"
                   "   vldrb.8         q0, [%[col]], 16     \n"
                   "   letp            lr, 2b               \n"
                   "1:                                      \n"
                   : [col] "+r"(col_base),
                     [sum] "+Te"(sum_tmp),
                     [row0] "+r"(ip_row_0),
                     [row1] "+r"(ip_row_1),
                     [row2] "+r"(ip_row_2),
                     [row3] "+r"(ip_row_3),
                     [out0] "+Te"(acc_n0),
                     [out1] "+Te"(acc_n1),
                     [out2] "+Te"(acc_n2),
                     [out3] "+Te"(acc_n3)
                   : [cnt] "r"(row_elements)
                   : "q0", "q1", "q3", "q4", "memory", "r14", "q2");
#else
    /* Scalar reference implementation — same contract as the asm above. */
    for (int i = 0; i < row_elements; i++)
    {
        int32_t col = col_base[i];
        sum_tmp += col;
        acc_n0 += ip_row_0[i] * col;
        acc_n1 += ip_row_1[i] * col;
        acc_n2 += ip_row_2[i] * col;
        acc_n3 += ip_row_3[i] * col;
    }
#endif

    output[0] = acc_n0;
    output[1] = acc_n1;
    output[2] = acc_n2;
    output[3] = acc_n3;

    *sum_col = sum_tmp;
}

int main(void) {

    int cycle_count_before, cycle_count_after, cycle_count;

    /* Fill all four rows with the same 0..127 ramp pattern.
       NOTE(review): the cast binds tighter than %, so this is
       ((int8_t)i) % 128, which yields negative values for e.g. i == 129;
       presumably (int8_t)(i % 128) was intended — identical for i < 128.
       Kept as-is since the benchmark timing does not depend on the values. */
    for (size_t i = 0; i < ROW_LEN; i++) {
        int8_t val = (int8_t) i % 128;
        rows[i] = val;
        rows[ROW_LEN + i] = val;
        rows[ROW_LEN * 2 + i] = val;
        rows[ROW_LEN * 3 + i] = val;
    }

    /* Time one kernel invocation with the PMU cycle counter.
       NOTE(review): assumes the PMU/CCNTR is already enabled by startup code
       — confirm, otherwise both reads return the same value. */
    int32_t col_sum;
    cycle_count_before = ARM_PMU_Get_CCNTR();
    arm_nn_mat_mul_core_4x_s8(ROW_LEN, ROW_LEN, rows, column, &col_sum, results);
    cycle_count_after = ARM_PMU_Get_CCNTR();

    cycle_count = cycle_count_after - cycle_count_before;
    printf("Cycles = %d\n", cycle_count);
    printf("Result = %d %d %d %d Sum: %d\n", results[0], results[1], results[2], results[3], col_sum);

    /* Bare-metal target: never return from main. */
    while (1);
}

where `REF_VALUES` is an initializer list of 1280 random `int8_t` values.

Compiler version: arm-none-eabi-gcc (GNU Arm Embedded Toolchain 10-2020-q4-major) 10.2.1 20201103 (release)

The compiler flags are: -DARMCM55 -mcpu=cortex-m55 -mthumb -mfloat-abi=hard -Os -std=c99 -ffunction-sections -fdata-sections

When run on the Corstone300 MPS2 FVP, this reports 1018 cycles. When I change the optimization level to -O3, the reported cycles rise to 2519. Here is a list of reported cycles for other optimization levels:

-Os: 1018
-O1: 1031
-O2: 2505
-O3: 2519
-Ofast: 2519

I have checked the generated assembly and the inner loop looks identical between all versions to me. I would be very interested in what could cause this steep drop in performance at the higher optimization levels, because it seems very counterintuitive.

More questions in this forum