The LLVM 17.0.1 release was announced on September 19th, 2023. As in previous releases, Arm contributed improved support for Arm targets: code generation that makes use of new architecture extensions and improves the performance and security of the resulting code. In addition, LLVM tools were improved when targeting embedded use cases.
To find out more about the previous LLVM release, you can read the What is new in LLVM 16? blog post.
Pablo Barrio
In 2022, the LRCPC3 architecture extension added instructions (ldiapp and stilp) to support atomic loads and stores with stricter memory ordering constraints. LLVM 17 generates these instructions when performing C++ atomic accesses constrained by std::memory_order_acquire and std::memory_order_release.
#include <atomic>

std::atomic<__uint128_t> global;

void sink(__uint128_t);

void swppal_example(__uint128_t x) {
  __uint128_t res = global.exchange(x);
  sink(res);
}

void swpp_example(__uint128_t x) {
  __uint128_t res = global.exchange(x, std::memory_order_relaxed);
  sink(res);
}

void ldiapp_example() {
  __uint128_t res = global.load(std::memory_order_acquire);
  sink(res);
}

void stilp_example(__uint128_t x) {
  global.store(x, std::memory_order_release);
}
when compiled with -march=armv8.9a+lse128+rcpc3 -O3 generates the following (note that the 128-bit atomic exchange examples additionally use the swpp and swppal instructions from the LSE128 extension):
swppal_example(unsigned __int128):      // @swppal_example(unsigned __int128)
        adrp    x8, global
        add     x8, x8, :lo12:global
        swppal  x0, x1, [x8]
        b       sink(unsigned __int128)
swpp_example(unsigned __int128):        // @swpp_example(unsigned __int128)
        adrp    x8, global
        add     x8, x8, :lo12:global
        swpp    x0, x1, [x8]
        b       sink(unsigned __int128)
ldiapp_example():                       // @ldiapp_example()
        adrp    x8, global
        add     x8, x8, :lo12:global
        ldiapp  x0, x1, [x8]
        b       sink(unsigned __int128)
stilp_example(unsigned __int128):       // @stilp_example(unsigned __int128)
        adrp    x8, global
        add     x8, x8, :lo12:global
        stilp   x0, x1, [x8]
        ret
global:
        .zero   16
Igor Kirillov
LLVM 17 can now use instructions from SVE and SVE2 to autovectorize operations on complex values.
The following source
#include "complex.h" #define N 1024 void complex_function(float _Complex a[restrict N], float _Complex b[restrict N], float _Complex out[restrict N]) { for (int i=0; i < N; i++) out[i] += a[i] * b[i]; } void complex_function2(int _Complex a[restrict N], int _Complex b[restrict N], int _Complex out[restrict N]) { for (int i=0; i < N; i++) out[i] += a[i] * b[i]; }
when compiled with -Ofast -march=armv9-a generates:
complex_function:                       // @complex_function
        ptrue   p0.b
        mov     x8, xzr
        cntw    x9
        ptrue   p1.s
        rdvl    x10, #2
        mov     w11, #1024              // =0x400
.LBB0_1:                                // =>This Inner Loop Header: Depth=1
        add     x12, x0, x8
        ld1b    { z0.b }, p0/z, [x0, x8]
        subs    x11, x11, x9
        ld1w    { z1.s }, p1/z, [x12, #1, mul vl]
        add     x12, x1, x8
        ld1b    { z2.b }, p0/z, [x1, x8]
        ld1w    { z3.s }, p1/z, [x12, #1, mul vl]
        add     x12, x2, x8
        ld1b    { z4.b }, p0/z, [x2, x8]
        ld1w    { z5.s }, p1/z, [x12, #1, mul vl]
        fcmla   z4.s, p1/m, z2.s, z0.s, #0
        fcmla   z5.s, p1/m, z3.s, z1.s, #0
        fcmla   z4.s, p1/m, z2.s, z0.s, #90
        fcmla   z5.s, p1/m, z3.s, z1.s, #90
        st1b    { z4.b }, p0, [x2, x8]
        add     x8, x8, x10
        st1w    { z5.s }, p1, [x12, #1, mul vl]
        b.ne    .LBB0_1
        ret
complex_function2:                      // @complex_function2
        ptrue   p0.b
        mov     x8, xzr
        cntw    x9
        ptrue   p1.s
        rdvl    x10, #2
        mov     w11, #1024              // =0x400
.LBB1_1:                                // =>This Inner Loop Header: Depth=1
        add     x12, x0, x8
        ld1b    { z0.b }, p0/z, [x0, x8]
        subs    x11, x11, x9
        ld1w    { z1.s }, p1/z, [x12, #1, mul vl]
        add     x12, x1, x8
        ld1b    { z2.b }, p0/z, [x1, x8]
        ld1w    { z3.s }, p1/z, [x12, #1, mul vl]
        add     x12, x2, x8
        ld1b    { z4.b }, p0/z, [x2, x8]
        ld1w    { z5.s }, p1/z, [x12, #1, mul vl]
        cmla    z4.s, z0.s, z2.s, #0
        cmla    z5.s, z1.s, z3.s, #0
        cmla    z4.s, z0.s, z2.s, #90
        cmla    z5.s, z1.s, z3.s, #90
        st1b    { z4.b }, p0, [x2, x8]
        add     x8, x8, x10
        st1w    { z5.s }, p1, [x12, #1, mul vl]
        b.ne    .LBB1_1
        ret
Notice the FCMLA instructions from SVE and the CMLA instructions from SVE2, used to perform multiply-accumulate operations on complex float and integer values respectively.
LLVM 17 can vectorize many variations of the simple example above, for example intermediate negations, complex conjugates, complex rotations of one or more of the complex operands, and expressions with complex literals. It also supports loops with reductions, predication, different tail-folding modes, and loops with invariant variables. In LLVM 16 this was only supported with -Ofast, but now -O3 -ffp-contract=fast -ffinite-math-only is sufficient.
Kyrylo Tkachov
A number of improvements that benefit the SPEC CPU 2017 benchmarks have been added to LLVM 17, including more aggressive inlining of Fortran builtins in Flang and AArch64 vector code generation improvements, giving noticeable gains.
Also, continued investment in Flang features means that, for the first time, all of the SPEC CPU 2017 fprate benchmarks can be compiled with Flang from LLVM 17.
LLVM 17 learned to use SVE vector variants of standard math functions when autovectorizing user code, making use of the ArmPL library when the user specifies that it is available.
For example, the code:
#include <math.h>

#define N 1024

void do_vsin (float out[restrict N], float in[restrict N]) {
  for (int i = 0; i < N; i++)
    out[i] = sinf (in[i]);
}
when compiled with -Ofast -march=armv9-a -fveclib=ArmPL now generates calls to the SVE vector variant of the standard sinf function, appropriately mangled.
...
.LBB0_1:                                // =>This Inner Loop Header: Depth=1
        mov     p0.b, p4.b
        ld1w    { z0.s }, p4/z, [x19, x21, lsl #2]
        ld1w    { z16.s }, p4/z, [x23, x21, lsl #2]
        bl      armpl_svsin_f32_x
        mov     p0.b, p4.b
        mov     z17.d, z0.d
        mov     z0.d, z16.d
        bl      armpl_svsin_f32_x
        st1w    { z17.s }, p4, [x20, x21, lsl #2]
        st1w    { z0.s }, p4, [x24, x21, lsl #2]
        add     x21, x21, x22
        cmp     x21, #1024
        b.ne    .LBB0_1
...
Simon Tatham
The options -fsanitize=function and -fsanitize=kcfi now work for the Thumb instruction set as well as Arm. The latter stands for 'kernel control flow integrity', although it does not have to be used in the Linux kernel. Both sanitizers work by loading metadata from just before the start of a function they are about to call, and both were previously confused by the low bit that is set in a Thumb function pointer. In LLVM 17 both handle Thumb function pointers correctly.
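As a rough illustration (the function and file names below are hypothetical, and a UBSan runtime must be available at link and run time for the diagnostic to be reported), -fsanitize=function flags indirect calls made through a function pointer whose type does not match the callee, and in LLVM 17 this works even though Thumb function pointers have their low bit set:

// badcall.c - hypothetical example: call through a mismatched function pointer.
// Build sketch for Thumb:
//   clang --target=armv7a-none-eabi -mthumb -fsanitize=function -c badcall.c
// The check ignores the low "Thumb" bit of the pointer when locating the
// type metadata placed just before the function entry.
int add_one(int x) { return x + 1; }

int main(void) {
  // Deliberately store the pointer with an incompatible prototype.
  int (*fp)(int, int) = (int (*)(int, int))add_one;
  return fp(1, 2);  // -fsanitize=function reports a function type mismatch here
}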
Code generation has been hardened for extra security when LLVM automatically generates a jump table as part of a switch statement.
The BTI hardware security feature is intended to limit an attacker's options if they corrupt the control flow: they can only branch to instructions marked with the BTI 'landing pad' instruction, instead of to any instruction at all, limiting their ability to construct malicious code out of JOP gadgets. When LLVM generates a jump table, the code that looks up and jumps to a table entry typically needs a BTI landing pad. If LLVM had optimized away the range check on the table index, an attacker could misuse that code by passing an out-of-range index. Now, in BTI mode, LLVM never optimizes away the range check, so an attacker trying this can only branch to one of the legitimate cases of the original switch. This cuts down the attacker's options further.
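As a sketch of the kind of code affected (the function name is hypothetical), a dense switch like the one below is typically lowered to a jump table. When built with BTI enabled, for example with -mbranch-protection=bti on AArch64, LLVM 17 keeps the range check on the table index even when it could otherwise prove it redundant:

// dispatch.c - hypothetical example of a switch lowered to a jump table.
// Build sketch: clang --target=aarch64-linux-gnu -mbranch-protection=bti -O2 -c dispatch.c
int dispatch(int op, int x) {
  switch (op) {
  case 0: return x + 1;
  case 1: return x - 1;
  case 2: return x * 2;
  case 3: return x / 2;
  case 4: return -x;
  case 5: return x ^ 0xff;
  case 6: return x << 1;
  case 7: return x >> 1;
  default: return x;  // with BTI, the out-of-range check guarding the table is never removed
  }
}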
Kristof Beyls
Using MTE to tag variables on the stack requires adding extra code to function prologues. Some variants of that extra code were not compatible with the speculative-load-hardening feature. More specifically, when speculative-load-hardening is enabled, only conditional branches that test the condition flags should be used; in other words, the compare-and-branch instructions CB{N}Z and TB{N}Z should be avoided. MTE stack tagging code generation has been fixed to only use branches based on the condition flags, so that it is compatible with speculative-load-hardening.
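For illustration (the flags below show one plausible way to combine the two features; the file and function names are hypothetical), a function with a stack array gets tagging code in its prologue, and in LLVM 17 that code no longer conflicts with speculative load hardening:

// tagged.c - hypothetical example combining MTE stack tagging with
// speculative load hardening. Build sketch:
//   clang --target=aarch64-linux-gnu -march=armv8.5-a+memtag \
//         -fsanitize=memtag-stack -mspeculative-load-hardening -O2 -c tagged.c
void use(int *p);

void tagged_frame(void) {
  int buf[16];   // tagged in the prologue when MTE stack tagging is enabled
  use(buf);
}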
Peter Smith
A toolchain targeting embedded systems, like the LLVM Embedded Toolchain for Arm, supports many different Arm architectures, each with different hardware configurations such as whether software or hardware floating point is used. As many of the configurations are at best not optimal and at worst not compatible with each other, an embedded toolchain provides several variants of the binary libraries, such as the C and C++ libraries, with each variant stored in a separate directory. Having the user manually select the right library variant for their configuration is both error prone and a barrier to entry for users coming from toolchains that select the right library variant automatically. The mechanism for selecting libraries automatically based on compiler command-line options is known as multilib. While clang had some existing support for multilib, it was limited to a small number of fixed use cases. Due to the wide variety of possible library variants an embedded toolchain might have to select from, an alternative data-driven implementation has been added for the bare-metal targets arm-none-eabi and aarch64-none-elf. At the top level of the library variants, a configuration file multilib.yaml describes the library variants and which command-line options map to each variant.
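As a rough sketch of what such a configuration can look like (the directory names and flag values here are invented; the exact schema is described in the Clang multilib documentation), multilib.yaml maps normalized command-line options to library directories:

# multilib.yaml - illustrative sketch only
MultilibVersion: 1.0
Variants:
- Dir: arm-none-eabi/thumb/v6-m/nofp
  Flags:
  - --target=thumbv6m-unknown-none-eabi
  - -mfpu=none
- Dir: arm-none-eabi/thumb/v7e-m/fpv4-sp
  Flags:
  - --target=thumbv7em-unknown-none-eabi
  - -mfpu=fpv4-sp-d16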
With the addition of multilib support, the LLVM Embedded Toolchain for Arm can select library variants in a similar way to the Arm GNU Toolchain.
Amilendra Kodithuwakku
LLD now provides support for the Cortex-M Security Extensions (CMSE) according to the Armv8-M Security Extensions: Requirements on Development Tools, version 1.2.
Developers of secure software that incorporates the Armv8-M Security Extensions can now use LLD to generate a secure gateway veneer for each entry function with external linkage, placing them consecutively in memory either by using linker scripts or via a command-line option. The address of each secure gateway veneer can be shared with non-secure software developers in the form of an import library generated by LLD. By using the import library, all entry function calls from non-secure software will be redirected to go via the secure gateway veneers.
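For illustration (the function and file names are hypothetical; check the LLD documentation for the exact option spellings), a secure entry function is marked with the cmse_nonsecure_entry attribute and compiled with -mcmse, and the linker can then emit the import library describing the generated secure gateway veneers:

// secure_api.c - hypothetical secure-side entry function.
//   clang --target=armv8m.main-none-eabi -mcmse -O2 -c secure_api.c
// Link sketch (options per the CMSE support added in LLD 17):
//   ld.lld --cmse-implib --out-implib=secure_implib.lib secure_api.o -o secure.elf
#include <arm_cmse.h>

__attribute__((cmse_nonsecure_entry))
int secure_get_counter(void) {
  static int counter;
  return ++counter;  // called from non-secure code via its secure gateway veneer
}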
Simi Pallipurath
With the addition of big-endian support in LLD, LLVM 17 can generate big-endian format executables for Arm and AArch64. The Arm architecture defines the following big-endian modes:

Arm:
- BE8: data is big-endian while instructions are little-endian. The ELF file is marked ELFDATA2MSB.
- BE32: both data and instructions are big-endian.

AArch64 only has one big-endian mode, equivalent to BE8: data is big-endian, while instructions remain little-endian.

When linking a big-endian image for Arm, clang selects between the BE8 and BE32 formats. The default depends on the selected target architecture: for Armv6 and later architectures the default is BE8, while for older architectures the default is BE32.
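As a rough sketch of the commands involved (the triples and file names are illustrative, and exact defaults depend on the toolchain setup), big-endian objects are produced by the compiler and the BE8/BE32 choice is made when the image is linked:

# illustrative only
clang --target=armv7a-none-eabi -mbig-endian -O2 -c be.c       # big-endian Armv7 object
ld.lld --be8 be.o -o be.elf                                    # Armv6 and later default: BE8 image
clang --target=aarch64_be-none-elf -O2 -c be64.c               # AArch64 big-endian object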
Kiran Chandramohan
Fortran permits passing arrays as arguments where the formal (dummy) argument takes on the extents of the actual argument. Strides of an array can also be passed in this way. The default code generation always has to account for the fact that there could be a stride, and addressing the elements of such an array in a loop involves fetching the stride from the array's descriptor. A pass was added to LLVM 17 to version loops operating on assumed-shape arrays: separate versions are created for the contiguous case and the strided case. Contiguous arrays are easier to transform and vectorize, while strided arrays can be vectorized with scatter-gather instructions but the performance might not be good. This new pass helped to improve the performance of 554.roms significantly.

Fortran runtime functions in Flang are written in a generic fashion to cater for various types, arrays, and so on. Generating specialized versions of these functions improves performance and makes it easier to further optimize and inline them. Inlining of Count, Any, All and Minloc provided the most significant improvement for 548.exchange2.
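To make the assumed-shape case described above concrete, here is a minimal Fortran sketch (the subroutine is hypothetical): the dummy argument a(:) carries its extent and stride in a descriptor, so the new loop versioning pass can emit a fast path for the case where a is contiguous alongside the general strided path:

! scale.f90 - hypothetical example of a loop over an assumed-shape array
subroutine scale(a, s)
  real, intent(inout) :: a(:)   ! assumed-shape: extent and stride come from the caller
  real, intent(in)    :: s
  integer :: i
  do i = 1, size(a)
    a(i) = a(i) * s             ! versioned: contiguous fast path vs. strided general path
  end do
end subroutine scale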
Comdat support was added for globals so that they work on Windows. To achieve this in Flang, Comdat support was added to the MLIR LLVM dialect. Support was also added for a few Fortran language intrinsics, such as move_alloc, tand, atand, ieee_is_normal, isnan, and ieee_is_nan.
Further, some OpenMP issues were fixed and privatization support was added for allocatables and pointers.