This blog post describes how to implement the same matrix-matrix multiplication algorithm using three different Arm technologies: Neon, SVE, and SME.
Presenting these three examples together highlights some key differences between the technologies and is intended to help developers who want to port code from Neon or SVE/SVE2 to SME/SME2.
The Neon, SVE/SVE2, and SME/SME2 technologies were introduced into the Arm architecture as follows:
These SIMD architecture extensions provide instructions that accelerate a wide range of applications including:
The examples in this blog use intrinsics, which are functions provided by the compiler that correspond to specific Arm instructions. This enables the programmer to write their entire program in C code rather than assembly language.
All 3 of the examples in this blog implement matrix-matrix multiplication. Matrix multiplication takes two input matrices and produces one result matrix by multiplying each element of a row in the first matrix by the corresponding element of a column from the second matrix and summing these products. The result matrix’s dimensions are determined by the number of rows in the first matrix and the number of columns in the second matrix. For example, a 3 x 2 matrix multiplied by a 2 x 3 matrix results in a 3 x 3 matrix.
To multiply two matrices A and B, the number of columns in matrix A must be equal to the number of rows in matrix B. Multiplying matrices A and B results in matrix C.
This example uses Neon intrinsics to perform matrix-matrix multiplication.
The code does the following:
Here is the example code using Neon intrinsics:
/*
 * Multiply matrices A and B and store the result in C.
 *
 * All matrices are held column-major: A is n x k, B is k x m, C is n x m.
 * Every dimension must be a multiple of 4 (fixed 4x4 blocking); it is the
 * caller's responsibility to ensure the matrices are compatible.
 */
void matrix_multiply_neon(float32_t *A, float32_t *B, float32_t *C, uint32_t n, uint32_t m, uint32_t k) {
    // Columns of the current 4x4 sub-matrices of A, B and C. A Neon
    // float32x4_t register holds exactly four 32-bit floats, hence the
    // fixed 4x4 block size.
    float32x4_t a0, a1, a2, a3;
    float32x4_t b0, b1, b2, b3;
    float32x4_t acc0, acc1, acc2, acc3;

    for (int i = 0; i < n; i += 4) {
        for (int j = 0; j < m; j += 4) {
            // Zero the accumulators before computing this block of C.
            acc0 = vmovq_n_f32(0);
            acc1 = vmovq_n_f32(0);
            acc2 = vmovq_n_f32(0);
            acc3 = vmovq_n_f32(0);

            for (int p = 0; p < k; p += 4) {
                // Base offsets of the current 4x4 blocks of A and B.
                int a_off = i + n * p;
                int b_off = k * j + p;

                // Load the four columns of the current A block.
                a0 = vld1q_f32(A + a_off);
                a1 = vld1q_f32(A + a_off + n);
                a2 = vld1q_f32(A + a_off + 2 * n);
                a3 = vld1q_f32(A + a_off + 3 * n);

                // Multiply-accumulate in 4x1 blocks, producing one
                // column of C at a time. Each lane of a B column scales
                // one column of the A block.
                b0 = vld1q_f32(B + b_off);
                acc0 = vfmaq_laneq_f32(acc0, a0, b0, 0);
                acc0 = vfmaq_laneq_f32(acc0, a1, b0, 1);
                acc0 = vfmaq_laneq_f32(acc0, a2, b0, 2);
                acc0 = vfmaq_laneq_f32(acc0, a3, b0, 3);

                b1 = vld1q_f32(B + b_off + k);
                acc1 = vfmaq_laneq_f32(acc1, a0, b1, 0);
                acc1 = vfmaq_laneq_f32(acc1, a1, b1, 1);
                acc1 = vfmaq_laneq_f32(acc1, a2, b1, 2);
                acc1 = vfmaq_laneq_f32(acc1, a3, b1, 3);

                b2 = vld1q_f32(B + b_off + 2 * k);
                acc2 = vfmaq_laneq_f32(acc2, a0, b2, 0);
                acc2 = vfmaq_laneq_f32(acc2, a1, b2, 1);
                acc2 = vfmaq_laneq_f32(acc2, a2, b2, 2);
                acc2 = vfmaq_laneq_f32(acc2, a3, b2, 3);

                b3 = vld1q_f32(B + b_off + 3 * k);
                acc3 = vfmaq_laneq_f32(acc3, a0, b3, 0);
                acc3 = vfmaq_laneq_f32(acc3, a1, b3, 1);
                acc3 = vfmaq_laneq_f32(acc3, a2, b3, 2);
                acc3 = vfmaq_laneq_f32(acc3, a3, b3, 3);
            }

            // Store the finished 4x4 block of C (column-major).
            int c_off = n * j + i;
            vst1q_f32(C + c_off, acc0);
            vst1q_f32(C + c_off + n, acc1);
            vst1q_f32(C + c_off + 2 * n, acc2);
            vst1q_f32(C + c_off + 3 * n, acc3);
        }
    }
}
This example uses the following Neon code features:
float32x4_t
vld1q_f32
vfmaq_laneq_f32
vst1q_f32
This example uses a fixed block size of 4x4. This means that the input matrices must be multiples of four in both dimensions. You can deal with other sizes of matrices by padding the matrices with zeroes.
For a more detailed explanation of this example see here.
This example uses SVE2 intrinsics to perform matrix-matrix multiplication.
The main difference between the Neon example and this SVE2 example is that SVE2 uses variable length vectors. While the Neon example uses a fixed block size of 4x4 to match the four 32-bit values that fit in a Neon register, we do not know the size of the SVE2 registers until runtime. This means that the code must be vector-length agnostic. The example uses predication to control the number of data values that are operated on by the SVE2 intrinsics. This means that the data values fit exactly in the SVE2 registers, no matter what register size has been implemented. The Neon example uses a 32-bit float data type, float32x4_t, where the 4 shows that each Neon register can contain four 32-bit values. The SVE2 example uses the svfloat32_t data type because the size of the SVE2 registers is not known until runtime.
svfloat32_t
svwhilelt
svld1
svmla
svst1
Here is the example code using SVE2 intrinsics:
/*
 * Multiply matrices A and B and store the result in C.
 *
 * All matrices are held column-major: A is n x k, B is k x m, C is n x m.
 * The row dimension n is handled in a vector-length-agnostic way via
 * predication; m and k are still processed in blocks of 4. It is the
 * caller's responsibility to ensure the matrices are compatible.
 */
void matrix_multiply_sve(const float32_t *A, const float32_t *B, float32_t *C, uint32_t n, uint32_t m, uint32_t k) {
    // Columns of the current sub-matrices. svfloat32_t is sizeless:
    // the number of lanes is fixed by the hardware vector length at
    // runtime, not by the type.
    svfloat32_t a0, a1, a2, a3;
    svfloat32_t b0, b1, b2, b3;
    svfloat32_t acc0, acc1, acc2, acc3;

    // Advance by svcntw() rows at a time (the number of 32-bit lanes
    // in a vector on this implementation).
    for (int i = 0; i < n; i += svcntw()) {
        // Predicate enabling only the lanes for rows [i, n).
        svbool_t pred = svwhilelt_b32_u32(i, n);

        for (int j = 0; j < m; j += 4) {
            // Zero the accumulators before computing this block of C.
            acc0 = svdup_n_f32(0);
            acc1 = svdup_n_f32(0);
            acc2 = svdup_n_f32(0);
            acc3 = svdup_n_f32(0);

            for (int p = 0; p < k; p += 4) {
                // Base offsets of the current blocks of A and B.
                int a_off = i + n * p;
                int b_off = k * j + p;

                // Load the four columns of the current A block.
                a0 = svld1_f32(pred, A + a_off);
                a1 = svld1_f32(pred, A + a_off + n);
                a2 = svld1_f32(pred, A + a_off + 2 * n);
                a3 = svld1_f32(pred, A + a_off + 3 * n);

                // Multiply-accumulate in blocks, producing one column
                // of C at a time. svld1rq replicates a 128-bit quadword
                // of B so that svmla_lane can index its lanes.
                b0 = svld1rq_f32(svptrue_b32(), B + b_off);
                acc0 = svmla_lane_f32(acc0, a0, b0, 0);
                acc0 = svmla_lane_f32(acc0, a1, b0, 1);
                acc0 = svmla_lane_f32(acc0, a2, b0, 2);
                acc0 = svmla_lane_f32(acc0, a3, b0, 3);

                b1 = svld1rq_f32(svptrue_b32(), B + b_off + k);
                acc1 = svmla_lane_f32(acc1, a0, b1, 0);
                acc1 = svmla_lane_f32(acc1, a1, b1, 1);
                acc1 = svmla_lane_f32(acc1, a2, b1, 2);
                acc1 = svmla_lane_f32(acc1, a3, b1, 3);

                b2 = svld1rq_f32(svptrue_b32(), B + b_off + 2 * k);
                acc2 = svmla_lane_f32(acc2, a0, b2, 0);
                acc2 = svmla_lane_f32(acc2, a1, b2, 1);
                acc2 = svmla_lane_f32(acc2, a2, b2, 2);
                acc2 = svmla_lane_f32(acc2, a3, b2, 3);

                b3 = svld1rq_f32(svptrue_b32(), B + b_off + 3 * k);
                acc3 = svmla_lane_f32(acc3, a0, b3, 0);
                acc3 = svmla_lane_f32(acc3, a1, b3, 1);
                acc3 = svmla_lane_f32(acc3, a2, b3, 2);
                acc3 = svmla_lane_f32(acc3, a3, b3, 3);
            }

            // Store the finished block of C under the same row predicate.
            int c_off = n * j + i;
            svst1_f32(pred, C + c_off, acc0);
            svst1_f32(pred, C + c_off + n, acc1);
            svst1_f32(pred, C + c_off + 2 * n, acc2);
            svst1_f32(pred, C + c_off + 3 * n, acc3);
        }
    }
}
This example uses the following SVE2 code features:
svwhilelt_b32_u32
uint32_t
svld1_f32
svptrue_b32
svld1rq_f32
svmla_lane_f32
svst1_f32
This example uses SME2 assembly instructions to perform matrix-matrix multiplication.
The differences between this SME2 example and the other examples are as follows:
fmopa
Streaming SVE mode, entered using the smstart instruction, enables the SME2 instructions as well as the ZA storage.
smstart
This example takes input matrices matLeft and matRight. The example uses the fact that multiplying two matrices together is the same as summing the outer products for each column of matLeft and each row of matRight in turn.
matLeft
matRight
The initial input matrices are stored in memory as row-major arrays. Matrix multiplication is performed as the sum of the outer product of one column from matLeft and one row from matRight. Because the outer product requires column elements from matLeft, the code rearranges the matLeft data so that the column elements are stored contiguously in memory. This data rearrangement is not shown here for the sake of brevity but can be seen in the SME programmers guide.
This example contains three nested loops:
Matrix data is loaded to ZA storage from memory using ld1w instructions.
ld1w
The outer product calculation uses the fmopa instruction. Each fmopa instruction reads two SVE Z input vectors and updates an entire SME ZA tile with the result.
2D predication ensures that the bounds of the matrices are not exceeded.
Finally, the st1w instructions write the result from ZA storage to memory.
st1w
This is only a very brief overview of the example. For more details about how the code operates, line-by-line, see the SME programmers guide.
// matmul_opt: SME2 matrix multiply using 2x2 ZA-tile blocking.
// The K loop is software-pipelined: loads for the next iteration are
// interleaved with the fmopa outer products of the current one, so the
// instruction ORDER below is deliberate.
matmul_opt:
    // matmul_opt(M, K, N, matLeft, matRight, matResult_opt);
    // x0 : M
    // x1 : K, lda
    // x2 : N, ldc, ldb
    // x3 : &matLeft
    // x4 : &matRight
    // x5 : &matResult_opt
    // x6 : SVLs-2
    // x7 : a_ptr pointer
    // x8 : a_ptr end address
    // x9 : c_base pointer
    // x10: c_ptr0 pointer
    // x11: Exit condition for N loop
    // x12: M loop counter
    // x13: Store loop counter
    // x14: Predicate select index
    // x15: Exit condition for K loop
    // x16: b_base pointer
    // x17: b_ptr pointer
    // x18: (SVLs+1)*ldc
    // x19: ldb + SVLs
    // x20: SVLs*lda + SVLs
    // x21: c_ptr1 pointer
    // x22: SVLs*lda
    // x23: SVLs*ldc
    // Assumptions:
    // nbr in matLeft (M): any
    // nbc in matLeft, nbr in matRight (K): any K > 2
    // nbc in matRight (N): any
    //
    // Left matrix is pre-arranged (columns stored contiguously).
    //
    // 32-bit accumulator mapping with 2x2 tiles processing

    // Save callee-saved registers used below.
    stp x19, x20, [sp, #-48]!
    stp x21, x22, [sp, #16]
    stp x23, x24, [sp, #32]

    // Enter streaming SVE mode; enables SME2 instructions and ZA storage.
    smstart

    // constants
    cntw x6                             // SVLs (streaming vector length in words)
    mul x22, x6, x1                     // SVLs*lda
    mul x23, x6, x2                     // SVLs*ldc
    add x18, x23, x2                    // SVLs*ldc + ldc
    add x11, x4, x2, lsl #2             // Exit condition for N loop
    mov x12, #0
    cntb x6                             // SVLb (streaming vector length in bytes)
    mov x14, #0
    ptrue pn10.b                        // Predicate as counter for SME2 VLx2 (a_ptr loads)
    whilelt pn8.s, x12, x0, vlx2        // tiles predicate (M dimension)
    sub w6, w6, #8                      // SVLb-8

.Loop_M:
    // Extracting tile 0/1 and tile 2/3 predicates (M dimension) from vlx2 predicate.
    pext { p2.s, p3.s }, pn8[0]
    mov x16, x4                         // b_base
    mov x9, x5                          // c_base
    whilelt pn9.b, x16, x11, vlx2       // tiles predicate (N dimension)

.Loop_N:
    mov x7, x3                          // a_ptr = a_base
    mov x17, x16                        // b_ptr = b_base
    mov x10, x9                         // c_ptr0 = c_base
    // Extracting tile 0/2 and tile 1/3 predicates (N dimension) from vlx2 predicate.
    pext { p0.b, p1.b }, pn9[0]
    add x8, x3, x22, lsl #2             // a_base + SVLs*lda FP32 elms [Bytes]
    addvl x15, x8, #-1                  // Exit condition for K loop
    // Prologue of the pipelined K loop: first loads and first outer product.
    ld1w {z1.s}, p2/z, [x7]             // Load 1st vector from a_ptr
    zero {za}                           // Clear all ZA accumulator tiles
    ld1w {z2.s-z3.s}, pn9/z, [x17]      // Load 2 vectors from b_ptr
    fmopa za0.s, p2/m, p0/m, z1.s, z2.s // ZA0 += 1st a_ptr vector OP 1st b_ptr vector
    ld1w {z5.s}, p3/z, [x7, x22, lsl #2] // Load 2nd vector from a_ptr
    addvl x7, x7, #1                    // a_ptr += SVLb [Bytes]

.Loop_K:
    fmopa za2.s, p3/m, p0/m, z5.s, z2.s // ZA2 += 2nd a_ptr vector OP 1st b_ptr vector
    ld1w {z3.s}, p1/z, [x17, #1, MUL VL] // Load 2nd vector from b_ptr
    fmopa za1.s, p2/m, p1/m, z1.s, z3.s // ZA1 += 1st a_ptr vector OP 2nd b_ptr vector
    ld1w {z0.s-z1.s}, pn10/z, [x7]      // Load next 2 vectors from a_ptr
    fmopa za3.s, p3/m, p1/m, z5.s, z3.s // ZA3 += 2nd a_ptr vector OP 2nd b_ptr vector
    ld1w {z6.s-z7.s}, pn9/z, [x17, x2, lsl #2] // Load next 2 vectors from b_ptr
    fmopa za0.s, p2/m, p0/m, z0.s, z6.s // ZA0 += 1st a_ptr vector OP 1st b_ptr vector
    psel pn11, pn10, p3.s[w14, 0]       // Select predicate-as-counter
    ld1w {z4.s-z5.s}, pn11/z, [x7, x22, lsl #2] // Load next 2 vectors from a_ptr
    fmopa za2.s, p3/m, p0/m, z4.s, z6.s // ZA2 += 2nd a_ptr vector OP 1st b_ptr vector
    add x17, x17, x2, lsl #3            // b_ptr += 2*ldb FP32 elms [Bytes]
    fmopa za1.s, p2/m, p1/m, z0.s, z7.s // ZA1 += 1st a_ptr vector OP 2nd b_ptr vector
    fmopa za3.s, p3/m, p1/m, z4.s, z7.s // ZA3 += 2nd a_ptr vector OP 2nd b_ptr vector
    ld1w {z2.s-z3.s}, pn9/z, [x17]      // Load next 2 vectors from b_ptr
    fmopa za0.s, p2/m, p0/m, z1.s, z2.s // ZA0 += 1st a_ptr vector OP 1st b_ptr vector
    addvl x7, x7, #2                    // a_ptr += 2*SVLb [Bytes]
    cmp x7, x15
    b.mi .Loop_K

    // Epilogue: drain the outer products still in flight.
    fmopa za2.s, p3/m, p0/m, z5.s, z2.s // ZA2 += 2nd a_ptr vector OP 1st b_ptr vector
    fmopa za1.s, p2/m, p1/m, z1.s, z3.s // ZA1 += 1st a_ptr vector OP 2nd b_ptr vector
    fmopa za3.s, p3/m, p1/m, z5.s, z3.s // ZA3 += 2nd a_ptr vector OP 2nd b_ptr vector
    add x17, x17, x2, lsl #2            // b_ptr += 2*ldb FP32 elms [Bytes]
    cmp x7, x8
    b.pl .Ktail_end

    // Handle a leftover K iteration when K is odd relative to the unrolling.
.Ktail_start:
    ld1w {z1.s}, p2/z, [x7]
    ld1w {z2.s-z3.s}, pn9/z, [x17]
    fmopa za0.s, p2/m, p0/m, z1.s, z2.s
    ld1w {z5.s}, p3/z, [x7, x22, lsl #2]
    fmopa za2.s, p3/m, p0/m, z5.s, z2.s
    fmopa za1.s, p2/m, p1/m, z1.s, z3.s
    fmopa za3.s, p3/m, p1/m, z5.s, z3.s

.Ktail_end:
    // Write the four ZA tiles out to the result matrix, two ZA rows at a time.
    mov w13, #0
    psel pn11, pn9, p2.b[w13, 0]
    psel pn12, pn9, p3.b[w13, 0]
    // Move from ZA tiles to vectors: z0 = za0h[1], z1 = za1h[1], z2 = za2h[1], z3 = za3h[1]
    mova { z0.b-z3.b }, za0h.b[w13, 0:3]
    st1w { z0.s-z1.s }, pn11, [x10]     // Store to c_ptr0
    st1w { z2.s-z3.s }, pn12, [x10, x23, lsl #2] // Store to c_ptr0 + SVLs*ldc

.Loop_store_ZA:
    psel pn11, pn9, p2.b[w13, 4]
    psel pn12, pn9, p3.b[w13, 4]
    mova { z0.b-z3.b }, za0h.b[w13, 4:7]
    st1w { z0.s-z1.s }, pn11, [x10, x2, lsl #2]  // Store to c_ptr0 + ldc
    st1w { z2.s-z3.s }, pn12, [x10, x18, lsl #2] // Store to c_ptr0 + (SVLs+1)*ldc
    add x10, x10, x2, lsl #3            // c_ptr0 += 2*ldc FP32 elms [Bytes]
    add w13, w13, #8
    psel pn11, pn9, p2.b[w13, 0]
    psel pn12, pn9, p3.b[w13, 0]
    mova { z0.b-z3.b }, za0h.b[w13, 0:3]
    st1w { z0.s-z1.s }, pn11, [x10]     // Store to c_ptr0
    st1w { z2.s-z3.s }, pn12, [x10, x23, lsl #2] // Store to c_ptr0 + SVLs*ldc
    cmp w13, w6
    b.mi .Loop_store_ZA

    psel pn11, pn9, p2.b[w13, 4]
    psel pn12, pn9, p3.b[w13, 4]
    mova { z0.b-z3.b }, za0h.b[w13, 4:7]
    st1w { z0.s-z1.s }, pn11, [x10, x2, lsl #2]  // Store to c_ptr0 + ldc
    st1w { z2.s-z3.s }, pn12, [x10, x18, lsl #2] // Store to c_ptr0 + (SVLs+1)*ldc

    // Advance to the next pair of tile columns (N dimension).
    addvl x9, x9, #2
    addvl x16, x16, #2                  // b_base += 2*SVLb [Bytes]
    whilelt pn9.b, x16, x11, vlx2       // tile predicate (N dimension)
    b.first .Loop_N

    // Advance to the next pair of tile rows (M dimension).
    add x3, x3, x22, lsl #3             // a_base += 2*SVLs*lda FP32 elms [Bytes]
    add x5, x5, x23, lsl #3             // c_base += 2*SVLs*ldc FP32 elms [Bytes]
    incw x12, all, mul #2               // M loop counter += 2*SVLs
    whilelt pn8.s, x12, x0, vlx2        // tiles predicate (M dimension)
    b.first .Loop_M

    // Leave streaming mode and restore callee-saved registers.
    smstop
    ldp x23, x24, [sp, #32]
    ldp x21, x22, [sp, #16]
    ldp x19, x20, [sp], #48
    ret
    .size matmul_opt, .-matmul_opt
Neon
SVE/SVE2
SME/SME2
Is there any performance data for each method?