Segfault when parallelizing a CMSIS-NN function with OpenMP

I'm trying to run the following segment of code, which is part of Arm's CMSIS-NN library, on multiple threads, but I get a segmentation fault as soon as I add the OpenMP pragmas. The code is available here:

#pragma omp parallel for collapse(2) shared(pOut) firstprivate(pBuffer, dim_im_out, stride, padding, dim_kernel, dim_im_in, out_shift, bias, ch_im_out) for (i_out_y = 0; i_out_y < dim_im_out; i_out_y++) { for (i_out_x = 0; i_out_x < dim_im_out; i_out_x++) { for (i_ker_y = i_out_y * stride - padding; i_ker_y < i_out_y * stride - padding + dim_kernel; i_ker_y++) { for (i_ker_x = i_out_x * stride - padding; i_ker_x < i_out_x * stride - padding + dim_kernel; i_ker_x++) { printf("_%d",omp_get_thread_num()); if (i_ker_y < 0 || i_ker_y >= dim_im_in || i_ker_x < 0 || i_ker_x >= dim_im_in) { /* Equivalent to arm_fill_q15(0, pBuffer, ch_im_in) with assumption: ch_im_in = 3 */ *__SIMD32(pBuffer) = 0x0; *(pBuffer + 2) = 0; pBuffer += 3; } else { /* * Equivalent to: * arm_q7_to_q15_no_shift( (q7_t*)Im_in+(i_ker_y*dim_im_in+i_ker_x)*3, pBuffer, 3); */ const q7_t *pPixel = Im_in + (i_ker_y * dim_im_in + i_ker_x) * 3; q31_t buf = arm_nn_read_q7x4(pPixel); union arm_nnword top; union arm_nnword bottom; top.word = __SXTB16(buf); bottom.word = __SXTB16(__ROR(buf, 8)); *pBuffer++ = top.half_words[0]; *__SIMD32(pBuffer) = __PKHBT(bottom.word, top.word, 0); pBuffer += 2; } } } #pragma omp critical if (pBuffer == bufferA + 2 * 3 * dim_kernel * dim_kernel) { pOut = arm_nn_mat_mult_kernel_q7_q15(wt, bufferA, ch_im_out, 3 * dim_kernel * dim_kernel, bias_shift, out_shift, bias, pOut); /* counter reset */ pBuffer = bufferA; } } }

It looks like execution proceeds normally up to a certain point, after which the ARM CPU seems to lose track of the work while scheduling the threads:

... Application end!

[Parallel] RUN: Startup Convolution - Layer 1 _0_0_0_0_0_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3_1_1_1_1_1_3_3_3_3_3_0_0_0_0_0_3_3_3_3_3_1_1_1_1_1_2_2_3_3_3_3_3_1_1_1_1_1_2_2_2_2_2_0_0_0_0_0_3_3_3_3_3_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_2_2_2_2_2_0_0_0_0_0_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_1_1_1_1_1_2_2_2_2_2_0_0_0_0_0_2_2_2_2_2_1_1_1_1_1_3_3_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_0_0_0_0_0_3_3_3_3_3_0_0_0_0_0_3_3_3_3_3_2_2_2_2_2_0_0_0_0_0_3_3_3_3_3_0_0_0_0_0_3_3_3_3_3_1_1_1_1_1_3_3_3_3_3_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3_2_2_2_2_2_0_0_0_0_0_1_1_1_1_1_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3_2_2_2_2_2_3_3_3_3_3_1_1_1_1_1_0_0_0_0_0_2_2_2_2_2_1_1_1_1_1_3_3_3_3_3Segmentation fault