Skip navigation


1 2 3 Previous Next

ARM Processors

393 posts

NE10 fft_float32 result wrong

Posted by ufo Oct 20, 2016

When I use an ARM Cortex-A9 CPU with NEON to test the NE10 library, I get a wrong FFT result.

My CPU is an NXP i.MX6Q running at 1 GHz; my program is compiled with gcc-4.6.2.

The NE10 library is compiled with arm-linux-gnueabihf-gcc 4.9; the output library files are both shared and static libraries.

My test code :

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include <string.h>


#include "NE10_dsp.h"

#include "NE10_macros.h"

#include "seatest.h"

#include "unit_test_common.h"

#include "ne_alloc.h"

/* ----------------------------------------------------------------------

** Global defines

** ------------------------------------------------------------------- */

#define TEST_FREQ (50)

#define TEST_LENGTH_SAMPLES (1024)


/* ----------------------------------------------------------------------

** Test input data for F32

** Generated by the MATLAB rand() function

** ------------------------------------------------------------------- */


static ne10_float32_t testInput_f32[TEST_LENGTH_SAMPLES * 2];

static ne10_float32_t out_amp_f32[TEST_LENGTH_SAMPLES * 2];


static ne10_float32_t y_out[TEST_LENGTH_SAMPLES];

/* ----------------------------------------------------------------------

** Defines each of the tests performed

** ------------------------------------------------------------------- */


//input and output

static ne10_float32_t * in_c = NULL;

static ne10_float32_t * in_neon = NULL;


static ne10_float32_t * out_c = NULL;

static ne10_float32_t * out_neon = NULL;


static ne10_fft_cfg_float32_t cfg_c;

static ne10_fft_cfg_float32_t cfg_neon;


void genarate_signal(float *complex_float_list,int freq,int total_num)


       int ii;

       for(ii = 0;ii<total_num;ii++)


            complex_float_list[2*ii] = 100*(float)cos(2*ii*PI*freq/total_num);

            complex_float_list[2*ii+1] = 0;




void test_fft_c2c_1d_float32_performance()


    ne10_int32_t fftSize = TEST_LENGTH_SAMPLES;

    ne10_int32_t flag_result = NE10_OK;

    fprintf (stdout, "----------%30s start\n", __FUNCTION__);

  /* FFT test */

  memcpy (in_c, testInput_f32, 2 * fftSize * sizeof (ne10_float32_t));

  memcpy (in_neon, testInput_f32, 2 * fftSize * sizeof (ne10_float32_t));

    cfg_c = ne10_fft_alloc_c2c_float32_c (fftSize);

    if (cfg_c == NULL)


        fprintf (stdout, "======ERROR, FFT alloc fails\n");


    cfg_neon = ne10_fft_alloc_c2c_float32_neon (fftSize);

    if (cfg_neon == NULL)


        NE10_FREE (cfg_c);

        fprintf (stdout, "======ERROR, FFT alloc fails\n");


  ne10_fft_c2c_1d_float32_neon ( (ne10_fft_cpx_float32_t*) out_neon, (ne10_fft_cpx_float32_t*) in_neon, cfg_neon, 0);

  ne10_vmul_vec2f_neon(out_amp_f32, (ne10_vec2f_t *)out_neon, (ne10_vec2f_t *)out_neon, fftSize);

  NE10_FREE (cfg_c);

  NE10_FREE (cfg_neon);



static void my_test_setup (void)


    ne10_log_buffer_ptr = ne10_log_buffer;

    /* init input memory */

    in_c  = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 ) * sizeof (ne10_float32_t));

    in_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES ) * sizeof (ne10_float32_t));

    /* init dst memory */

    out_c = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2) * sizeof (ne10_float32_t));

    out_neon = (ne10_float32_t*) NE10_MALLOC ( (TEST_LENGTH_SAMPLES * 2 ) * sizeof (ne10_float32_t));





void Test_float_1024()


  uint32_t index = 0;

  uint32_t i = 0;

  float *p =out_amp_f32;



     /* calculate peak value*/


       y_out[i] = sqrtf( out_amp_f32[2*i]+out_amp_f32[2*i+1] ) *2/TEST_LENGTH_SAMPLES;



  p =y_out;


  fprintf (stdout, "%4d--%f   %f   %f   %f   %f   %f   %f   %f\n",\









       *(p+7)     );



  index = search_MaxIdx(y_out,TEST_LENGTH_SAMPLES);

  fprintf (stdout, "max point num is %d  = %f\n",index,y_out[index]);




/* Driver: initialise NE10, generate the test tone, run the NEON FFT test
 * and print the amplitude spectrum.
 *
 * BUG FIX: the original re-tested 'stat' after ne10_init_math() and
 * ne10_init_dsp() instead of testing math_stat / dsp_stat, so those two
 * "OK" messages did not reflect the actual init results. The calls to the
 * test functions (proven to run by the captured terminal output) were lost
 * in the forum paste and are restored here.
 */
int main (ne10_int32_t argc, char** argv)
{
  ne10_result_t stat;
  ne10_result_t math_stat;
  ne10_result_t dsp_stat;

  stat = ne10_init();
  if(stat == NE10_OK)
  {
      printf("ne10_init OK!\n");
  }

  math_stat = ne10_init_math (stat);
  if(math_stat == NE10_OK)          /* was: stat */
  {
      printf("ne10_init_math OK!\n");
  }

  dsp_stat = ne10_init_dsp (stat);
  if(dsp_stat == NE10_OK)           /* was: stat */
  {
      printf("ne10_init_dsp OK!\n");
  }

  stat = ne10_HasNEON();
  if(stat == NE10_OK)
  {
      printf("cpu with neon!\n");
  }

  /* Reconstructed test sequence, matching the captured program output. */
  my_test_setup();
  genarate_signal(testInput_f32, TEST_FREQ, TEST_LENGTH_SAMPLES);
  test_fft_c2c_1d_float32_performance();
  Test_float_1024();

  return 0;
}

The test result is put out from teminal:

[root@EmbedSky /mnt]# ./A9_test

ne10_init OK!

ne10_init_math OK!

ne10_init_dsp OK!

cpu with neon!

----------test_fft_c2c_1d_float32_performance start

   0--0.000000   0.000003   2.958737   0.000017         0.000001   0.000005   12.166712   0.000026

   1--0.000002   0.000002   3.526581   0.000015         0.000004   0.000012   17.811594   0.000039

   2--0.000001   0.000006   13.803596   0.000021                0.000002   0.000009   22.367292   0.000042

   3--0.000003   0.000004   6.788651   0.000026         0.000005   0.000028   70.067863   0.000124

   4--0.000003   0.000008   33.308075   0.000018                0.000006   0.000013   38.638416   0.000077

   5--0.000013   0.000014   11.727892   0.000059                0.000022   0.000083   72.452492   0.000262

   6--0.000005   0.000029   67.650848   0.000221                0.000011   0.000082   212.321381   0.000394

   7--0.000030   0.000015   74.159584   0.000180                0.000054   0.000121   357.053802   0.000500

   8--0.000001   0.000003   15.615634   0.000014                0.000002   0.000001   4.653375   0.000025

   9--0.000005   0.000007   7.290600   0.000033         0.000014   0.000050   33.152935   0.000153

  10--0.000003   0.000016   31.989527   0.000065                0.000004   0.000024   68.505203   0.000097

  11--0.000010   0.000015   52.914970   0.000014                0.000018   0.000069   268.941742   0.000185

  12--0.000009   0.000036   55.972534   0.000130                0.000009   0.000040   101.129539   0.000211

  13--0.000016   0.000015   40.044838   0.000113                0.000028   0.000085   240.823975   0.000331

  14--0.000017   0.000058   260.015289   0.000385               0.000046   0.000208   580.652283   0.000964

  15--0.000101   0.000039   234.310699   0.000321               0.000168   0.000232   845.263184   0.000540

  16--0.000002   0.000009   43.014130   0.000061                0.000005   0.000025   65.147423   0.000105

  17--0.000008   0.000002   25.115501   0.000043                0.000015   0.000030   81.902115   0.000142

  18--0.000003   0.000020   48.622169   0.000054                0.000008   0.000025   74.208450   0.000105

  19--0.000019   0.000021   48.614277   0.000050                0.000033   0.000123   247.859283   0.000346

  20--0.000007   0.000040   15.095722   0.000139                0.000001   0.000032   86.764946   0.000143

  21--0.000006   0.000018   68.452156   0.000052                0.000015   0.000070   336.700592   0.000229

  22--0.000012   0.000054   202.304031   0.000192               0.000042   0.000135   390.701294   0.000626

  23--0.000085   0.000059   164.578522   0.000215               0.000136   0.000302   543.785278   0.000656

  24--0.000003   0.000012   81.045288   0.000058                0.000014   0.000039   121.416832   0.000143

  25--0.000034   0.000044   105.488533   0.000096               0.000064   0.000259   550.350098   0.000758

  26--0.000014   0.000036   14.541925   0.000290                0.000015   0.000054   116.387253   0.000232

  27--0.000055   0.000138   299.356995   0.000409               0.000109   0.000758   1779.936035   0.002434

  28--0.000026   0.000074   258.019379   0.000446               0.000049   0.000043   49.417068   0.000408

  29--0.000076   0.000143   239.313828   0.000484               0.000109   0.000691   1362.538452   0.002145

  30--0.000041   0.000058   516.449036   0.000623               0.000059   0.000216   619.675476   0.000774

  31--0.000088   0.000172   520.821411   0.000269               0.000125   0.000645   2337.425049   0.001845

  32--0.000000   0.000003   3.589420   0.000019         0.000000   0.000006   13.843405   0.000028

  33--0.000001   0.000003   10.298284   0.000018                0.000002   0.000019   66.461250   0.000077

  34--0.000000   0.000006   19.241838   0.000023                0.000004   0.000008   21.834381   0.000055

  35--0.000010   0.000015   20.904856   0.000062                0.000017   0.000093   146.907822   0.000297

  36--0.000001   0.000011   40.437374   0.000021                0.000007   0.000013   43.941502   0.000064

  37--0.000015   0.000018   29.845486   0.000058                0.000025   0.000095   120.085327   0.000260

  38--0.000003   0.000036   37.524120   0.000216                0.000013   0.000072   181.527771   0.000368

  39--0.000032   0.000024   39.786701   0.000189                0.000055   0.000155   215.084381   0.000531

  40--0.000001   0.000004   10.670328   0.000016                0.000001   0.000000   5.063481   0.000018

  41--0.000004   0.000006   8.243680   0.000024         0.000011   0.000038   35.816643   0.000103

  42--0.000003   0.000013   24.408234   0.000055                0.000003   0.000020   52.543468   0.000083

  43--0.000006   0.000008   35.756958   0.000019                0.000011   0.000034   181.120407   0.000102

  44--0.000007   0.000025   48.517586   0.000088                0.000008   0.000031   79.276314   0.000164

  45--0.000014   0.000012   23.394804   0.000085                0.000024   0.000065   140.189728   0.000241

  46--0.000012   0.000047   186.442764   0.000310               0.000033   0.000159   435.293610   0.000740

  47--0.000074   0.000024   165.113083   0.000267               0.000125   0.000183   584.355713   0.000478

  48--0.000001   0.000008   28.567337   0.000046                0.000003   0.000018   47.600296   0.000078

  49--0.000007   0.000000   16.499947   0.000032                0.000012   0.000025   52.054745   0.000093

  50--0.000002   0.000015   34.439335   0.000044                0.000006   0.000019   53.052429   0.000079

  51--0.000012   0.000011   28.535830   0.000026                0.000022   0.000066   130.810287   0.000164

  52--0.000005   0.000026   18.895275   0.000086                0.000001   0.000023   61.850578   0.000112

  53--0.000004   0.000007   38.872627   0.000039                0.000009   0.000021   183.325409   0.000079

  54--0.000008   0.000037   131.532867   0.000158               0.000027   0.000096   275.315094   0.000453

  55--0.000056   0.000033   104.326767   0.000165               0.000089   0.000184   326.340057   0.000415

  56--0.000002   0.000010   50.334133   0.000046                0.000008   0.000028   81.778809   0.000112

  57--0.000020   0.000020   54.890594   0.000048                0.000037   0.000126   259.453979   0.000344

  58--0.000007   0.000025   24.627508   0.000158                0.000004   0.000034   79.085579   0.000139

  59--0.000021   0.000059   140.300293   0.000177               0.000043   0.000328   820.769775   0.001047

  60--0.000014   0.000037   122.276161   0.000216               0.000023   0.000031   54.020184   0.000236

  61--0.000034   0.000062   100.214119   0.000234               0.000046   0.000304   602.250610   0.000944

  62--0.000021   0.000007   280.720703   0.000308               0.000033   0.000144   411.743835   0.000576

  63--0.000058   0.000071   271.125824   0.000097               0.000090   0.000225   1149.802368   0.000572

max point num is 254  = 2337.425049


By using the function genarate_signal(testInput_f32,TEST_FREQ,TEST_LENGTH_SAMPLES), the correct result is 20 Hz and the peak is 100.

Can anyone tell me how I can get the correct FFT float32 result? Thank you very much!

Prototyping an ARMv8-based design is similar to prototyping any other design.  FPGA prototyping for these types of application is generally used to validate the hardware quickly to head into the software development stage sooner and accelerate the software development. 


Whether you need scalability for your current design as you move through the design and verification process or whether you need your FPGA platform to be reusable and able to scale for future designs that may be larger than your current one, it all starts with identifying and selecting the ideal building blocks. The foundational prototyping board you choose must have flexibility to expand so a custom platform is usually out of the question as a custom board requires even greater customization to grow. When crafting your platform, there are three initial FPGA building blocks to evaluate: Single FPGA boards, Dual FPGA boards, and Quad FPGA boards.


Selecting either a single, dual, or quad board depends on your design’s size, memory requirements, and the number of inter-FPGA connections and external I/Os that will best fit your needs. The chart below provides an example of the differences in these board types based on S2C’s solutions for its Virtex UltraScale Logic Modules.


These comparisons don’t tell the whole story though. You must take a closer look at the architecture for each of these solutions.  Besides the number of physical interconnections between FPGAs, the type (e.g. DDR3, DDR4) and capacity (e.g. 4GB, 8GB) of on-board memory is equally important to your design. Of additional interest should be the number of high-speed gigabit transceivers and their performance level. The diagrams below provide in-depth comparisons of each of the architectures for single, dual, and quad FPGA prototyping boards.


Page 8(1-1)-Diagram of a single FPGA module architecture.jpg


Diagram of a single FPGA module architecture


Page 8(2)-Diagram of a dual FPGA module architecture.jpg


Diagram of a dual FPGA module architecture




Page 9 - Diagram of a quad FPGA module architecture.jpg


Diagram of a quad FPGA module architecture




The type of I/O connectors used in the FPGA module may have a big impact on your design mapping and performance. First, they must be optimized for FPGA I/O banks, and even the FPGA die, in case some FPGAs have multiple internal die. In addition, having I/Os from different die will decrease performance. All traces from the FPGA to the same I/O connector should have the same trace length to increase bus performance. Connector performance itself may also play an important role especially if the connectors are optimized for running high performance LVDS (e.g. over 1GHz).


It's All About Flexibility

The foundational prototyping board is the first step in building scalability. Each solution whether a single, dual, or quad system must allow you to grow, you must be able to have the flexibility to grow your single system into a dual, quad or beyond.  Likewise your dual system should allow you to stitch together other systems of the same FPGA type and architecture to create a quad system. 


Even with this flexibility, there are some implications to the number of interconnects and I/Os when stitching together these systems so careful consideration must be given to which system you initially choose. You will notice in the following diagrams that building these multi-FPGA systems require the ability for the boards to be connected via cables or interconnection modules.  These systems will also need some sort of external module to manage global clocking and reset mechanisms.


Page 10(1) (Updated)- Connection of two single FPGA prototyping module.jpg


Connection of two single FPGA prototyping modules



Page 10(2) (Updated)- Connection of 4 single FPGA prototyping modules.jpg


Connection of 4 single FPGA prototyping modules




Going Beyond 4 FPGAs

What happens if your design needs require going beyond the use of either 4 single FPGAs, 2 dual FPGAs, or a quad FPGA system?  This increase in complexity triggers a whole new set of scalability questions.  These questions can be broken down into several categories.



How big of a desk or lab area do you need to work with a large number of FPGAs?

Although you can continue to stitch together multiple prototyping boards to expand beyond a quad system, your physical lab space may be limited making the connections of these boards much more complicated. Not only will you be dealing with space issues, but also the cabling of these systems will become very unwieldy.


Scalability & Flexibility

What if you require more logic and memory capacity or the system interfaces or memory types change?

Can you configure the large number of FPGA resources for multiple designs?

Because of the investment into large multiple board systems, these reusability type questions become important. It is much easier to invest in single board systems if the expectation is that the board will have limited use beyond the initial design, but when the initial design require the use of a larger prototyping system, your investment must consider possible changes in the prototyping environments and future project uses.


Global System Control

How do you provide low-skew clocks and resets to a large number of FPGAs that you are using for the same design?

Is there a way to easily download to FPGAs remotely and how fast is it?

Lower-end software can provide some sort of support for these questions but may miss some basic requirements. Furthermore, the larger the overall hardware system, the more difficult it is to control such things as clocks and resets. Downloading for larger systems can be a cabling nightmare. Higher-end systems that offer complete runtime support and chassis with minimal cabling help reduce the pain dramatically.


Power Supply

How do you provide power to a large number of FPGAs?

Can each FPGA be individually controlled (On/Off/Recycle)? Is there a power-monitoring feature that you can employ?

Providing power individually to each board can impose even more physical lab space issues not to mention complicating the management of powering each board.



How do you verify that all your clocks and interconnections are correct?

Is there an easy way to monitor the system as well as the individual FPGA statuses?

Making sure a complex prototyping system as large as 32 FPGA works correctly is extremely difficult without automation. If a design isn’t running correctly, a great deal of time can be wasted trying to manually determine if the error is due to the design itself or the FPGA system. Software that provides automated self-test capabilities as well as automated voltage, current, and temperature monitoring with shut down will provide much needed peace of mind.


Seeing FPGA Prototyping for Juno In Action

S2C will be demonstrating their latest Prodigy Juno ARM Interface Module for FPGA prototyping at the upcoming ARM TechCon 2016 so that you can get a close up view of how FPGA prototyping is done for a Juno-based design. 


S2C provides a complete easy set up reference design as part of the Prodigy Juno ARM Interface Module package. It connects S2C Prodigy Virtex UltraScale and Kintex UltraScale Logic Modules with the Juno ARM Development Platform. The reference design shows:


  1. 1) Comprehensive self-testing between the two environments
  2. 2) Expanded FPGA capacity
  3. 3) Early porting of OS kernel or driver code for ARMv8-A processors
  4. 4) High-speed DDR4 memory access between the Logic Module(s) and Juno ARM Development Platform

ARMTechCon 2016 is nearly upon us, and there’s so much high quality technical content in the conference programme and exhibition floor that the only difficulty you’ll have is choosing what to attend at the Santa Clara convention centre! Registrations are still open, and if you need any more convincing then check out this useful guide on how to convince your boss to send you!


For those of you interested in the automotive industry and how ARM-based technology fits in, I’ve pulled together a list of what I think are the conference highlights in this space. Let me know what you’re most looking forward to in the comments section below!








At the ARM booth, #402, you will find a large demo that will show some of the technology and applications we can expect to see in the coming 5-10 years, ARM's idea of how we might be spending our time on the road. Built by ARM's specialist demo team, it addresses the driver experience and what that means for in-vehicle infotainment and driver safety. Come on down and talk to one of the staff nearby, and they will be able to show you applications such as:


  • Autonomous driving mode on the dashboard: Steering wheel retracts and the full dashboard area can be used for apps such as Office, watching movies, music playback
  • Proximity warning: The dash display glows red to indicate the presence of people near the front of the unit
  • Sign recognition: Cameras at the front see and perceive a sign, displaying it on the dashboard UI



Green Hills Software solution demonstrations at ARM TechCon in booth 313, October 26-27, will highlight several of the company’s products and services across several embedded industries using 32-bit and 64-bit platforms based on ARM Cortex-A, Cortex-R and Cortex M. One of the demonstrations Safe & Secure eCockpit Consolidation shows Green Hills’ unique run-time separation architecture that safely and securely executes guest operating systems such as Linux and Android on the same processor as ASIL-certified safety-critical tasks; running concurrently on the same core or across multiple cores, while securely sharing resources such as the GPU.




Technical presentations




Chris Turner (ARM) Developing safe and secure SoCs for automotive, robotics and healthcare Tuesday Oct 25th 10.30am – 11.20am Ballroom E

Cars, robots, medical and other devices rely on ARM technology for continuous safe operation according to guidance given by standards such as IEC 61508 and ISO 26262. Security is equally important for these applications.


This presentation describes how ARM approaches development of processors for such safety-related applications and the hardware and software features for fault detection and control that may be employed by device designers and application engineers. christurner will discuss various processors as a heterogeneous multi-processing system is often required to meet all the performance, efficiency and functional safety requirements for applications such as highly-automated driving.


James Scobie (ARM) Addressing the Challenges of Complex Control in Functionally Safe Applications Tuesday Oct 25th 11.30am – 12.20pm Ballroom H

The ARMv8-R architecture is designed to improve safety, security and reliability in Embedded control systems. This presentation describes the features and configurations offered by ARM Cortex-R processors that enable designers to deliver the ultimate in functional-safety capabilities for automotive and industrial applications.


A microarchitecture is discussed that provides high performance combined with the deterministic execution and responsiveness required for hard real-time applications ranging from industrial controllers and powertrain system through to safety islands and sensor fusion in vision systems. Bare metal virtualization isolates safety and security events, making for lower cost and improved robustness in complex software deployments. Find out more in jscobie's blog New ARM Cortex-R52 enables autonomous systems with the highest functional safety standards



Bernhard Rill (OpenSynergy) The upcoming ARMv8-R architecture perfectly matches the current automotive trends Tuesday Oct 25th 2.30pm – 3.20pm, Ballroom E

The increasing number of functions in vehicles challenges the automotive industry to find solutions that allow the merging of several software systems on one ECU. OpenSynergy (bernhardrill) has answered this with a software architecture based on virtualization technology, ARM architecture and AUTOSAR.

This innovative approach provides:

  • software update,
  • supplementary features,
  • security,
  • mixed ASIL components,
  • reduced BOM costs


The ARMv8-R based architecture even provides hardware support for the real-time multi-AUTOSAR software architectures. It is therefore perfectly positioned to serve the current automotive trend to add functionality and enable overall integration.




Jon Taylor (ARM), Felix Baum (Mentor Graphics Corporation) Hard Real-time Virtualization: how hard can it be? Wednesday Oct 26th 8.30am – 9.20am Ballroom E

The ARMv8-R architecture offers effective virtualization while maintaining the hard real-time response needed to control applications in the industrial, automotive, medical, and military markets. Virtualization enables safety, security, and reliability and it can be the key to successful, cost-effective development and deployment of complex software applications. This session brings together engineers from ARM (jont)and Mentor Graphics to describe how these processors can be applied in next-generation, highly-assisted automotive driving systems. These safety-related applications are kept free from interference by the underlying isolation present in the new ARMv8-R processor architecture.



Rob Bates (Mentor) Embedded Device Software Where Safety Meets Security Wednesday Oct 26th 10.30am – 11.20am Ballroom F

Safety has been codified in several industry standards such as ISO 26262 for automotive and IEC 61508 for industrial where software has become a vital part of both the device and ensuring its safety. Security has now become critically important for device manufacturers and their suppliers, including those that supply COTS software.


Existing standards define the lifecycle leading to the creation of safety critical software, but do not say anything directly about security. Cybersecurity, however, is now an important consideration for manufacturers, governmental agencies, and the public at large. Fortunately, there is significant overlap between safety and security software development, and the practices underlying safe software development can be extended to security.


This session discusses the overlap between the two practices, and what to consider when fulfilling governmental and industry recommendations for cybersecurity over and above what is required for safety.



Jay Abraham (MathWorks) Reference workflow for meeting functional safety requirements in automotive systems Wednesday Oct 26th 11.30am – 12.20pm Ballroom H

The functionality and robustness of software is essential for automotive electronics that control powertrain, braking, steering, and driver assistance systems. The development of these systems utilize Model-Based Design and require compliance with ISO 26262 (standard for vehicle functional safety). Model-Based Design enables continuous verification of requirements, software design, and code.


This technical session will explain reference workflows for automotive applications to meet functional safety standards. We will explore various verification activities such as back-to-back equivalence testing to confirm that code compiled for target ARM processors match the software design from a numerical perspective while satisfying execution performance requirements.



Jay Thomas (LDRA) Save Time and Money with ISO 26262 Compliance for Automotive Software Wednesday Oct 26th 2.30pm – 3.20pm Ballroom E

Learn how to demonstrate compliance to the ISO 26262 functional safety standard to provide confidence to OEMs and suppliers. We will also show how the use of standards can help lower costs and development time by identifying and addressing defects during development rather than trying to correct them after deployment. Key to this approach is the use of automated test capabilities for comprehensive software quality assurance.


The presentation will also address the increasing demands for security in automotive software, using automated processes to develop and test high-quality code that identifies potential security vulnerabilities to be addressed early in the development process. The methodology provides a compliance roadmap to help manage the software planning, development, verification, and regulatory activities of ISO 26262 Part 6, Product Development: Software Level (ISO 26262-6).



Greg Davis (Green Hills Software) Designing Reliable Code using MISRA C/C++ Wednesday Oct 26th 3.30pm – 4.20pm Ballroom H

C and C++ are powerful, yet compact programming languages, but they permit programming practices that are not well suited for high reliability systems. MISRA C/C++ is a collection of rules that define a subset of the languages that is less error-prone and more suitable for critical systems, such as in avionics, medical systems, and defense.


This session will provide an introduction to MISRA C/C++, when it should be used, and when it should not. It will also provide an introduction to the most important rules of MISRA and how they help ensure a reliable system.




Shaun Purvis (Hardent) Keeping your software simple on today's complex SoCs Thursday Oct 27th 2.30pm - 5.30pm Great America J

For today's system-on-chips (SoCs), having a single, multi-core, high performance embedded processor isn't enough. We now see SoCs combining multiple types of processors, such as a ARM Cortex-A/R combination. These heterogeneous SoCs provide robust computing power, but the increased hardware complexity also complicates software. SoCs built with the latest ARM technology, however, provide additional features that help abstract these complexities from software.

This session will discuss some of these features in detail, and how to take advantage of them to simplify software.


Interconnects and open standards have been a hot topic lately.  A couple of weeks ago, ARM announced the CoreLink CMN-600 Coherent Mesh Network and CoreLink DMC-620 Dynamic Memory Controller IP, which support AMBA 5 CHI, the open standard for high performance coherent on-chip communication. Today, ARM contributed to press announcements from not one, but two new open multi-chip interconnect consortia; CCIX and GenZ.  Each consortium addresses the needs within the data center, which leads to the obvious question:


How do the three different open interconnect standards fit in the data center?


Fundamentally, they are complementary open standards that foster innovation, collaboration and ultimately enable new and emerging use-cases (such as video analytics, machine learning and search acceleration).  Before I dive into the standards and the problems they address, I’d like to highlight the three basic system categories within a data center (SoC, server node and rack) as each have very different interconnect properties and needs.



System on a Chip (SoC)

A multi-processor general purpose compute SoC is the first thing that comes to mind when thinking of data center. However, there are a number of other data center accelerator and IO SoCs, that are used for intelligent networking, storage or specialized off-load tasks and algorithms (ex: FPGAs, GPUs, DSPs).  An SoC interconnect provides connectivity for on-chip processors, accelerator, IO and memory elements.





Server Node

The server node is typically contained within a chassis or blade and will connect a small number of compute, accelerator and IO SoCs.  Today, these SoCs are connected with a simple multichip interconnect (typically PCIe) topology on a PCB motherboard that could also have small switches and expansion connectors for add-on cards.






Racks not only house a number of server chassis, but also have top-of-rack switches and a large amount of shared storage.  At the rack scale, the interconnect requires scale-out capabilities with complex topologies, which connect 1000’s of server nodes and storage elements.  Ethernet, Infiniband, Fibre Channel and RapidIO are examples of scale-out interconnects.






AMBA – the standard for on-chip communication

AMBA has now been around for over 20 years and has fostered a rich ecosystem built upon open protocol standards. These standards have enabled IP portability, creation and re-use between different design groups and different vendors.  AMBA 5 CHI Coherent Hub Interface protocol specification was announced in 2013 to enable high performance multi-processor heterogeneous SoCs. It has since been used in numerous server, networking and storage SoCs.  AMBA 5 CHI separated the coherency protocol from the transport which enabled free flowing, high frequencies data transfers over flexible on-chip network topologies, making it well suited for scalable data center SoCs.


The following image illustrates how the on-chip CoreLink CMN-600 interconnect can be used to create custom data center SoCs by connecting various compute, accelerator, IO and memory IP elements.



If you would like to find out more about CHI or other AMBA specifications, please visit the AMBA Protocol developer page.


CCIX - Cache Coherent Interconnect for Accelerators


While AMBA addresses the needs of on-chip communication, a multi-chip standard has much different problems to address ranging from electrical PHYs to mechanical connectors and cables to common software discovery and management.  As noted above, PCIe is the most prevalent server node interconnect and will continue to be widely used, but the lack of coherency is a major drawback.


There are a number of emerging or evolving accelerated use cases such as intelligent network/storage, deep/machine learning, image/video processing and search that are creating demand for more sharing, more bandwidth and lower latency between processors and accelerators.  Hardware cache coherency becomes critical to improving system performance by eliminating software overhead of copying data back and forth and DMA data transfers.  With cache coherency, processors and accelerators can simply make a memory request and the hardware takes care of the rest.


CCIX (pronounced “C6”) provides an open multi-chip coherency standard that allows processors from different vendors with different instruction set architectures and different protocols to extend their cache coherency to remote accelerators.  Now the free flowing, high frequency, AMBA 5 CHI transactions can be converted to CCIX and transferred over flexible multi-chip topologies. To solve the issues introduced with multi-chip connectivity, CCIX has selected PCIe as the first transport.  Leveraging PCIe will dramatically accelerate CCIX deployment and time to market, since it leverages a well-established ecosystem that has already solved the electrical, mechanical, switching and software problems.  It will also simplify the SoC design process by leveraging existing IP and by allowing dual-purpose pins/ports, which can be configured as CCIX or PCIe depending upon which system they are attached within.


For more information about CCIX go to


GenZ – A new approach to data access

Gen Z.png

While CCIX allows processors to extend their cache coherency to off-chip accelerators, GenZ is addressing the need for higher performance data accesses, with an interconnect based on memory operations that addresses both server node and rack scale.  Today, storage requires block based accesses with complex, code intensive software stacks.  Memory operations such as loads and stores allow processors to access both volatile (i.e. DRAM) and non-volatile storage in the same efficient manner.  Emerging Storage Class Memory (SCM) and rack-level disaggregated memory pools are example use-cases that benefit from a memory operation interconnect.


Storage Class Memory

There are a number of new, emerging non-volatile memory technologies that provide latencies much closer to traditional DDR than today’s SSD devices.  This allows server nodes to not only have a local, persistent memory pool, but also allows for much larger addressable memory per node at lower cost per byte than DDR.


Rack-level disaggregated pooled memory

Big Data analytics demands are not only increasing the amount of memory/storage, but also increasing the demand for real-time processing of larger data sets. Disaggregated memory brings a large pool of low latency, volatile and non-volatile memory to the rack scale. Disaggregated memory also significantly helps the TCO (total cost of ownership) for datacenters, as it allows for better dynamic utilization and allocation of these resources, based on the application demands.


For more information about GenZ go to

Meeting the challenges of new workloads through open standards

Open standards foster innovation, collaboration and ultimately provide businesses more flexibility, performance, efficiency and choice in their technology investments.  Hopefully I’ve been able to help answer the question about how these 3 different open interconnect standards complement each other within the data center.


  • AMBA – the standard for on-chip communication enabling IP portability, creation and re-use
  • CCIX – extends the benefits of cache coherency to the multi-chip server node for evolving acceleration and IO use cases
  • GenZ – enables a new data centric computing approach to big data problems with scalable memory pools and resources at both server node and rack level


In short you need all three to address the very complex world of data center architectures, especially as they evolve to meet the challenges of emerging and new workloads.


I’ll be discussing more about these technologies at my upcoming technology talk during ARM TechCon Oct 25-27, 2016.

TechCon dates.png


ARM TechCon 2016 is only a couple of weeks away and is shaping up to be the biggest show in the Valley with more presentations and technical sessions than you can shake a stick at!


In addition to the comprehensive conference program, ARM will be hosting two days of free sessions that will cover the latest developments in its technology.


If you’ve already registered for TechCon you may have already received a link to enable you to pick the technical sessions you’d like to attend. If you haven’t planned your diary yet, check out the info below and sign-up while spaces are available on Wednesday 26 October and Thursday 27 October .


If you haven’t already registered, what are you waiting for? You won’t want to miss it!!



Wednesday 26 October


Title: Developing software on the latest ARMv8-M processors for security solutions for IoT
Time: 10:30am - 11:20am, Location: Grand Ballroom D
Presenters: Joseph Yiu, Senior embedded technology manager, ARM

David Black, senior member of technical staff, Doulos


The session will provide:

  • A deep dive into the features of the ARMv8-M architecture
  • Explanations on how ARMv8-M based processors make it easier to create security solutions and how to write software to make the best use of capabilities offered.


Title: Panel: IoT and security - can next generation microcontrollers provide a security foundation for connected intelligent devices?
Time: 11:30am – 12:20pm Location: Grand Ballroom D
Presenters: Panel of industry experts hosted by Nandan Nayampally, vice president of marketing and strategy, ARM


The panel will discuss:

  • The challenges and solutions of creating systems that offer security protection
  • How security solutions can be included within MCU-based products.


Title: Tools and techniques for developing and debugging IoT security software on ARMv8-M based processors
Time: 2:30pm –3:20pm Location: Grand Ballroom D
Presenters: Stefano Cadario, Keil MDK product manager, ARM
Rolf Segger, founder, Segger


The session will explain:

  • How software developers can develop and debug software for ARMv8-M processors
  • Methodologies that are now enabled for more secure software development and debugging.


Title: Panel: making it easier to create secure IoT systems - a semiconductor industry perspective

Time: 3:30pm- 4:20pm Location: Grand Ballroom D
Presenters: Panel of industry experts hosted by Nandan Nayampally, vice president of marketing and strategy, ARM


The panel will:

  • Discuss the challenges and solutions of creating systems that offer security
  • Explore how security solutions can be included within MCU-based products


Title: Securing the Internet of Things - A guide for Non-security experts
Time: 4:30pm – 5:20pm Location: Grand Ballroom D
Presenters: Rob Coombs, director of security marketing, ARM
Phil Attfield, CEO, Sequitur Labs


The session will:

  • Discuss the range of tools and techniques used to help protect the devices and offer guidance on deciding the appropriate level of security required for a device
  • Cover topics such as communication security, isolation, storage and management of critical assets, and device lifecycle management.


Still haven't signed up yet? Have a look at Michele Riga's blog Learn more about IoT - free educational session on October 26th at ARM TechCon 2016 and book your seat before they're all taken!


Thursday 27 October


Title: Unlock the potential of Cortex-A systems with ARM’s next-generation System IP
Time: 10:30am – 11:20am Location: Grand Ballroom D
Presenter: Jeff Defilippi, senior product manager, ARM


This session will:

  • Discuss the benefits of ARM's 3rd generation coherent backplane IP, the CoreLink CMN-600 and DMC-620, and CCIX, the new open standard for multi-chip coherency
  • Explore how the new IP can be used to optimize designs from a 2-Watt access point to a 100-Watt server.


Title: Looking ahead: BLE and 802.15.4 IoT low power standards
Time: 11:30am – 12:20pm Location: Grand Ballroom D
Presenter: Robert Cragie, senior principal engineer architect, ARM


This session will:

  • Review the main features between 802.15.4 and Bluetooth low energy
  • Review how they are evolving and the resulting requirements for efficient and intelligent software to be able to seamlessly switch between the two standards.


Title: Secure device management with mbed and the IoT Cloud services
Time: 1:30pm – 2:20pm Location: Grand Ballroom D
Presenters: Alon Shamir, director of cloud services, IoT business, ARM


The presentation will:

  • Explain how ARM mbed is addressing the IoT’s diverse array of connected devices
  • Teach about how mbed can drastically reduce operation cost and risk, allowing enterprises to scale further, faster.


Title: Introducing ARM mbed OS 5
Time: 2:30pm-3:20pm Location: Grand Ballroom D
Presenter: Simon Ford, senior director of marketing, IoT business, ARM


The presentation will:

  • Provide an update on the latest mbed OS 5 release, a platform OS designed for Cortex-M microcontrollers, including updates on the new RTOS kernel, connectivity standards and integrated security and services of the platform.
  • Share a view of the diversity of hardware supported and what it takes to bring new platforms to mbed OS.



At the Linley Processor Conference earlier this week, I had the opportunity to present the challenges facing architects who are building hardware for distributed cloud intelligence. I also

discussed how you can address these challenges with ARM’s 3rd generation coherent backplane IP; the ARM CoreLink CMN-600 and ARM CoreLink DMC-620. The new on-chip network and memory controller IP has been optimized to boost SoC performance across a broad range of applications and markets including; networking, server, storage, HPC, automotive and industrial.


The need for an intelligent flexible cloud

Not only are we seeing a significant growth in the number of connected devices, but we are also seeing evolving use cases. Virtual reality is hitting the mainstream price points requiring a constant high bandwidth stream of content. Autonomous vehicles are catching a lot of buzz, but we probably will not see truly autonomous vehicles on our streets until ultra-low latency car-to-car communication is deployed.  These new use cases will require an intelligent flexible cloud where the applications and services are pushed to the edge of the network.


Blending compute and acceleration from edge to cloud

A new approach will be required to meet the demands of these evolving use-cases.  Today system architects are trying to figure out how to maximize efficiency with heterogeneous computing and acceleration (ex: GPU, DSP, FPGA), to optimize systems across a wide range of power and space constraints.  During the presentation, I showed three different example design points, each with different needs and constraints.  The data center maximizing compute density for a wide variety of workloads, the edge cloud to provide distributed services and the small access point to keep all the end points connected at all times.



New high performance, scalable architecture

These three heterogeneous design points illustrate the targets we set out to address with our 3rd generation coherent backplane IP architecture. Our goal was to maximize compute performance and throughput (a measure of both bandwidth and number of transactions), across a broad range of power and area constraints.


The result is our new CoreLink CMN-600 Coherent Mesh Network and CoreLink DMC-620 Dynamic Memory Controller.  Together they have been optimized to provide a fast, reliable on-chip connectivity and memory subsystem for heterogeneous SoCs that blend ARMv8-A processors, accelerators and IO.


Some of the key new capabilities and performance metrics include:

  • New scalable mesh network that can be tailored for SoCs from 1 to 32 clusters (up to 128 processors)
  • 5x higher throughput than the prior generation and capable of more than 1TeraByte/s of sustained bandwidth
  • Higher frequencies (exceeding 2.5 GHz) and 50 percent lower latency
  • New Agile System Cache with intelligent cache allocation to enhance sharing of data between processors, accelerators and IO
  • Supporting CCIX, the open industry standard for coherent multi-chip processor and accelerator connectivity
  • 1 to 8 channels of DDR4-3200 memory and 3D stacked DRAM for up to 1TeraByte of addressable memory per channel
  • End-to-end QoS and RAS (Reliability, Availability and Serviceability) supported by the combined CMN-600 and DMC-620 solution
  • In-built security with integrated ARM TrustZone Address Space memory protection
  • Automated SoC creation with ARM CoreLink Creator and Socrates DE tooling


The following image illustrates how the technology could be used to build a small access point, focused on throughput with efficiency up to the data center, focused on maximizing compute density.



We are really excited to see the continued evolution of these new intelligent, distributed use-cases and we are excited to see how SoC architects will deploy our new technology. Stay tuned as we’ll be continuing to discuss more about the capabilities in the coming months.


If you would like to find out more about the IP, please check out our developer pages below or attend my upcoming technical talk at ARM TechCon, Oct 25-27 2016 in Santa Clara, CA.



ARM processor

Posted by zoezz Sep 22, 2016

An ARM processor is one of a family of CPUs based on the RISC (reduced instruction set computer) architecture developed by Advanced RISC Machines (ARM).



ARM makes 32-bit and 64-bit RISC multi-core processors. RISC processors are designed to perform a smaller number of types of computer instructions so that they can operate at a higher speed, performing more millions of instructions per second (MIPS).  By stripping out unneeded instructions and optimizing pathways, RISC processors provide outstanding performance at a fraction of the power demand of CISC (complex instruction set computing) devices.



ARM processors are extensively used in consumer electronic devices such as smartphones, tablets, multimedia players and other mobile devices, such as wearables. Because of their reduced instruction set, they require fewer transistors, which enables a smaller die size for the integrated circuitry (IC). The ARM processors’ smaller size, reduced complexity and lower power consumption make them suitable for increasingly miniaturized devices.



ARM processor features include:



Load/store architecture.

An orthogonal instruction set.

Mostly single-cycle execution.

Enhanced power-saving design.

64 and 32-bit execution states for scalable high performance.

Hardware virtualization support.

The simplified design of ARM processors enables more efficient multi-core processing and easier coding for developers. While they don't have the same raw compute throughput as the products of x86 market leader Intel, ARM processors sometimes exceed the performance of Intel processors for applications that exist on both architectures.



The head-to-head competition between the vendors is increasing as ARM is finding its way into full size notebooks.  Microsoft, for example, offers ARM-based versions of Surface computers. The cleaner code base of Windows RT versus x86 versions may also be partially responsible -- Windows RT is more streamlined because it doesn’t have to support a range of legacy hardware.



ARM is also moving into the server market,  a move that represents a large change in direction and a hedging of bets on performance-per-watt over raw compute power. AMD offers 8-core versions of ARM processors for its Opteron series of processors. ARM servers represent an important shift in server-based computing. A traditional x86-class server with 12, 16, 24 or more cores increases performance by scaling up the speed and sophistication of each processor, using brute force speed and power to handle demanding computing workloads.



In comparison, an ARM server uses perhaps hundreds of smaller, less sophisticated, low-power processors that share processing tasks among that large number instead of just a few higher-capacity processors. This approach is sometimes referred to as “scaling out,” in contrast with the “scaling up” of x86-based servers.



The ARM architecture was originally developed by Acorn Computers in the 1980s.

Across multiple markets, electronic systems are becoming more complex - including automotive, industrial control and healthcare. Vehicles are beginning to drive themselves, industrial robots are becoming increasingly collaborative, and medical systems are automated to assist with surgery or deliver medication. More of these systems are demanding functionally safe operation and requiring that functional safety be provided at a higher safety level than previous generations of systems demanded. The new ARM® Cortex®-R52 processor has been introduced to address the challenging needs of these types of system.

cortex r-52.png

This rise in complexity can be demonstrated in vehicles, where the car compute is expected to rise by 100 times by 2020. For example, engine management systems continue to increase in complexity to meet ever more stringent emission controls and must safely control the engine to prevent damage or hazards like unintended acceleration.  Vehicle electrification requires control of very powerful motors and sophisticated management of batteries with a huge amount of stored energy – the large 90kWh lithium ion battery pack in a Tesla contains the equivalent amount of energy as 77kg of TNT explosive - so the consequences of errors are significant. On the industrial side, factory automation is increasing with autonomous robotics using machine learning and vision systems to enable them to work more flexibly and with less direct control.


Outside the factory, robotics will be used in environments too harsh for humans, such as the nuclear industry, where there is a need to maintain precise and assured operation. They can also be used in the medical operating theaters with remote surgery. In both areas functionally safe operation is critical.


Functional safety

It’s obvious that a car’s brakes need to work exactly when required in order to drive safely. Systems such as these require functional safety. Hazards or errors may occur however; hence a functionally safe system must be capable of detecting these to avoid unsafe situations.


A functionally safe system has to be protected against two types of errors: random or systematic.

Kite Safety 101.PNG

The impact of random errors, for example a memory bit flipping due to radiation, can be protected against through the inclusion of features in the processor. Cortex-R52 integrates the highest level of safety features of any ARM processor to guard against this type of error.

Kite Safety Summary.PNG


Systematic errors on the other hand are typically as a result of software or design errors. Protection against these is provided by the use of appropriate processes and procedures at design. Cortex-R52 has been developed from the ground up within a robust process to help protect it from these systematic issues. A comprehensive safety pack is available to SoC partners which simplifies and reduces the effort needed in certifying the end system.


There are a number of different standards and guidelines related to functional safety. As an example, ISO 26262 was developed for the automotive industry in which four Automotive Safety Integrity Levels (ASIL) are defined, of which D is the highest level.


You can read more about functional safety in The Functional Safety Imperative in Automotive Design whitepaper .

The rise of autonomous systems

There is a range of different applications where functional safety and fast deterministic execution are necessary. In many real time control systems the application can be managed either with a single Cortex-R52 processor or across multiple homogeneous processors. This might be typical in conventional control systems like an automotive engine management system or industrial controller.


As mentioned, more and more systems are moving towards autonomous behaviour.  We can divide the functions found in an autonomous system in to a set of stages: sense, perceive, decide, actuate.



  • Sense: a broad range of sensors are used to gather raw information
  • Perceive: data from the sensors is used along with complex algorithms such as machine learning to interpret more about the environment in which the system is operating
  • Decide: the outputs from the various systems are gathered and a decision made
  • Actuate: the decision is carried out or communicated


ARM enables all aspects of these autonomous systems with processors from across the Cortex-A, Cortex-R and Cortex-M families being used according to the need of each stage. The decide and actuate stages must be functionally safe. As an example, the decision stage can take inputs from the navigation system, speed sensors and all of the vision and radar systems and decide when to change lane or to get ready to exit the highway.


Automotive is a prime example of the move to autonomous systems.  We are already seeing driver assistance systems such as lane detection, where the driver is notified, moving to lane keeping where action is taken. Vehicles are introducing functionality on the way to autonomy, such as automatic lane changing, that previously only experimental vehicles had supported.


The trend is also being seen in other areas. Conventional robotic production lines, where robots carry out a defined fixed task and are segregated from operators, are being replaced by collaborative industrial robots. These have unconstrained interaction with human operators, sensing their environment and taking action safely.  They may be capable of selecting and placing the correct component while working in conjunction with a human operator on the same assembly and avoiding a hazardous conflict. Surgical robots are also increasingly being used to help provide improved patient outcomes and future commercial autonomous drones are expected to be in need of these characteristics.

Autonomous system.PNG

As with the previous real time control system there is a need to take inputs from sensors, decide what to do and then command action.


These autonomous systems need to apply another level of judgement by interpreting more about the environment in which they are operating. These tasks can be confidence based and require high levels of throughput to process large amounts of data. Such operations are well suited to Cortex-A class of processors.


These systems still need to be functionally safe with deterministic execution. When combined together in a heterogeneous processor, the Cortex-R52 can provide a safety island protecting the operation of the system.


In the case of an ADAS system, inputs can be gathered from sensors such as cameras, Radar and Lidar. This data is processed and combined by the Cortex-A processors to identify and classify targets.  This information can be passed to the Cortex-R52 to decide what action to take and perform the necessary checks on the operation to ensure safe operation.


Increasing software complexity

As the functionality of a system has evolved, the complexity of both hardware and software has also increased. Systems are now integrating more software from multiple sources and with multiple safety criticality needs. This is a complex integration challenge.


Safety critical software needs to be validated and certified; a time consuming and complex exercise. Because of the interaction between the software, the entire software stack would typically be safety certified, even if only a small proportion is safety critical. The more complex the system, the harder this becomes.

Kite SW complexity.PNG

A better solution would be the ability to guarantee the independence of safety critical code.  This would simplify the development and integration of functional safety software,  with clear separation between different levels of software criticality. Safety code, critical safety code and non-safety code can each be validated and certified to their required level. Providing this independence means that changes to one module do not require wholesale re-certification of all of the software, thus saving time and effort.


For many of these systems it is important to remember that this separation must be achieved whilst still maintaining deterministic execution.


Cortex-R52 is unique in providing the hardware to support both isolation and real-time execution, and this is achieved through the addition of a new exception level and 2-stage MPU, introduced in the ARMv8-R architecture. This can be used by monitor or hypervisor software to manage access to resources and create sandboxes to protect each task. The design of the Cortex-R52 allows for fast switching between protected applications and maintains deterministic execution.


At the same time as offering protection of software it also simplifies the integration of code together into a single processor. Through the use of a hypervisor, multiple operating systems can be supported more easily, thus enabling consolidation of applications.


Delivering real time performance

Many of these systems I described above require deterministic operation, with the appropriate action being not only controlled but also performed at the right time and without significant delay, regardless of what else is happening in the system.


The Cortex-R family offers real-time processors with high-performance for embedded systems. Cortex-R52 is the first processor in the ARMv8-R architecture and further extends the capabilities of the Cortex-R5, both in terms of functional safety and increased performance.


Cortex-R52 delivers up to 35% higher single core performance over Cortex-R5, when running standard benchmarks. EEMBC has independently certified and published the results of their Automotive Industrial benchmark confirming the processor’s increased capability. Results were achieved using the Green Hills Compiler 2017.


This benchmark performance increase is enhanced by additional real time performance gains. Through fast access and integration of the interrupt controller within the cluster, interrupt latency has been reduced to half that of the Cortex-R5. The improved Memory Protection Unit, with finer granularity and faster reconfiguration, significantly reduces context switching time, to 14 times faster than the Cortex-R5. Compared to the Cortex-R5, system performance is further increased as twice as many Cortex-R52s can be integrated within a cluster.


Cortex-R52 supports an adaptable memory architecture with deterministic Tightly Coupled Memories integrated within the processor. These enable assured memory latencies and they can be flexibly allocated to Instruction or Data and configured in a range of sizes to meet the application needs. The processor supports a rich set of interface ports around which the system can be built. Interfaces include a Low Latency Peripheral Port, AXI interfaces and a dedicated wide Flash memory interface to provide access to resources with managed arbitration.


Leveraging the power of ARM

The adoption of Cortex-R52 comes with a lot more than just the processor. The ARM architecture has amassed a broad following of adopters and developers within its ecosystem. With silicon partners delivering hardware to the market, it’s the number one architecture with, at the time of writing, more than 86 billion chips shipped.


Ecosystem partners provide the widest choice of software packages, drivers, stacks, operating systems and tools - simplifying development for users. Adopters of the Cortex-R52 can leverage this common architecture to reduce costs through availability of multiple suppliers capable of addressing their requirements with the architecture. They can develop on a single platform and implement heterogeneous systems and port solutions between different platforms faster and with more reliable results. For more information check out ARM's software development tools for ARM Cortex-R.

ARM EcoSystem.PNG

Cortex-R52 addresses increased sophistication in safety applications

A high level of deterministic functional safety is needed in automotive, industrial, aerospace and medical markets (amongst others) where there is the need to devolve more autonomy in electronic systems. The Cortex-R52 processor has been designed to address the trend of increasing sophistication in safety applications which are driving a need for higher levels of performance, greater support for functional safety and an improved approach to software separation.

In concert with ARM's rollout today of the new ARM Cortex-R52, the first ARMv8-R processor, Synopsys also announced broad design solution support to enable the design of safety-critical systems for automotive, industrial and healthcare applications with this new processor.


jscobie wrote a good blog that explains how the new ARM processor supports development of safety-critical applications: New ARM Cortex-R52 enables autonomous systems with the highest functional safety standards


Designers can start designing Cortex-R52 designs today using Synopsys solutions, including:

In addition to following the links above to Synopsys solutions for Cortex-R52, you can learn more about Synopsys' automotive IC design and software development solutions, which are enabling safe, secure, smarter cars -- from silicon to software at

The Cortex M7 has twice the DSP power of the M4 by executing twice as many instructions simultaneously, and it also helps that the M7 can operate at a higher clock frequency than the M4. It’s backed by the Keil CMSIS DSP library and includes a single and double precision FPU.




It was developed to provide a low-cost platform that meets the needs of MCU implementation, with a reduced pin count and low-power consumption, while delivering outstanding computational performance and low interrupt latency. You can also use two M7 cores in lock step running the same code – one following two cycles behind the other – so that glitches can be detected by external electronics if the two CPUs suddenly behave slightly differently.

Setting up Keil for Your First LED Blinking Program on STM32F7 Discovery Board – KGP Talkie


The STM32F745xx and STM32F746xx devices are based on the high-performance ARM®Cortex®-M7 32-bit RISC core operating at up to 216 MHz frequency. The Cortex®-M7 core features a single floating point unit (SFPU) precision which supports all ARM®single-precision data-processing instructions and data types. It also implements a full set of DSP instructions and a memory protection unit (MPU) which enhances the application security.

If you have been tracking ARM in servers and networking news closely, you will know it has been a busy summer. Most recently at Hot Chips, ARM Fellow and Lead ISA Architect Nigel Stephens disclosed details on our ARMv8-A SVE technology. While Nigel’s technology disclosure was primarily targeted to the HPC community, our next disclosure will have a much broader impact for the ARM server and networking ecosystems.


At the upcoming Linley Processor Conference, ARM Senior Product Manager Jeff Defilippi will introduce the next-generation of ARM coherent backplane IP designed to boost SoC performance in systems based on the ARMv8-A architecture from the edge of the network and into the Cloud. See below for the full description of Jeff's session:




















If you are a member of the press and industry analyst community and would like more information ahead of the conference, please contact


Where: Linley Processor Conference 2016 at the Hyatt Regency, Santa Clara, CA

When: Sept. 27, 1:50 p.m. (Session 5 on the day one agenda entitled SoC Connectivity)


What's next for headsets?

Posted by lorenser Sep 12, 2016

The cat is out of the bag. There has been a lot of speculation around Apple’s plans to remove the headset jack for the iPhone 7. The recent announcement confirming this will now lead to innovation and new opportunities in the headset market. This will be driven by user’s demand for longer listening and talk time for battery-powered headsets and will require scalable platforms to add new features.


Next generation headsets demand scalable solutions


Audio algorithms and codecs cover both encoding and decoding of audio streams, which usually happens in stages. These stages range from MAC intensive modules, such as filters, to modules where control code is dominant. Hence each of these modules has specific system requirements if they are to be efficiently processed.


While the main use case of headsets is audio processing, the human ear is a great source for body diagnostics, too. For example, dedicated sensors in the ear channel could be used to measure heart rate. Adding more sensors into these Bluetooth enabled devices will demand scalable platforms and drive the requirement for even more energy-efficient SoCs.


The ability to process sensor data, control and DSP code in a power and area optimized processor will be essential to enable innovation and consumer excitement. ARM®’s Cortex®-M processors are well positioned to enable scalable platforms to meet current and future requirements. Their ease of use and simple programmer’s model combined with the binary compatibility across the Cortex-M portfolio allow for scalable and future proof systems.


Low-power ARM IP for headset platforms


Cortex-M4 is ARM’s mainstream Digital Signal Controller and meets the high-performance requirements needed in these battery-powered devices. The highly efficient processing of control code and sensor data is well known in Cortex-M. However, one of the key features of Cortex-M4 is the addition of DSP extensions into the instruction set. This has a number of advantages:


  1. cost savings  - as it enables the integration of a single core instead of two cores
  2. reduced system complexity - by removing the need for shared memory and reducing software development costs


Hence Cortex-M4 is extensively used in audio applications including keyword spotting for voice-activated devices, audio encoding and decoding for phone calls or music playback. It is supported by a rich set of  voice and audio codecs that have been ported to Cortex-M4 including codecs from Adaptive Digital, Alango Technologies, Fraunhofer IIS, Ittiam and Picustech Software.


To make development of wireless systems even easier, the Cortex-M4 is a great combination with ARM’s sub-1V Cordio® radio IP for Bluetooth low-energy applications.


Watch out for my next blog about more information on the signal processing capabilities of Cortex-M4 and Cortex-M7.


See also: Could removing the headphone jack mark the start of the Bluetooth low energy audio accessories market?



Today at Hot Chips in Cupertino, I had the opportunity to present the latest update to our ARMv8-A architecture, known as the Scalable Vector Extension or SVE. Before going into the technical details, key points about ARMv8-A SVE are:


  • ARM is significantly extending the vector processing capabilities associated with AArch64 (64-bit) execution in the ARM architecture, now and into the future, enabling implementation choices for vector lengths that scale from 128 to 2048 bits.

  • High Performance Scientific Compute provides an excellent focus for the introduction of this technology and its associated ecosystem development.

  • SVE features will enable advanced vectorizing compilers to extract more fine-grain parallelism from existing code and so reduce software deployment effort.


I’ll first provide some historical context. ARMv7 Advanced SIMD (aka the ARM NEON instructions) is ~12 years old, a technology originally intended to accelerate media processing tasks on the main processor. It operated on well-conditioned data in memory with fixed-point and single-precision floating-point elements in sixteen 128-bit vector registers.  With the move to AArch64, NEON gained full IEEE double-precision float, 64-bit integer operations, and grew the register file to thirty-two 128-bit vector registers. These evolutionary changes made NEON a better compiler target for general-purpose compute.  SVE is a complementary extension that does not replace NEON, and was developed specifically for vectorization of HPC scientific workloads.


Immense amounts of data are being collected today in areas such as meteorology, geology, astronomy, quantum physics, fluid dynamics, and pharmaceutical research.  Exascale computing (the execution of a billion billion floating point operations, or exaFLOPs, per second) is the target that many HPC systems aspire to over the next 5-10 years. In addition, advances in data analytics and areas such as computer vision and machine learning are already increasing the demands for increased parallelization of program execution today and into the future.


Over the years, considerable research has gone into determining how best to extract more data level parallelism from general-purpose programming languages such as C, C++ and Fortran. This has resulted in the inclusion of vectorization features such as gather load & scatter store, per-lane predication, and of course longer vectors.


A key choice to make is the most appropriate vector length, where many factors may influence the decision:


  • Current implementation technology and associated power, performance and area tradeoffs.

  • The specific application program characteristics.

  • The market, which is HPC today; in common with general trends in computer architecture evolution, a growing need for longer vectors is expected in other markets in the future.


Rather than specifying a specific vector length, SVE allows CPU designers to choose the most appropriate vector length for their application and market, from 128 bits up to 2048 bits per vector register.  SVE also supports a vector-length agnostic (VLA) programming model that can adapt to the available vector length.  Adoption of the VLA paradigm allows you to compile or hand-code your program for SVE once, and then run it at different implementation performance points, while avoiding the need to recompile or rewrite it when longer vectors appear in the future.  This reduces deployment costs over the lifetime of the architecture; a program just works and executes wider and faster.


Scientific workloads, mentioned earlier, have traditionally been carefully written to exploit as much data-level parallelism as possible with careful use of OpenMP pragmas and other source code annotations.  It’s therefore relatively straightforward for a compiler to vectorize such code and make good use of a wider vector unit. Supercomputers are also built with the wide, high-bandwidth memory systems necessary to feed a longer vector unit.


However, while HPC is a natural fit for SVE’s longer vectors, it offers an opportunity to improve vectorizing compilers that will be of general benefit over the longer term as other systems scale to support increased data level parallelism.


It is worth noting at this point that Amdahl’s law tells us the theoretical limit of a task’s speedup is governed by the amount of unparallelizable code. If you succeed in vectorizing 10% of your execution and make that code run 4 times faster (e.g. a 256-bit vector allows 4x64b parallel operations), then you've reduced 1000 cycles down to 925 cycles, providing a limited speedup for the power and area cost of the extra gates. Even if you could vectorize 50% of your execution infinitely (unlikely!) you've still only doubled the overall performance. You need to be able to vectorize much more of your program to realize the potential gains from longer vectors.


So SVE also introduces novel features that begin to tackle some of the barriers to compiler vectorization. The general philosophy of SVE is to make it easier for a compiler to opportunistically vectorize code where it would not normally be possible or cost effective to do so.


What are the new features and the benefits of SVE compared to NEON?


Scalable vector length (VL) — Increased parallelism while allowing implementation choice of VL
VL agnostic (VLA) programming — Supports a programming paradigm of write-once, run-anywhere scalable vector code
Gather-load & Scatter-store — Enables vectorization of complex data structures with non-linear access patterns
Per-lane predication — Enables vectorization of complex, nested control code containing side effects and avoidance of loop heads and tails (particularly for VLA)
Predicate-driven loop control and management — Reduces vectorization overhead relative to scalar code
Vector partitioning and SW managed speculation — Permits vectorization of uncounted loops with data-dependent exits
Extended integer and floating-point horizontal reductions — Allows vectorization of more types of reducible loop-carried dependencies
Scalarized intra-vector sub-loops — Supports vectorization of loops containing complex loop-carried dependencies


SVE is targeted at the A64 instruction set only, as a performance enhancement associated with 64-bit computing (known as AArch64 execution in the ARM architecture). A64 is a fixed-length instruction set, where all instructions are encoded in 32 bits. Currently 75% of the A64 encoding space is already allocated, making it a precious resource.  SVE occupies just a quarter of the remaining 25%, in other words one sixteenth of the A64 encoding space, as follows:


The variable length aspect of SVE is managed through predication, meaning that it does not require any encoding space. Care was taken with respect to predicated execution to constrain that aspect of the encoding space.  Load and store instructions are assigned half of the allocated SVE instruction space, limited by careful consideration of addressing modes. Nearly a quarter of this space remains unallocated and available for future expansion.


In summary, SVE opens a new chapter for the ARM architecture in terms of the scale and opportunity for increasing levels of vector processing on ARM processor cores. It is early days for SVE tools and software, and it will take time for SVE compilers and the rest of the SVE software ecosystem to mature. HPC is the current focus and catalyst for this compiler work, and creates development momentum in areas such as Linux distributions and optimized libraries for SVE, as well as in ARM and third party tools and software.


We are already engaging with key members of the ARM partnership, and will now broaden that engagement across the open-source community and wider ARM ecosystem to support development of SVE and the HPC market, enabling a path to efficient Exascale computing.


Stay tuned for more information


Following on from the announcement and the details provided, initial engagement with the open-source community will start with the upstreaming and review of tools support and associated standards.  General specification availability is expected in late 2016/early 2017.


Nigel Stephens is Lead ISA Architect and ARM Fellow

Today we have exciting news: ARM and Intel Custom Foundry have announced an agreement to accelerate the development and implementation of ARM SoCs on Intel’s 10nm process. Specifically, we are making ARM’s Artisan® Physical IP available on the process as part of an ongoing collaboration.


I’m excited about our collaboration with Intel Custom Foundry for several reasons including:

  • The benefits to our partners by expanding the ARM ecosystem to offer more manufacturing choices for premium mobile and consumer SoCs.
  • Intel Custom Foundry will give its customers access to world-class physical IP and ARM implementation solutions.
  • All the major foundries now offer Artisan platforms, further confirming it as the industry standard for physical IP.


Today’s announcement represents what we expect to be a long-term, mutually beneficial partnership with Intel Custom Foundry.


One of the strengths and differentiators of the Artisan platform is the availability of ARM core-optimized IP—what we call ARM POP™ technology. The value of POP technology for an ARM core on the Intel 10nm process is tremendous, as it will allow for quicker knowledge transfer, enabling customers to lower their risk in implementing the most advanced ARM cores on Intel’s leading-edge process technology. Additionally, POP technology enables silicon partners to accelerate the implementation and tape-outs of their ARM-based designs. The initial POP IP will be for two future advanced ARM Cortex-A processor cores designed for mobile computing applications in either ARM big.LITTLE™ or stand-alone configurations.


Today at the Intel Developer Forum (IDF), I had the pleasure of joining Intel Senior Fellow, Mark Bohr and Intel Custom Foundry Vice President Zane Ball’s Technical Insights session to announce our collaboration.  We discussed how the partnership will accelerate design enablement for future devices in the premium mobile market including smartphones and tablets. Read more about Zane’s perspective on our collaboration.


Ecosystem enablement

You probably glanced at the headline and thought “ARM and Intel collaborating…what?” Despite press stories, Intel and ARM have worked together for years to help enable the ecosystem, and this is just the latest milestone in that long-standing relationship. I see it as a natural evolution of the design ecosystem: ARM is a leader in processor and physical design, and  Intel Custom Foundry is a leading integrated device manufacturer. This combination is a win-win for customers.  It reinforces an ARM tenet throughout our 25-year history: To continuously enable choice and innovation inside the ARM ecosystem.


This agreement provides access to another key manufacturing source and expands the EDA and IP ecosystem to ensure interoperability and a shorter on-ramp for early leading-edge process technology.


I’ve enjoyed broad experience in this industry, working in semiconductors, EDA and now IP. I love the relentless competition but I also am wowed by moments of cooperation that redefine the industry landscape. This agreement is one example of that and will deliver immense value to the design ecosystem and ultimately to our partners. ARM is committed to Intel’s success as a world-class custom foundry at 10nm. We stand behind our mutual customers when they make that choice.


Let me know your thoughts in the comments section below!


Related stories:

Power management is important, and has become increasingly complex. Recently, we have created an application note. See details below. Hopefully, you will find it useful.


  • Provides high-level considerations for power management of a big.LITTLE system and helps you avoid some potential issues in your big.LITTLE design.

Intended Audience

  • It is written for hardware System on Chip (SoC) designers implementing power-down and power-up sequences for ARM processors.
  • It assumes that you have SoC design experience and are familiar with ARM products.


This application note focuses on the following processors and highlights important issues when powering up or powering down processor cores and clusters on an SoC.

  • Cortex®-A7.
  • Cortex®-A15.
  • Cortex®-A17.
  • Cortex®-A53.
  • Cortex®-A57.
  • Cortex®-A72.
  • Cortex®-A73.


This application note is organized into the following chapters:

  • Chapter 1 Introduction

Read this chapter for information about the purpose of the application note.

  • Chapter 2 Power-down and power-up considerations

Read this chapter for high-level considerations for power-down and power-up.

  • Chapter 3 Potential SoC integration issues

Read this chapter for potential issues when implementing power management for processor cores or clusters on a typical SoC.

  • Chapter 4 Hardware considerations

Read this chapter for general advice from the hardware perspective when implementing power-down and power-up sequences for big.LITTLE systems.

Your feedback

If you have any feedback about this document, please feel free to contact me. My email address is

See the attachment for details about the application note. Thanks.

Filter Blog

By date:
By tag:

More Like This