This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question, you can start a new discussion.

ARMCC: Bitwise Vs half word access

I was under the impression that the compiler would generate the same code for read and write operations on the members of the following two structures:

/*
 * Two 16-bit-wide bit-fields, both declared with type uint32_t.
 * NOTE(review): how the bit-fields are placed within their storage unit is
 * implementation-defined, so which half `a` occupies depends on the
 * compiler/ABI - confirm against the target's ABI documentation.
 */
typedef struct S32data_t {
	/* first bit-field, 16 bits wide */
	uint32_t 	a:16;
	/* second bit-field, 16 bits wide */
	uint32_t	b:16;
} S32data_t;

/*
 * Two ordinary uint16_t members (no bit-fields); each member is a separate
 * 16-bit object.
 */
typedef struct S16data_t {
	uint16_t 	a;
	uint16_t	b;
} S16data_t;

The read and write operations that I'm performing are:

/* Global instances of the two structure layouts being compared. */
S32data_t test32data;
S16data_t test16data;

/* 32-bit values written from / read into by WriteS32() and ReadS32(). */
uint32_t xS32;
uint32_t yS32;

/*
 * 16-bit counterparts. They are not referenced by the functions shown in
 * this post; presumably they are used by the S16 variants - verify.
 */
uint16_t xS16;
uint16_t yS16;

/*
 * Stores the globals xS32 and yS32 into the bit-fields test32data.a and
 * test32data.b. The fields are 16 bits wide, so only the low 16 bits of
 * each uint32_t value are kept by the assignment.
 */
void WriteS32(void)
{
	test32data.a = xS32;
	test32data.b = yS32;
}

/*
 * Copies the bit-fields test32data.a and test32data.b into the globals
 * xS32 and yS32.
 *
 * Returns the value just stored in xS32, implicitly converted from
 * uint32_t to int.
 */
int ReadS32(void)
{
	xS32 = test32data.a;
	yS32 = test32data.b;

	return xS32;
}

The corresponding assembly file generated by the compiler is as below:

$ armcc.exe --help
Product: ARM Compiler 5.06
Component: ARM Compiler 5.06 update 6 (build 750)
Tool: armcc [4d3637]

$ armcc.exe -O3 --c99 -c test.c -o test.o;
$ fromelf.exe -c test.o -o test.s

    WriteS32
        0x00000000:    e59f00c0    ....    LDR      r0,[pc,#192] ; [0xc8] = 0
        0x00000004:    e1d020bc    . ..    LDRH     r2,[r0,#0xc]
        0x00000008:    e2801004    ....    ADD      r1,r0,#4
        0x0000000c:    e1c120b0    . ..    STRH     r2,[r1,#0]
        0x00000010:    e1d001b0    ....    LDRH     r0,[r0,#0x10]
        0x00000014:    e1c100b2    ....    STRH     r0,[r1,#2]
        0x00000018:    e12fff1e    ../.    BX       lr
    ReadS32
        0x0000001c:    e59f00a8    ....    LDR      r0,[pc,#168] ; [0xcc] = 0x4
        0x00000020:    e59f20a0    . ..    LDR      r2,[pc,#160] ; [0xc8] = 0
        0x00000024:    e5901000    ....    LDR      r1,[r0,#0]
        0x00000028:    e282200c    . ..    ADD      r2,r2,#0xc
        0x0000002c:    e1a00801    ....    LSL      r0,r1,#16
        0x00000030:    e1a00820     ...    LSR      r0,r0,#16
        0x00000034:    e1a01821    !...    LSR      r1,r1,#16
        0x00000038:    e8820003    ....    STM      r2,{r0,r1}
        0x0000003c:    e12fff1e    ../.    BX       lr
    WriteS16
        0x00000040:    e59f0080    ....    LDR      r0,[pc,#128] ; [0xc8] = 0
        0x00000044:    e1d020b0    . ..    LDRH     r2,[r0,#0]
        0x00000048:    e2801008    ....    ADD      r1,r0,#8
        0x0000004c:    e1c120b0    . ..    STRH     r2,[r1,#0]
        0x00000050:    e1d000b2    ....    LDRH     r0,[r0,#2]
        0x00000054:    e1c100b2    ....    STRH     r0,[r1,#2]
        0x00000058:    e12fff1e    ../.    BX       lr
    ReadS16
        0x0000005c:    e59f106c    l...    LDR      r1,[pc,#108] ; [0xd0] = 0x8
        0x00000060:    e1d100b0    ....    LDRH     r0,[r1,#0]
        0x00000064:    e2412008    . A.    SUB      r2,r1,#8
        0x00000068:    e1c200b0    ....    STRH     r0,[r2,#0]
        0x0000006c:    e1d110b2    ....    LDRH     r1,[r1,#2]
        0x00000070:    e1c210b2    ....    STRH     r1,[r2,#2]
        0x00000074:    e12fff1e    ../.    BX       lr

If you look closely, you'll see that the compiler has generated exactly the same code for the Write operations but quite different code for the Read operations. I tried the same experiment with gcc on an x86_64 Linux machine, and there I see that the compiler generated exactly the same code for both the Write and Read operations. Snippet below:

$ uname -m
x86_64

$ gcc --version
gcc (Ubuntu 5.4.0-6ubuntu1~16.04.9) 5.4.0 20160609

$ gcc -c -O3 -g test.c -o test.o
$ objdump -d test.o

0000000000000000 <WriteS32>:
   0:	8b 05 00 00 00 00    	mov    0x0(%rip),%eax        # 6 <WriteS32+0x6>
   6:	66 89 05 00 00 00 00 	mov    %ax,0x0(%rip)        # d <WriteS32+0xd>
   d:	8b 05 00 00 00 00    	mov    0x0(%rip),%eax        # 13 <WriteS32+0x13>
  13:	66 89 05 00 00 00 00 	mov    %ax,0x0(%rip)        # 1a <WriteS32+0x1a>
  1a:	c3                   	retq   
  1b:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)

0000000000000020 <ReadS32>:
  20:	0f b7 05 00 00 00 00 	movzwl 0x0(%rip),%eax        # 27 <ReadS32+0x7>
  27:	0f b7 15 00 00 00 00 	movzwl 0x0(%rip),%edx        # 2e <ReadS32+0xe>
  2e:	89 05 00 00 00 00    	mov    %eax,0x0(%rip)        # 34 <ReadS32+0x14>
  34:	89 15 00 00 00 00    	mov    %edx,0x0(%rip)        # 3a <ReadS32+0x1a>
  3a:	c3                   	retq   
  3b:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)

0000000000000040 <WriteS16>:
  40:	0f b7 05 00 00 00 00 	movzwl 0x0(%rip),%eax        # 47 <WriteS16+0x7>
  47:	66 89 05 00 00 00 00 	mov    %ax,0x0(%rip)        # 4e <WriteS16+0xe>
  4e:	0f b7 05 00 00 00 00 	movzwl 0x0(%rip),%eax        # 55 <WriteS16+0x15>
  55:	66 89 05 00 00 00 00 	mov    %ax,0x0(%rip)        # 5c <WriteS16+0x1c>
  5c:	c3                   	retq   
  5d:	0f 1f 00             	nopl   (%rax)

0000000000000060 <ReadS16>:
  60:	0f b7 05 00 00 00 00 	movzwl 0x0(%rip),%eax        # 67 <ReadS16+0x7>
  67:	0f b7 15 00 00 00 00 	movzwl 0x0(%rip),%edx        # 6e <ReadS16+0xe>
  6e:	66 89 05 00 00 00 00 	mov    %ax,0x0(%rip)        # 75 <ReadS16+0x15>
  75:	66 89 15 00 00 00 00 	mov    %dx,0x0(%rip)        # 7c <ReadS16+0x1c>
  7c:	c3                   	retq   
  7d:	0f 1f 00             	nopl   (%rax)

Hence, please let me know why this behavior occurs, and whether it is somehow possible to force the ARMCC compiler to generate exactly the same code for both cases (bit-field and half-word reads and writes).

Parents
  • Hello, your implementations of ReadS16 and WriteS16 were not given above, but I assume they were of the form:

    /* Assumed form (not given in the original post): stores the globals
       xS16 and yS16 into the uint16_t members test16data.a and test16data.b. */
    void WriteS16(void){
    test16data.a = xS16;
    test16data.b = yS16;}

    /* Assumed form (not given in the original post): copies test16data.a and
       test16data.b into the globals xS16 and yS16, then returns xS16
       converted to int. */
    int ReadS16(void){
    xS16 = test16data.a;
    yS16 = test16data.b;
    return xS16;}


    That being said, it is really the structure definitions that cause the difference - they are similar, but not identical.

    Understanding the generated code explains the subtle difference.

    For your ReadS32 implementation, we must read (LDR) a single 32-bit value:

    0x00000024: e5901000 .... LDR r1,[r0,#0]

    then use shift instructions to separate the a and b bit-fields from this value.
    (When targeting a more modern CPU, the compiler can use a UXTH instruction to extract the low half-word more efficiently; try adding "--cpu Cortex-A9" to your compiler command.)

    For ReadS16, we can read each 16-bit data value directly with individual LDRH (load halfword) instructions.

    Depending on the nature of this code, one or the other may be appropriate. If bit-fields are used, the structure likely describes a peripheral register on the target, and such a register may be sensitive to sub-word accesses.

    Hence, the Arm compiler is correct to generate different code for each implementation.

    Hope this helps explain the behavior, Ronan

Reply
  • Hello, your implementations of ReadS16 and WriteS16 were not given above, but I assume they were of the form:

    /* Assumed form (not given in the original post): stores the globals
       xS16 and yS16 into the uint16_t members test16data.a and test16data.b. */
    void WriteS16(void){
    test16data.a = xS16;
    test16data.b = yS16;}

    /* Assumed form (not given in the original post): copies test16data.a and
       test16data.b into the globals xS16 and yS16, then returns xS16
       converted to int. */
    int ReadS16(void){
    xS16 = test16data.a;
    yS16 = test16data.b;
    return xS16;}


    That being said, it is really the structure definitions that cause the difference - they are similar, but not identical.

    Understanding the generated code explains the subtle difference.

    For your ReadS32 implementation, we must read (LDR) a single 32-bit value:

    0x00000024: e5901000 .... LDR r1,[r0,#0]

    then use shift instructions to separate the a and b bit-fields from this value.
    (When targeting a more modern CPU, the compiler can use a UXTH instruction to extract the low half-word more efficiently; try adding "--cpu Cortex-A9" to your compiler command.)

    For ReadS16, we can read each 16-bit data value directly with individual LDRH (load halfword) instructions.

    Depending on the nature of this code, one or the other may be appropriate. If bit-fields are used, the structure likely describes a peripheral register on the target, and such a register may be sensitive to sub-word accesses.

    Hence, the Arm compiler is correct to generate different code for each implementation.

    Hope this helps explain the behavior, Ronan

Children