This discussion has been locked.
You can no longer post new replies to this discussion. If you have a question, you can start a new discussion.

Reorganising C code to be optimal for Thumb-1 (Cortex M0+) Instruction-Set

I have spent the last few months re-re-re-rewriting the DCT32 portion of my fixed-point MP3 decoder in a manner that avoids the need to use a stack-frame. I'm still not QUITE there but I think it is interesting for people to understand how to write C so that it will compile into efficient Thumb (Cortex M0+) assembly language. I think the term 'convoluted' can reasonably be applied. First, the original C; after that, the rewritten C, which uses r13 (SP) as an additional address register.

I don't think C compilers are able to take advantage of the powerful addressing modes open to SP because the compiler is unable to predict if the code is intended to run using PSP or MSP. As always, the trick is to use every available register and to count every single cycle. Generally these kinds of tricks are used by demo programmers but I can assure you that on every commercial game I wrote, I pulled such stunts to make the impossible just-about-possible.


/*
 * second pass
 *
 * Four iterations; each one transforms the eight values buf[0..7] in place
 * and then advances buf by 8. Every iteration consumes six coefficients
 * from cptr (six "*cptr++" reads), so cptr moves on by 24 over the loop.
 * NOTE(review): MULSHIFT32 and COS4_0 are defined outside this excerpt;
 * presumably MULSHIFT32 yields the high word of a 32x32-bit product and the
 * trailing shifts undo per-coefficient scaling - confirm at the definitions.
 */
for (i = 4; i > 0; i--) {
/* pairs (0,7) and (3,4): sum, and coefficient-scaled difference */
a0 = buf[0]; a7 = buf[7]; a3 = buf[3]; a4 = buf[4];
b0 = a0 + a7; b7 = MULSHIFT32(*cptr++, a0 - a7) << 1;
b3 = a3 + a4; b4 = MULSHIFT32(*cptr++, a3 - a4) << 3;
/* *cptr is read without ++ here: the next line reuses the same coefficient */
a0 = b0 + b3; a3 = MULSHIFT32(*cptr, b0 - b3) << 1;
a4 = b4 + b7; a7 = MULSHIFT32(*cptr++, b7 - b4) << 1;

/* pairs (1,6) and (2,5): same pattern as above */
a1 = buf[1]; a6 = buf[6]; a2 = buf[2]; a5 = buf[5];
b1 = a1 + a6; b6 = MULSHIFT32(*cptr++, a1 - a6) << 1;
b2 = a2 + a5; b5 = MULSHIFT32(*cptr++, a2 - a5) << 1;
/* again one coefficient shared by two consecutive multiplies */
a1 = b1 + b2; a2 = MULSHIFT32(*cptr, b1 - b2) << 2;
a5 = b5 + b6; a6 = MULSHIFT32(*cptr++, b6 - b5) << 2;

/* final stage for outputs 0..3: every multiply uses the constant COS4_0 */
b0 = a0 + a1; b1 = MULSHIFT32(COS4_0, a0 - a1) << 1;
b2 = a2 + a3; b3 = MULSHIFT32(COS4_0, a3 - a2) << 1;
buf[0] = b0; buf[1] = b1;
buf[2] = b2 + b3; buf[3] = b3;

/* final stage for outputs 4..7 */
b4 = a4 + a5; b5 = MULSHIFT32(COS4_0, a4 - a5) << 1;
b6 = a6 + a7; b7 = MULSHIFT32(COS4_0, a7 - a6) << 1;
b6 += b7; /* b6 now includes b7; both stores that use b6 below rely on this */
buf[4] = b4 + b6; buf[5] = b5 + b7;
buf[6] = b5 + b6; buf[7] = b7;

buf += 8; /* step to the next group of eight values */
}
buf -= 32; /* reset: undo the 4 x 8 advance made by the loop */






/*
 * second pass (reordered)
 *
 * Computes the same results as the original second pass: four iterations,
 * each transforming buf[0..7] in place, consuming six coefficients from
 * cptr, and advancing buf by 8. The statements are ordered so that each
 * value is loaded just before it is first used and stored as soon as it is
 * final. The "//+x = rN" notes record the register the author assigns to
 * each value in the hand-written Thumb version; an a-value reuses the
 * register of the b-value it replaces.
 * NOTE(review): MULSHIFT32 and COS4_0 are defined outside this excerpt.
 */
for (i = 4; i > 0; i--)
{
/* pair (0,7): sum, and coefficient-scaled difference */
a0 = buf[0];
a7 = buf[7];
b0 = a0 + a7; //+b0 = r6
b7 = MULSHIFT32(*cptr++, a0 - a7) << 1; //+b7 = r7

/* pair (3,4) */
a3 = buf[3];
a4 = buf[4];
b3 = a3 + a4; //+b3 = r8
b4 = MULSHIFT32(*cptr++, a3 - a4) << 3; //+b4 = r9

/* *cptr is read without ++ here: the multiply below reuses the coefficient */
a0 = b0 + b3; //+a0 = r6
a3 = MULSHIFT32(*cptr, b0 - b3) << 1; //+a3 = r8

a4 = b4 + b7; //+a4 = r9
a7 = MULSHIFT32(*cptr++, b7 - b4) << 1; //+a7 = r7

/* pair (1,6) */
a1 = buf[1];
a6 = buf[6];
b1 = a1 + a6; //+b1 = r10
b6 = MULSHIFT32(*cptr++, a1 - a6) << 1; //+b6 = r11

/* pair (2,5) */
a2 = buf[2];
a5 = buf[5];
b2 = a2 + a5; //+b2 = r12
b5 = MULSHIFT32(*cptr++, a2 - a5) << 1; //+b5 = r14

/* again one coefficient shared by two multiplies */
a1 = b1 + b2; //+a1 = r10
a2 = MULSHIFT32(*cptr, b1 - b2) << 2; //+a2 = r12

a5 = b5 + b6; //+a5 = r14
a6 = MULSHIFT32(*cptr++, b6 - b5) << 2; //+a6 = r11 (post said r12, but r12 still holds a2 - TODO confirm)

/* cptr (r13) is not read again in this iteration. */

/* final stage for outputs 0..3: every multiply uses the constant COS4_0 */
b0 = a0 + a1;
buf[0] = b0;
b1 = MULSHIFT32(COS4_0, a0 - a1) << 1;
buf[1] = b1;

b2 = a2 + a3;
b3 = MULSHIFT32(COS4_0, a3 - a2) << 1;
buf[3] = b3;
buf[2] = b2 + b3;

/* final stage for outputs 4..7 */
b4 = a4 + a5;
b5 = MULSHIFT32(COS4_0, a4 - a5) << 1;

b6 = a6 + a7;
b7 = MULSHIFT32(COS4_0, a7 - a6) << 1;
buf[7] = b7; /* b7 is final, so it is stored before being folded into b6 */
b6 += b7;
buf[4] = b4 + b6;
buf[5] = b5 + b7;
buf[6] = b5 + b6;

buf += 8; /* step to the next group of eight values */
}
buf -= 32; /* reset: undo the 4 x 8 advance made by the loop */