I have spent the last few months re-re-re-rewriting the DCT32 portion of my fixed-point MP3 decoder in a manner that avoids the need to use a stack frame. I'm still not QUITE there, but I think it is interesting for people to understand how to write C so that it will compile into efficient Thumb (Cortex-M0+) assembly language. I think the term 'convoluted' can reasonably be applied. First, the original C; after that, the rewritten C, which uses r13 (SP) as an additional address register. I don't think C compilers are able to take advantage of the powerful addressing modes open to SP, because the compiler is unable to predict whether the code is intended to run using PSP or MSP. As always, the trick is to use every available register and to count every single cycle. Generally these kinds of tricks are used by demo programmers, but I can assure you that on every commercial game I wrote, I pulled such stunts to make the impossible just-about-possible.
/*
 * Original form of the "second pass" loop (excerpt; i, buf, cptr, a0..a7,
 * b0..b7, MULSHIFT32 and COS4_0 are all declared outside this excerpt).
 *
 * The loop runs four times. Each iteration reads the eight elements
 * buf[0..7] into locals, combines them, and writes eight results back to
 * buf[0..7]; all reads of the group happen before any write to it.
 * Six coefficients are consumed through cptr per iteration: the third
 * coefficient of each half is read once without and once with the
 * post-increment, so it is used twice.
 *
 * NOTE(review): MULSHIFT32 is opaque here; presumably it returns the high
 * 32 bits of a 32x32-bit signed product - confirm against its definition.
 */
/* Outer pair (0,7) and pair (3,4); results are scaled by << 1, << 3, << 1, << 1. */
/* second pass */ for (i = 4; i > 0; i--) { a0 = buf[0]; a7 = buf[7]; a3 = buf[3]; a4 = buf[4]; b0 = a0 + a7; b7 = MULSHIFT32(*cptr++, a0 - a7) << 1; b3 = a3 + a4; b4 = MULSHIFT32(*cptr++, a3 - a4) << 3; a0 = b0 + b3; a3 = MULSHIFT32(*cptr, b0 - b3) << 1; a4 = b4 + b7; a7 = MULSHIFT32(*cptr++, b7 - b4) << 1;
/* Pairs (1,6) and (2,5), same pattern; results are scaled by << 1, << 1, << 2, << 2. */
a1 = buf[1]; a6 = buf[6]; a2 = buf[2]; a5 = buf[5]; b1 = a1 + a6; b6 = MULSHIFT32(*cptr++, a1 - a6) << 1; b2 = a2 + a5; b5 = MULSHIFT32(*cptr++, a2 - a5) << 1; a1 = b1 + b2; a2 = MULSHIFT32(*cptr, b1 - b2) << 2; a5 = b5 + b6; a6 = MULSHIFT32(*cptr++, b6 - b5) << 2;
/* Combine a0..a3 using the constant COS4_0 and store the lower half, buf[0..3]. */
b0 = a0 + a1; b1 = MULSHIFT32(COS4_0, a0 - a1) << 1; b2 = a2 + a3; b3 = MULSHIFT32(COS4_0, a3 - a2) << 1; buf[0] = b0; buf[1] = b1; buf[2] = b2 + b3; buf[3] = b3;
/* Combine a4..a7 the same way; b6 absorbs b7 before the upper half, buf[4..7], is stored. */
b4 = a4 + a5; b5 = MULSHIFT32(COS4_0, a4 - a5) << 1; b6 = a6 + a7; b7 = MULSHIFT32(COS4_0, a7 - a6) << 1; b6 += b7; buf[4] = b4 + b6; buf[5] = b5 + b7; buf[6] = b5 + b6; buf[7] = b7;
/* Step to the next group of eight; after the loop, rewind buf by 4 * 8 = 32 elements to its value at loop entry. */
buf += 8; } buf -= 32; /* reset */
/*
 * Rewritten form of the same "second pass" loop (excerpt). The arithmetic
 * statements and the order in which cptr is read match the original loop;
 * what changes is placement: each pair of buf loads sits directly before
 * the sum/difference that consumes it, and each store to buf sits next to
 * the computation of the stored value. All loads of a group still precede
 * every store to it.
 *
 * The trailing "//+x = rN" notes record the register the author assigns to
 * each value; r13 (SP) is annotated as holding cptr.
 *
 * NOTE(review): in this pasted form several statements follow a "//" note
 * on the same physical line, so a C compiler would treat them as comment
 * text; presumably each statement was on its own line originally - restore
 * the line breaks before compiling.
 * NOTE(review): a2 and a6 are both annotated r12 although both values are
 * still needed afterwards (a2 for b2/b3, a6 for b6/b7) - confirm the
 * intended register for a6.
 */
/* second pass */ for (i = 4; i > 0; i--) { a0 = buf[0]; a7 = buf[7]; b0 = a0 + a7; //+b0 = r6 b7 = MULSHIFT32(*cptr++, a0 - a7) << 1; //+b7 = r7
a3 = buf[3]; a4 = buf[4]; b3 = a3 + a4; //+b3 = r8 b4 = MULSHIFT32(*cptr++, a3 - a4) << 3; //+b4 = r9
/* The next two statements share one coefficient: read via *cptr, then via *cptr++. */
a0 = b0 + b3; //+a0 = r6 a3 = MULSHIFT32(*cptr, b0 - b3) << 1; //+a3 = r8
a4 = b4 + b7; //+a4 = r9 a7 = MULSHIFT32(*cptr++, b7 - b4) << 1; //+a7 = r7
a1 = buf[1]; a6 = buf[6]; b1 = a1 + a6; //+b1 = r10 b6 = MULSHIFT32(*cptr++, a1 - a6) << 1; //+b6 = r11
a2 = buf[2]; a5 = buf[5]; b2 = a2 + a5; //+b2 = r12 b5 = MULSHIFT32(*cptr++, a2 - a5) << 1; //+b5 = r14
/* Again one coefficient is used twice: *cptr, then *cptr++. */
a1 = b1 + b2; //+a1 = r10 a2 = MULSHIFT32(*cptr, b1 - b2) << 2; //+a2 = r12
a5 = b5 + b6; //+a5 = r14 a6 = MULSHIFT32(*cptr++, b6 - b5) << 2; //+a6 = r12
/* NOTE(review): the next line is an assembler-style remark (leading ';'), not valid C; it marks the last use of cptr in this iteration. */
;*cptr (r13) no longer needed.
/* Lower half: each result is stored to buf[0..3] as soon as it is available. */
b0 = a0 + a1; buf[0] = b0; b1 = MULSHIFT32(COS4_0, a0 - a1) << 1; buf[1] = b1;
b2 = a2 + a3; b3 = MULSHIFT32(COS4_0, a3 - a2) << 1; buf[3] = b3; buf[2] = b2 + b3;
/* Upper half: buf[7] is stored first, then b6 absorbs b7 before buf[4..6] are written. */
b4 = a4 + a5; b5 = MULSHIFT32(COS4_0, a4 - a5) << 1;
b6 = a6 + a7; b7 = MULSHIFT32(COS4_0, a7 - a6) << 1; buf[7] = b7; b6 += b7; buf[4] = b4 + b6; buf[5] = b5 + b7; buf[6] = b5 + b6;
/* NOTE(review): the excerpt ends here without closing the loop; presumably "buf += 8; }" follows as in the original loop - confirm. */
I doubt that many programmers today even look at the generated code. Sean, I guess you and I are like dinosaurs w.r.t. assembly. But then, the crocodiles are also dinosaurs and are still there.