I thought it may interest people to see how quite small pieces of C can generate some rather unnerving workings when converting into the Thumb instruction set. As always, i still desperately in need of a faster method to find the top 32 bits of a 32-bit x32-bit --->64 bit multiply. Bastian Schick currently holds the record having a piece of in-line code that takes just 17 cycles. The keen watcher will know that Bastian optimised the D32PF by unrolling and applying some great code. So here is the C;
void FDCT32(int *buf, int *dest, int offset, int oddBlock, int gb){ int i, s, tmp, es; const int *cptr = dcttab; int a0, a1, a2, a3, a4, a5, a6, a7; int b0, b1, b2, b3, b4, b5, b6, b7; int *d;
/* first pass */ D32FP(0, 1, 5, 1); D32FP(1, 1, 3, 1); D32FP(2, 1, 3, 1); D32FP(3, 1, 2, 1); D32FP(4, 1, 2, 1); D32FP(5, 1, 1, 2); D32FP(6, 1, 1, 2); D32FP(7, 1, 1, 4);
/* second pass */ for (i = 4; i > 0; i--) { a0 = buf[0]; a7 = buf[7]; a3 = buf[3]; a4 = buf[4]; b0 = a0 + a7; b7 = MULSHIFT32(*cptr++, a0 - a7) << 1; b3 = a3 + a4; b4 = MULSHIFT32(*cptr++, a3 - a4) << 3; a0 = b0 + b3; a3 = MULSHIFT32(*cptr, b0 - b3) << 1; a4 = b4 + b7; a7 = MULSHIFT32(*cptr++, b7 - b4) << 1;
a1 = buf[1]; a6 = buf[6]; a2 = buf[2]; a5 = buf[5]; b1 = a1 + a6; b6 = MULSHIFT32(*cptr++, a1 - a6) << 1; b2 = a2 + a5; b5 = MULSHIFT32(*cptr++, a2 - a5) << 1; a1 = b1 + b2; a2 = MULSHIFT32(*cptr, b1 - b2) << 2; a5 = b5 + b6; a6 = MULSHIFT32(*cptr++, b6 - b5) << 2;
b0 = a0 + a1; b1 = MULSHIFT32(COS4_0, a0 - a1) << 1; b2 = a2 + a3; b3 = MULSHIFT32(COS4_0, a3 - a2) << 1; buf[0] = b0; buf[1] = b1; buf[2] = b2 + b3; buf[3] = b3;
b4 = a4 + a5; b5 = MULSHIFT32(COS4_0, a4 - a5) << 1; b6 = a6 + a7; b7 = MULSHIFT32(COS4_0, a7 - a6) << 1; b6 += b7; buf[4] = b4 + b6; buf[5] = b5 + b7; buf[6] = b5 + b6; buf[7] = b7;
buf += 8; } buf -= 32; /* reset */And for comparison, my α version of the same logical steps. You will note that the MULSHIFT32 macro is frequent and takes 17 cycles.
;Second pass of DCT
ldr r0,[r7,#0] ldr r1,[r7,#7] add r5,r0,r1 ;b0 sub r0,r1
pop r1
MULSHIFT32
lsls r0,r0,#1 mov r8,r0 ;b7
;store b0=r5 & b7=r8
ldr r0,[r7,#3] ldr r1,[r7,#4] add r6,r0,r1 ;b3 sub r0,r1 pop r1
lsls r0,r0,#3 mov r9,r0 ;b4
;store b0=r5,b3=r6,b4=r9,b7=r8
sub r0,r5,r6 add r5,r5,r6 ;a0 = r5
ldr r1,[sp,#0]
lsls r0,r0,#1 mov r10,r0 ;a3
;store a0=r5,a3=r10,b4=r9,b7=r8
mov r0,r8 mov r1,r9 add r6,r0,r1 ;a4
sub r0,r1 pop r1
lsls r0,r0,#1 mov r8,r0 ;a7
;store a0=r5,a3=r10,a4=r6,a7=r8
ldr r0,[r7,#1] ldr r1,[r7,#6] add r2,r0,r1 mov r9,r2 ;b1
lsls r0,r0,#1 mov r10,r0 ;b6
;store a0=r5,a3=r10,a4=r6,a7=r8,b1=r9,b6=r11
ldr r0,[r7,#2] ldr r1,[r7,#5] add r2,r0,r1 mov r14,r2 ;b2
lsls r0,r0,#1 mov r12,r0 ;b5
;store a0=r5,a3=r10,a4=r6,a7=r8,b1=r9,b2=r11,b5=r12,b6=r14
mov r2,r9 mov r3,r11 add r9,r2 ;a1
sub r0,r2,r3
lsls r0,r0,#2
mov r11,r0 ;a2 ;store a0=r5,a1=r9,a2=r11,a3=r10,a4=r6,a7=r8,b5=r12,b6=r14
mov r2,r12 add r2,r14
add r5,r2,r6 ;a1
sub r0,r2,r6
lsls r0,r0,#2 mov r6,r0 ;a2
;store a0=r5,a1=r9,a2=r6,a3=r10,a4=r6,a7=r8,b5=r12,b6=r14
mov r1,r9 ;get a1 add r0,r5,r1 ;a0 + a1 str [buf,#0],r0
sub r0,r5,r1 ;a0 - a1
ldr r1,#COS4_0 ;copy for later use, mov r14,1 ;
lsls r0,r0,#1 str [buf,#1],r0
;a2=r5,a1=r9,a4=r6,a5=r5,a6=r4,a7=r8,b0=b0,b1
mov r0,r10 ;get a3 add r0,r6 ;a3 + a2 mov r5,r0 ;store b0 str [buf,#2],r0
mov r0,r10 ;get a3 sub r0,r0,r5 ;a3 - a2
ldr r1,#COS4_0
lsls r0,r0,#1 str [buf,#2],r0 add r0,r6 str [buf,#3],r0
;a4=r6,a5=r5,a6=r7,a7=r8 add r0,r5,r6 mov r9,r0 ;b4
sub r0,r6,r7 ;a4 - a5 mov r1,r14
lsls r5,r0,#1 ;b5
mov r0,r7 add r0,r8 ;b6 mov r8,r0
sub r0,r7
mov r1,r14
lsls r6,r0,#1 ;b7
;b4=r9,b5=r5,b6=r8,b7=r6
str [buf,#7],r6 ;store b7
mov r0,r9 add r0,r8
str [buf,#4],r0 ;store b4
add r0,r5,r6 ;b5 + b7
str [buf,#5],r0 ;store b5
add r5,r8
str [buf,#6],r5 ;store b6To be clear, this isn't perfect working code but rather a 'sketch' to roughly work out how many cycles the DCT takes in an MP3 decoder. I haven't calculated the exact number of cycles but it IS impossible to use a conditional branch back to the top of the loop. The code branches around an unconditional loop. Now I haven't dealt with the loops but with good reason, since I use the stack-pointer as a general purpose register, the plan is to find the base-address of RAM (0x20000000 on the board I have) so that the address can be generated in 3 instructions.The code is mind-bending. I did alter the order of the C slightly so that no more than 8 variables have to be stored in registers at any given time. The reason for that is simple:r0-r4 - Used by MULSHIFT32 but can be used between usages of the macro.r5-r6 - lo-register accumulators/storage of variablesr7 - pointer to the source address of the datar8 - r13 storage for variablesr14 - pointer to DCT workspaceAs a good rule of thumb (ha ha), MANY pieces of code that are developed by academics presume a RISC processor with 16 x 32-bit registers, one of which is the stack-pointer. Well, ARM/Thumb has SP,LR & IP making the person converting to a generic RISC processor needs to use SP & LR. There IS a trick involved so that even if an exception/interrupt takes place, nothing is corrupted.But if anyone out there has a short-cut to finding the top 32-bits of a 64-bit multiply result, please let me know...
I'm using the GNU assembler, Studio 7 & a SAMD21-based SoC. Of course decoder will be 100% assembly language. I have noticed that the DCT, imDCT & Polyphase filter constitute about 90% of decode time... so if I cannot get those three blocks of code fast enough, I won't have wasted so much time. It is also an excellent way to learn the instruction set.Of the twenty three instruction set's I have developed projects on, Thumb is by far the most difficult to optimise. During the development of some other fragments, I worked out a trick that was faster but was also a total paradigm shift so EVERYTHING has to be rewritten. This has happened multiple times. I estimate that I would find a new trick and/or optimisation every day on ALL the other processors I have written (professionally published) games on over 20+ years, with Thumb it's like a trick a month. That means that their is no incentive to finding optimisations BUT the Z80 is still in use after 45 years and the M0 is of 2009 vintage, the M0+ was only introduced in 2012 and people are STILL finding tricks.What I posted isn't debugged hence the term 'sketch' and was written in WIndows Notepad. While it isn't quite right, the number of cycles will not change much. I previously posted the first phase of the DCT which Bastian Schick unrolled and made faster. Since the inner-loop of the second stage is something like 267 instructions, unrolling really won't help much.As always, the key to ensuring that this decoder is fast enough to play 320Kb/S audio largely depends upon that ubiquitous MULSHIFT32 macro which takes 17 cycles.