Looking for alternates for this instruction.
If the goal is to reduce the code size, for the price of just one more comparison in the critical path of the code, the size of the log2Lkup lookup table can be reduced from 256 to just 16 bytes.
So here is the deterministic code that takes about 94 bytes of code space, including the lookup table:
static inline uint32_t CLZ(uint32_t x) { static uint8_t const log2Lkup[] = { 0U, 1U, 2U, 2U, 3U, 3U, 3U, 3U, 4U, 4U, 4U, 4U, 4U, 4U, 4U, 4U }; uint32_t n; if (x >= (1U << 16)) { if (x >= (1U << 24)) { if (x >= (1 << 28)) { n = 28U; } else { n = 24U; } } else { if (x >= (1U << 20)) { n = 20U; } else { n = 16U; } } } else { if (x >= (1U << 8)) { if (x >= (1U << 12)) { n = 12U; } else { n = 8U; } } else { if (x >= (1U << 4)) { n = 4U; } else { n = 0U; } } } return 32U - n - log2Lkup[x >> n]; }
Here is the disassembly of the CLZ function for the Cortex-M0 core generated with IAR EWARM (high level of optimization):
volatile uint32_t y = CLZ(x); 0x272: 0x9800 LDR R0, [SP] if (x >= (1U << 16)) { 0x274: 0x2280 MOVS R2, #128 ; 0x80 0x276: 0x0252 LSLS R2, R2, #9 0x278: 0x4290 CMP R0, R2 0x27a: 0xd310 BCC.N ??main_0 ; 0x29e if (x >= (1U << 24)) { 0x27c: 0x0211 LSLS R1, R2, #8 0x27e: 0x4288 CMP R0, R1 0x280: 0xd306 BCC.N ??main_1 ; 0x290 if (x >= (1 << 28)) { 0x282: 0x0109 LSLS R1, R1, #4 0x284: 0x4288 CMP R0, R1 0x286: 0xd301 BCC.N ??main_2 ; 0x28c n = 28U; 0x288: 0x211c MOVS R1, #28 ; 0x1c 0x28a: 0xe014 B.N ??main_3 ; 0x2b6 n = 24U; ??main_2: 0x28c: 0x2118 MOVS R1, #24 ; 0x18 0x28e: 0xe012 B.N ??main_3 ; 0x2b6 if (x >= (1U << 20)) { ??main_1: 0x290: 0x0909 LSRS R1, R1, #4 0x292: 0x4288 CMP R0, R1 0x294: 0xd301 BCC.N ??main_4 ; 0x29a n = 20U; 0x296: 0x2114 MOVS R1, #20 ; 0x14 0x298: 0xe00d B.N ??main_3 ; 0x2b6 n = 16U; ??main_4: 0x29a: 0x2110 MOVS R1, #16 ; 0x10 0x29c: 0xe00b B.N ??main_3 ; 0x2b6 if (x >= (1U << 8)) { ??main_0: 0x29e: 0x28ff CMP R0, #255 ; 0xff 0x2a0: 0xd904 BLS.N ??main_5 ; 0x2ac if (x >= (1U << 12)) { 0x2a2: 0x0912 LSRS R2, R2, #4 0x2a4: 0x4290 CMP R0, R2 0x2a6: 0xd206 BCS.N ??main_3 ; 0x2b6 n = 8U; 0x2a8: 0x2108 MOVS R1, #8 0x2aa: 0xe004 B.N ??main_3 ; 0x2b6 if (x >= (1U << 4)) { ??main_5: 0x2ac: 0x2810 CMP R0, #16 ; 0x10 0x2ae: 0xd301 BCC.N ??main_6 ; 0x2b4 n = 4U; 0x2b0: 0x2104 MOVS R1, #4 0x2b2: 0xe000 B.N ??main_3 ; 0x2b6 n = 0U; ??main_6: 0x2b4: 0x2100 MOVS R1, #0 ??main_3: 0x2b6: 0x2220 MOVS R2, #32 ; 0x20 0x2b8: 0x1a52 SUBS R2, R2, R1 0x2ba: 0x40c8 LSRS R0, R0, R1 0x2bc: 0xa129 ADR.N R1, ??log2Lkup ; 0x364 0x2be: 0x5c08 LDRB R0, [R1, R0] 0x2c0: 0x1a10 SUBS R0, R2, R0 0x2c2: 0x9000 STR R0, [SP]
By my count, this code executes always in just 15 instructions. The number of clock cycles will be of course slightly bigger, especially for branch instructions that break the instruction pipeline.