How can CLZ equivalent be achieved on Cortex-M0 where this instruction is missing?

Looking for alternates for this instruction.

Parents
  • If the goal is to reduce the code size, for the price of just one more comparison in the critical path of the code, the size of the log2Lkup lookup table can be reduced from 256 to just 16 bytes.

    So here is the deterministic code that takes about 94 bytes of code space, including the lookup table:

    static inline uint32_t CLZ(uint32_t x) {
        static uint8_t const log2Lkup[] = {
            0U, 1U, 2U, 2U, 3U, 3U, 3U, 3U, 4U, 4U, 4U, 4U, 4U, 4U, 4U, 4U
        };
        uint32_t n;
        if (x >= (1U << 16)) {
            if (x >= (1U << 24)) {
                if (x >= (1 << 28)) {
                    n = 28U;
                }
                else {
                    n = 24U;
                }
            }
            else {
                if (x >= (1U << 20)) {
                    n = 20U;
                }
                else {
                    n = 16U;
                }
            }
        }
        else {
            if (x >= (1U << 8)) {
                if (x >= (1U << 12)) {
                    n = 12U;
                }
                else {
                    n = 8U;
                }
            }
            else {
                if (x >= (1U << 4)) {
                    n = 4U;
                }
                else {
                    n = 0U;
                }
            }
        }
        return 32U - n - log2Lkup[x >> n];
    }
    
    
    
    

    Here is the disassembly of the CLZ function for the Cortex-M0 core generated with IAR EWARM (high level of optimization):

        volatile uint32_t y = CLZ(x);
            0x272: 0x9800         LDR       R0, [SP]
        if (x >= (1U << 16)) {
            0x274: 0x2280         MOVS      R2, #128                ; 0x80
            0x276: 0x0252         LSLS      R2, R2, #9
            0x278: 0x4290         CMP       R0, R2
            0x27a: 0xd310         BCC.N     ??main_0                ; 0x29e
            if (x >= (1U << 24)) {
            0x27c: 0x0211         LSLS      R1, R2, #8
            0x27e: 0x4288         CMP       R0, R1
            0x280: 0xd306         BCC.N     ??main_1                ; 0x290
                if (x >= (1 << 28)) {
            0x282: 0x0109         LSLS      R1, R1, #4
            0x284: 0x4288         CMP       R0, R1
            0x286: 0xd301         BCC.N     ??main_2                ; 0x28c
                    n = 28U;
            0x288: 0x211c         MOVS      R1, #28                 ; 0x1c
            0x28a: 0xe014         B.N       ??main_3                ; 0x2b6
                    n = 24U;
    ??main_2:
            0x28c: 0x2118         MOVS      R1, #24                 ; 0x18
            0x28e: 0xe012         B.N       ??main_3                ; 0x2b6
                if (x >= (1U << 20)) {
    ??main_1:
            0x290: 0x0909         LSRS      R1, R1, #4
            0x292: 0x4288         CMP       R0, R1
            0x294: 0xd301         BCC.N     ??main_4                ; 0x29a
                    n = 20U;
            0x296: 0x2114         MOVS      R1, #20                 ; 0x14
            0x298: 0xe00d         B.N       ??main_3                ; 0x2b6
                    n = 16U;
    ??main_4:
            0x29a: 0x2110         MOVS      R1, #16                 ; 0x10
            0x29c: 0xe00b         B.N       ??main_3                ; 0x2b6
            if (x >= (1U << 8)) {
    ??main_0:
            0x29e: 0x28ff         CMP       R0, #255                ; 0xff
            0x2a0: 0xd904         BLS.N     ??main_5                ; 0x2ac
                if (x >= (1U << 12)) {
            0x2a2: 0x0912         LSRS      R2, R2, #4
            0x2a4: 0x4290         CMP       R0, R2
            0x2a6: 0xd206         BCS.N     ??main_3                ; 0x2b6
                    n = 8U;
            0x2a8: 0x2108         MOVS      R1, #8
            0x2aa: 0xe004         B.N       ??main_3                ; 0x2b6
                if (x >= (1U << 4)) {
    ??main_5:
            0x2ac: 0x2810         CMP       R0, #16                 ; 0x10
            0x2ae: 0xd301         BCC.N     ??main_6                ; 0x2b4
                    n = 4U;
            0x2b0: 0x2104         MOVS      R1, #4
            0x2b2: 0xe000         B.N       ??main_3                ; 0x2b6
                    n = 0U;
    ??main_6:
            0x2b4: 0x2100         MOVS      R1, #0
    ??main_3:
            0x2b6: 0x2220         MOVS      R2, #32                 ; 0x20
            0x2b8: 0x1a52         SUBS      R2, R2, R1
            0x2ba: 0x40c8         LSRS      R0, R0, R1
            0x2bc: 0xa129         ADR.N     R1, ??log2Lkup          ; 0x364
            0x2be: 0x5c08         LDRB      R0, [R1, R0]
            0x2c0: 0x1a10         SUBS      R0, R2, R0
            0x2c2: 0x9000         STR       R0, [SP]
    

    By my count, this code executes always in just 15 instructions. The number of clock cycles will be of course slightly bigger, especially for branch instructions that break the instruction pipeline.

Reply
  • If the goal is to reduce the code size, for the price of just one more comparison in the critical path of the code, the size of the log2Lkup lookup table can be reduced from 256 to just 16 bytes.

    So here is the deterministic code that takes about 94 bytes of code space, including the lookup table:

    static inline uint32_t CLZ(uint32_t x) {
        static uint8_t const log2Lkup[] = {
            0U, 1U, 2U, 2U, 3U, 3U, 3U, 3U, 4U, 4U, 4U, 4U, 4U, 4U, 4U, 4U
        };
        uint32_t n;
        if (x >= (1U << 16)) {
            if (x >= (1U << 24)) {
                if (x >= (1 << 28)) {
                    n = 28U;
                }
                else {
                    n = 24U;
                }
            }
            else {
                if (x >= (1U << 20)) {
                    n = 20U;
                }
                else {
                    n = 16U;
                }
            }
        }
        else {
            if (x >= (1U << 8)) {
                if (x >= (1U << 12)) {
                    n = 12U;
                }
                else {
                    n = 8U;
                }
            }
            else {
                if (x >= (1U << 4)) {
                    n = 4U;
                }
                else {
                    n = 0U;
                }
            }
        }
        return 32U - n - log2Lkup[x >> n];
    }
    
    
    
    

    Here is the disassembly of the CLZ function for the Cortex-M0 core generated with IAR EWARM (high level of optimization):

        volatile uint32_t y = CLZ(x);
            0x272: 0x9800         LDR       R0, [SP]
        if (x >= (1U << 16)) {
            0x274: 0x2280         MOVS      R2, #128                ; 0x80
            0x276: 0x0252         LSLS      R2, R2, #9
            0x278: 0x4290         CMP       R0, R2
            0x27a: 0xd310         BCC.N     ??main_0                ; 0x29e
            if (x >= (1U << 24)) {
            0x27c: 0x0211         LSLS      R1, R2, #8
            0x27e: 0x4288         CMP       R0, R1
            0x280: 0xd306         BCC.N     ??main_1                ; 0x290
                if (x >= (1 << 28)) {
            0x282: 0x0109         LSLS      R1, R1, #4
            0x284: 0x4288         CMP       R0, R1
            0x286: 0xd301         BCC.N     ??main_2                ; 0x28c
                    n = 28U;
            0x288: 0x211c         MOVS      R1, #28                 ; 0x1c
            0x28a: 0xe014         B.N       ??main_3                ; 0x2b6
                    n = 24U;
    ??main_2:
            0x28c: 0x2118         MOVS      R1, #24                 ; 0x18
            0x28e: 0xe012         B.N       ??main_3                ; 0x2b6
                if (x >= (1U << 20)) {
    ??main_1:
            0x290: 0x0909         LSRS      R1, R1, #4
            0x292: 0x4288         CMP       R0, R1
            0x294: 0xd301         BCC.N     ??main_4                ; 0x29a
                    n = 20U;
            0x296: 0x2114         MOVS      R1, #20                 ; 0x14
            0x298: 0xe00d         B.N       ??main_3                ; 0x2b6
                    n = 16U;
    ??main_4:
            0x29a: 0x2110         MOVS      R1, #16                 ; 0x10
            0x29c: 0xe00b         B.N       ??main_3                ; 0x2b6
            if (x >= (1U << 8)) {
    ??main_0:
            0x29e: 0x28ff         CMP       R0, #255                ; 0xff
            0x2a0: 0xd904         BLS.N     ??main_5                ; 0x2ac
                if (x >= (1U << 12)) {
            0x2a2: 0x0912         LSRS      R2, R2, #4
            0x2a4: 0x4290         CMP       R0, R2
            0x2a6: 0xd206         BCS.N     ??main_3                ; 0x2b6
                    n = 8U;
            0x2a8: 0x2108         MOVS      R1, #8
            0x2aa: 0xe004         B.N       ??main_3                ; 0x2b6
                if (x >= (1U << 4)) {
    ??main_5:
            0x2ac: 0x2810         CMP       R0, #16                 ; 0x10
            0x2ae: 0xd301         BCC.N     ??main_6                ; 0x2b4
                    n = 4U;
            0x2b0: 0x2104         MOVS      R1, #4
            0x2b2: 0xe000         B.N       ??main_3                ; 0x2b6
                    n = 0U;
    ??main_6:
            0x2b4: 0x2100         MOVS      R1, #0
    ??main_3:
            0x2b6: 0x2220         MOVS      R2, #32                 ; 0x20
            0x2b8: 0x1a52         SUBS      R2, R2, R1
            0x2ba: 0x40c8         LSRS      R0, R0, R1
            0x2bc: 0xa129         ADR.N     R1, ??log2Lkup          ; 0x364
            0x2be: 0x5c08         LDRB      R0, [R1, R0]
            0x2c0: 0x1a10         SUBS      R0, R2, R0
            0x2c2: 0x9000         STR       R0, [SP]
    

    By my count, this code executes always in just 15 instructions. The number of clock cycles will be of course slightly bigger, especially for branch instructions that break the instruction pipeline.

Children
No data