Here are two assembly language functions which perform division of 32-bit and 16-bit numbers by values in the range 1…16 inclusive. Although the range of the divisor is very limited, these functions are significantly faster than conventional division functions. The 32-bit division requires about 25% of the time required by C51's 32-bit division and the 16-bit division requires about 60% of the time required by C51's 16-bit division. (Even though the Keil division functions seem to be well optimised.) The 32-bit division function is sufficiently fast that it may called more than once to divide a 32-bit value by a large value that has convenient factors. For example, dividing by 60 by first dividing by ten and then by six. This will still be significantly faster than conventional division. The functions also return a remainder and may find useful application in radix conversion by repeated division by the radix. These functions use the interesting trick of deriving a pointer into the current register bank. This allows the functions to loop through the dividend making the code much more compact although at the slight cost of speed.
// // fast_long_divide() // // This function will divide an unsigned 32-value in registers // R4...R7 by a value in the range 1...16 in register R3. The // function uses the currently selected register bank. // // For Keil parameter passing compatibility, the divisor is loaded // into R3 from memory. // // This function requires about 25% of the processing time compared // with the already well optimised Keil compiler unsigned long // divide function and should be much faster than any general // purpose 32-bit by 32-bit divide. // // DPL is used, but its value is preserved on the stack. This is // not essential, but is frequently helpful for compiler // optimisation. // // This function also returns the remainder in R0...R3 // (R0...R2) will always be zero. // #pragma ASM $REGUSE _fast_long_divide( R0, R1, R2, R3, R4, R5, R6, R7, A, B ) #pragma ENDASM unsigned long int fast_long_divide( unsigned long int dividend, unsigned char divisor ) small { divisor = divisor; #pragma ASM MOV R3,?_fast_long_divide?BYTE+4 ?fast_long_divide: ; // dividend in: R4,R5,R6,R7 // divisor in: R3 // uses: R0, R1, R2, A, B and DPL // quotient returned in: R4,R5,R6,R7 // remainder returned in: R0,R1,R2,R3 PUSH DPL ; ; MOV A,PSW ;Get the base address of ANL A,#0x18 ; the current register bank. ORL A,#0x04 ;Point to register R4. MOV R0,A ;R0 is a pointer to R4. ; MOV DPL,#04 ;load loop counter. ; MOV R2,#00 ;Clear the running remainder. ; ?fast_long_divide_loop: ; ; MOV A,@R0 ;Get the MS nibble of the SWAP A ; dividend into the accumulator. ANL A,#0x0F ; ORL A,R2 ; ; MOV B,R3 ;Divide MS nibble by the divisor. DIV AB ; ; SWAP A ;Save partial result (which must be MOV R1,A ; in the range 0...15) shifted 4 bits. ; MOV A,B ;Save remainder (which must be in SWAP A ; the range 0...15) shifted 4 bits left. MOV R2,A ; ; MOV A,@R0 ;Get next nibble of the dividend into ANL A,#0x0F ; the accumulator. ORL A,R2 ;Or in the remainder from previous ; partial division. MOV B,R3 ;Divide by the divisor. DIV AB ; ; ORL A,R1 ;Or in the previously saved partial result. MOV @R0,A ;Save the MSB of the quotient ready ; to be returned. MOV A,B ;Save remainder shifted 4 bits left. SWAP A ; MOV R2,A ; ; INC R0 ; DJNZ DPL,?fast_long_divide_loop CLR A ;Sort out the registers so that MOV R0,A ; the remainder ends up in MOV R1,A ; R3 with the other registers XCH A,R2 ; (R0,R1,R2) being zero. MOV R3,A ; ; POP DPL ; ; RET ; #pragma ENDASM return( dividend ); }
// // fast_int_divide() // // This function will divide an unsigned 16-value in registers // R6/R7 by a value in the range 1...16 in register R5. The // function uses the currently selected register bank. // // This function requires about 60% of the processing time compared // with the already well optimised Keil compiler unsigned long // divide function and should be much faster than most general // purpose 16-bit by 16-bit divides. // // This function also returns the remainder in R4/R5 (R4 will // always be zero. // #pragma ASM $REGUSE _fast_int_divide( R0, R1, R3, R4, R5, R6, R7, A, B ) #pragma ENDASM unsigned int fast_int_divide( unsigned int dividend, unsigned char divisor ) small { // dividend in: R6,R7 // divisor in: R5 // uses: R0, R1, R3 // quotient returned in: R6,R7 // remainder returned in: R4,R5 divisor = divisor; #pragma ASM ?fast_int_divide: ; ; MOV A,R5 ; JZ ?tiny_divisor_end ;if divide by zero: terminate. ; MOV A,PSW ;Get the base address of ANL A,#0x18 ; the current register bank. ORL A,#0x06 ;Point to register R6. MOV R0,A ;R0 is a pointer to R6. ; MOV R1,#02 ;load loop counter. ; MOV R4,#00 ;Clear the running remainder. ; ?fast_int_divide_loop: ; ; MOV A,@R0 ;Get the MS nibble of the dividend SWAP A ; into the accumulator. ANL A,#0x0F ; ORL A,R4 ; ; MOV B,R5 ;Divide MS nibble by the divisor. DIV AB ; ; SWAP A ;Save partial result (which must be in MOV R3,A ; the range 0...15) shifted 4 bits. ; MOV A,B ;Save remainder (which must be in SWAP A ; the range 0...15) shifted 4 bits left. MOV R4,A ; ; MOV A,@R0 ;Get next nibble of the dividend ANL A,#0x0F ; into the accumulator. ORL A,R4 ;Or in the remainder from previous ; partial division. MOV B,R5 ;Divide by the divisor. DIV AB ; ; ORL A,R3 ;Or in the previously saved partial result. MOV @R0,A ;Save the MSB of the quotient in R6/R7 ; ready to be returned. MOV A,B ;Save remainder shifted 4 bits left. SWAP A ; MOV R4,A ; ; INC R0 ; DJNZ R1,?fast_int_divide_loop CLR A ;Put remainder in R4/R5 XCH A,R4 ; MOV R5,A ; ; ?tiny_divisor_end: ; ; RET ; ; #pragma ENDASM return( dividend ); }
And a test harness:
extern unsigned int fast_int_divide( unsigned int dividend, unsigned char divisor ) small; extern unsigned long int fast_long_divide( unsigned long int dividend, unsigned char divisor ) small; unsigned int compiler_int_divide( unsigned int dividend, unsigned char divisor ) { return( dividend / divisor ); } unsigned int new_int_divide( unsigned int dividend, unsigned char divisor ) { return( fast_int_divide( dividend, divisor ) ); } unsigned long int compiler_long_divide( unsigned long int dividend, unsigned char divisor ) { return( dividend / divisor ); } unsigned long int new_long_divide( unsigned long int dividend, unsigned char divisor ) { return( fast_long_divide( dividend, divisor ) ); } unsigned long int random() { static unsigned int x; x = ( ( 31 * x ) + 123 ); if( x == 0xFFFF ) x = 0; return(x); } void main( void ) { unsigned int loop; unsigned int d; unsigned int sd; unsigned int i; unsigned long int l; unsigned long int lr; unsigned char dummy; dummy = 0; lr = new_long_divide( new_long_divide( 0x12345678, 6 ), 10 ); for( loop = 0; loop < 0xFFFF; loop++ ) { i = random(); d = random(); if( d == 0 ) d = 1; l = ( (unsigned long int) ( random() & 0xFF ) << ( random() % 24 ) ); sd = random() % 17; // small divisor always in range 1...16 if( sd == 0 ) sd = 1; if( new_int_divide( i, sd ) != compiler_int_divide( i, sd ) ) { dummy++; } if( new_long_divide( l, sd ) != compiler_long_divide( l, sd ) ) { dummy++; } } }
I have posted updated versions of these functions here: http://www.programmersheaven.com/zone5/cat27/32144.htm The versions posted on the Keil forum contain a minor bug in that the remainder is not returned correctly. The versions at Programmers Heaven are correct and include a radix conversion function. I have posted some other nuggets which can be found here: http://www.programmersheaven.com/zone5/cat27/index.htm Each of my contributions has the epithet fast.