Optimized for the HC12 architecture. Worst-case execution time is approximately 24,000 cycles (0.6 ms on a 40 MHz XGATE part).
At http://azillionmonkeys.com/qed/sqroot.html I found a 32-bit square root algorithm, based on a table lookup, that executes in fewer than 1,600 cycles worst case on an HC12 (using my uldiv routine).
/*
 * longlong_sqrt - integer square root of an unsigned 64-bit value.
 *
 * n: pointer to the operand. Must be non-NULL; the operand is only read.
 *
 * Returns floor(sqrt(*n)), which always fits in 32 bits.
 *
 * Method: bit-pair ("digit-by-digit") square root. The operand is consumed
 * two bits at a time from the most significant non-zero pair downwards, and
 * each pair yields one bit of the result.
 *
 * Design notes:
 *  - The operand is accessed by value with shifts rather than through
 *    byte/word pointer casts, so the result does not depend on the byte
 *    order of the target and no object is accessed through an incompatible
 *    pointer type.
 *  - The running remainder and the doubled partial root are 64 bits wide.
 *    With r < 2^31 being the partial root before the final step, the trial
 *    subtrahend 4r + 1 can reach almost 2^33 and the remainder almost 2^34;
 *    32-bit variables would wrap for operands of roughly 2^60 and above and
 *    drop result bits.
 *  - NOTE(review): 64-bit arithmetic is likely slower than 32-bit arithmetic
 *    on 16-bit targets; re-measure cycle counts if timing is critical.
 */
uint32 longlong_sqrt(uint64 *n)
{
    const uint64 value = *n;
    uint64 rem = 0;         /* remainder: bits consumed so far minus r*r */
    uint64 twice_root = 0;  /* 2 * r, where r is the partial root */
    uint32 result = 0;
    uint8 index = 32;       /* number of result bits still to produce */

    /* Skip leading all-zero bit pairs; each one contributes a zero bit. */
    while (index != 0 && (value >> (2u * (index - 1u))) == 0) {
        index--;
    }

    while (index != 0) {
        index--;

        /* Trial subtrahend for appending a 1 bit: (2r + 1)^2 - (2r)^2. */
        twice_root = (twice_root << 1) + 1;

        /* Bring the next two operand bits down into the remainder. */
        rem = (rem << 2) | ((value >> (2u * index)) & 3u);

        if (twice_root <= rem) {
            /* The new bit is 1: r becomes 2r + 1. */
            rem -= twice_root;
            twice_root++;
            result |= (uint32)1 << index;
        } else {
            /* The new bit is 0: r becomes 2r. */
            twice_root--;
        }
    }

    return result;
}