64-bit Square Root

Optimized for the architecture of the HC12.  Worst-case execution time is approximately 24000 cycles (0.6 ms on a 40 MHz XGATE part).

At http://azillionmonkeys.com/qed/sqroot.html I found a 32-bit square root algorithm based on a table lookup; using my uldiv routine it executes in fewer than 1600 cycles worst case on an HC12.

// Integer square root of a 64-bit unsigned value: returns floor(sqrt(*n)).
//
// Uses the bit-pair (digit-by-digit) method: the operand is consumed two
// bits at a time, most significant pair first, and every pair contributes
// one bit to the root.  Leading all-zero pairs are skipped up front because
// they can only produce leading zero bits in the root.
//
//   n       pointer to the operand; it is only read, never modified.
//   return  the 32-bit root (0 when the operand is 0).
//
// Assumes uint8 is an 8-bit character type, uint16 is two bytes and uint64
// is exactly eight bytes -- TODO confirm against the project's typedefs.
// NOTE(review): cycle counts on the HC12 have not been re-measured for this
// version of the loop.
uint32 longlong_sqrt(uint64 *n) {
  const uint8 *bptr = (const uint8 *)n;  // current operand byte, MSB first
  const uint16 probe = 1;                // byte-order probe, see below
  uint8  swapped[8];                     // MSB-first copy, little-endian only
  uint32 rem = 0;                        // running remainder
  uint32 result = 0;                     // root bits produced so far
  uint8  index = 32;                     // bit pairs still to be consumed
  uint8  shift = 0;                      // position of next pair in *bptr
  uint8  lead, bits, i;

  // The byte walk below needs the operand most significant byte first.
  // Where the first byte of the probe is non-zero the host stores the low
  // byte first, so work on a reversed copy; otherwise the operand is read in
  // place at no extra cost.  Only byte accesses are used, so the operand is
  // never read through an incompatible wider pointer type.
  if(*(const uint8 *)&probe != 0) {
    for(i = 0; i < 8; i++) {
      swapped[i] = bptr[7 - i];
    }
    bptr = swapped;
  }

  // Skip leading zero bytes; each one holds four bit pairs.  The index test
  // comes first so that an all-zero operand never reads past byte 7.
  while(index != 0 && *bptr == 0) {
    index -= 4;
    bptr++;
  }

  // Locate the first non-zero bit pair inside the leading non-zero byte.
  if(index != 0) {
    lead = *bptr;
    if(lead >= 0x40) {
      shift = 6;
    } else if(lead >= 0x10) {
      shift = 4;
      index -= 1;
    } else if(lead >= 0x04) {
      shift = 2;
      index -= 2;
    } else {
      shift = 0;
      index -= 3;
    }
  }

  while(index != 0) {
    bits = (uint8)((*bptr >> shift) & 3);

    // With p = result, the textbook test is 4*rem + bits >= 4*p + 1, but
    // both sides can need 33-34 bits once 31 or more pairs are processed and
    // would wrap in a uint32.  Because rem <= 2*p always holds, the test is
    // equivalent to the comparison below, which never leaves 32 bits.
    if(rem > result || (rem == result && bits != 0)) {
      // New remainder is 4*(rem - p) + bits - 1.  It may wrap on the very
      // last pair only, where it is no longer needed.
      rem = ((rem - result) << 2) + (uint32)bits - (uint32)1;
      result = (result << 1) | (uint32)1;
    } else {
      rem = (rem << 2) + (uint32)bits;
      result <<= 1;
    }

    // Advance to the next pair: down within the byte, then on to the next
    // (less significant) byte.
    if(shift == 0) {
      shift = 6;
      bptr++;
    } else {
      shift -= 2;
    }
    index--;
  }

  return result;
}