32x32 bit divide

The ICC12 library 32-bit by 32-bit divide requires more than 2500 cycles. It is a typical shift-subtract algorithm that does not make use of the efficient divide instructions in the HC12 instruction set. The following function executes in 314 cycles or less. Random tests have run successfully since May 2010. Note that the arguments of uldiv must point to separate memory locations (i.e., uldiv(&a, &b, &a) will cause an error).

Typical usage:

int retval;
unsigned long numer, denom, quot;
numer = 0xAAAAAAAA;
denom = 0xA0AAB;
retval = uldiv(&numer, &denom, &quot); // quot = 0x10FE
; Stack frame of _uldiv: offsets from SP once the prologue has pushed its copies and reserved the work area
rem = 0 ; 32-bit scratch area, used for trial products and cleared before each use
den = rem + 4 ; denominator copy (32-bit, hiword first)
num = den + 4 ; numerator copy (32-bit, hiword first); later rewritten with partial remainders
np = num + 4 ; numerator pointer (pushed from accD on entry)
ra = np + 2 ; return address
dp = ra + 2 ; denominator pointer (caller's stack argument)
qp = dp + 2 ; quotient pointer (caller's stack argument)
 
_uldiv::
; int uldiv(unsigned long *numer, unsigned long *denom, unsigned long *quot)
;
; Unsigned 32-bit / 32-bit divide built on the HC12 IDIV/EDIV/EMUL/MUL
; instructions. Only the quotient is produced; no remainder is returned.
;
; ICC12V7 parameter locations upon entry to this subroutine:
; *quot -> 4,sp
; *denom -> 2,sp
; RA -> 0,sp
; *numer -> accD
;
; Result (accD) and *quot:
;   denom == 0     -> returns 0, *quot = 0xFFFFFFFF
;   numer <  denom -> returns 0, *quot = 0
;   numer == denom -> returns 1, *quot = 1
;   otherwise      -> returns 1, *quot = numer / denom
;
; Both operands are copied to the stack before *quot is first written and
; are never re-read through the caller's pointers afterwards, so quot may
; point at the same location as numer or denom.
;
; X and Y are overwritten. Every exit drops the 14 bytes of locals
; (rem, den, num, np) with "leas 14,s" so that SP points at the return address.

  pshd ; push pointer to numer onto stack
  ldx 0,s ; load address of numer
  movw 2,x,2,-s ; push loword of numer onto stack
  movw 0,x,2,-s ; push hiword of numer onto stack
  ldx 8,s ; load address of denom (frame not complete yet, so literal offset)
  movw 2,x,2,-s ; push loword of denom onto stack
  movw 0,x,2,-s ; push hiword of denom onto stack
  leas -4,s ; reserve stack space for rem scratch area
  ldx qp,s ; load address of quotient in x

  ; test for div by zero: fold all four denom bytes into accD
  ldd den,s ; load hiword of denom
  ora den+2,s ; OR with loword of denom
  orb den+3,s
  cpd #0 ; orb only set Z for accB, so test the full 16 bits
  bne uldiv_0 ; branch if denom != 0

  ; denom = 0, quot = 0xFFFFFFFF
  ldd #65535
  std 0,x ; write hiword of quotient
  std 2,x ; write loword of quotient
  ldd #0 ; return false
  leas 14,s ; restore stack pointer
  rts

uldiv_0:
  ; clear quotient var and scratch area
  clra
  clrb
  std 0,x ; clear hiword of quotient
  std 2,x ; clear loword of quotient
  std rem,s ; clear hiword of rem
  std rem+2,s ; clear loword of rem

  ; compare numer and denom (32-bit subtract, only the borrow is used)
  ldd num+2,s ; load loword of numer
  subd den+2,s ; subtract loword of denom
  beq uldiv_2 ; lowords equal - decide on the hiwords alone
  ldd num,s ; load hiword of numer (ldd leaves the carry untouched)
uldiv_1:
  sbcb den+1,s ; subtract hiword of denom (with carry)
  sbca den,s
  bcc uldiv_3 ; no borrow: numer > denom
  ldd #0 ; denom > numer - return false, q = 0
  leas 14,s ; restore stack pointer
  rts

uldiv_2:
  ldd num,s ; load hiword of numer
  cpd den,s ; compare to hiword of denom
  bne uldiv_1 ; hiwords differ; carry from cpd already reflects the order
  ldd #1 ; numer = denom - return true, q = 1
  std 2,x ; write loword of quotient
  leas 14,s ; restore stack pointer
  rts

uldiv_3: ; numer > denom
  ldaa den,s ; test hibyte of denom hiword
  beq uldiv_4 ; branch if denom fits in 24 bits (a = 0)

  ; 32-bit denominator
  ; ABCD/abcd where a != 0
  ; Estimate q = AB / ab, then verify against the full denominator.
  exg x,y ; preserve address of quotient in Y
  ldd num,s ; load numer hiword
  ldx den,s ; load denom hiword
  idiv   ; accD/X => X; rem => accD
  exg y,x ; X -> Y for emul op - restore quotient address in X
  sty 2,x ; write loword of quotient
  std num,s ; overwrite numer hiword with the idiv remainder

  ; Check that the updated numer (rem:CD) covers quotient * denom loword
  ldd den+2,s ; load denom loword
  emul   ; accD*Y => Y:accD
  std rem+2,s ; store loword of product in rem loword
  sty rem,s ; store hiword of product in rem hiword
  ldd num+2,s ; load numer loword
  subd rem+2,s ; subt 2+rem
  ldd num,s ; load numer hiword
  sbcb rem+1,s ; subt 1+rem (with carry)
  sbca rem,s ; subt rem (with carry)
  bcc x_div_32 ; no borrow: estimate is exact

  ; decrement quotient if product greater than numer
  ldy 2,x ; load loword of quotient
  dey
  sty 2,x ; write loword of quotient
x_div_32:
  ldd #1 ; return true
  leas 14,s ; restore stack pointer
  rts

uldiv_4:
  ldd den+1,s ; bc -> accD
  tsta   ; test if b byte of denom is zero
  bne uldiv_24 ; 24-bit denominator

  ; 16-bit denominator
  ; ABCD/abcd where ab=0
  ; Two-step long division: AB / cd, then (rem:CD) / cd.
  tfr x,y ; load y with address of quotient
  ldd num,s ; load hiword of numer
  ldx den+2,s ; load loword of denom
  idiv   ; accD/X => X; rem => accD
  stx 0,y ; write hiword of quotient
  exg d,y ; transfer remainder to Y
  ldd num+2,s ; load loword of numer
  ldx den+2,s ; load loword of denom
  ediv   ; Y:accD / X => Y ; rem =>accD
  ldx qp,s ; load address of quotient
  sty 2,x ; write loword of quotient
  ldd #1 ; return true
  leas 14,s ; restore stack pointer
  rts

uldiv_24:
  ; 24-bit denominator
  ; ABCD/abcd where a=0
  ; The quotient is built one byte at a time: byte 2 from AB / bc,
  ; byte 3 from the partial remainder / bc; each estimate is checked
  ; against the d byte and lowered by one when it is too large.
  tfr d,x ; den+1,s in accD from uldiv_4 test
  ldd num,s ; load hiword of numer
  idiv ; accD/X => X; rem => accD
  std num,s ; overwrite numer hiword with remainder (provisional numerator)
  exg d,x ; quotient result of idiv -> accD
  ldx qp,s ; load address of quotient
  stab 2,x ; store 8-bit quotient of IDIV to hibyte of quotient loword
  tstb
  beq no_ovf_1 ; if zero quotient no need to test (d * quotient) overflow

  ; Check for (d * quotient) overflow
  ldaa den+3,s
  mul ; accA*accB => accD
  std rem+1,s ; store to 1+rem
  ldd num+2,s ; load loword of numer
  subd rem+2,s ; subtract loword of rem
  ldd num,s ; load hiword of numer
  sbcb rem+1,s ; subtract hiword of rem (with carry)
  sbca rem,s
  bpl no_ovf_0 ; (d * quotient) not larger than provisional remainder

  ; Overflow - decrement quotient to correct value
  ldab 2,x ; load hibyte of quotient loword
  decb
  stab 2,x ; update hibyte of quotient loword

  ; Fix remainder temp: one less in the quotient byte hands one "bc" back,
  ; so the corrected value is r + bc (= AB - (q-1)*bc). Computed from the
  ; stack copies only, so the caller's numerator is not re-read after the
  ; quotient has been written. The sum cannot carry: it is at most AB.
  ldd num,s ; remainder stored after the idiv
  addd den+1,s ; add 1+denom (bc)
  std num,s ; Corrected value in temp numer hiword

  ; normal remainder subtraction of (d * quotient)
no_ovf_0:
  ldx qp,s ; load address of quot
no_ovf_1:
  ; Zero rem scratch area
  clra
  clrb
  std rem,s
  std rem+2,s

  ; Subtract with carry (d * quotient)
  ldab 2,x ; load hibyte of quotient loword
  ldaa den+3,s ; load denom lsbyte
  mul ; accA*accB => accD
  std rem+1,s ; store to 1+rem
  ldd num+1,s ; load 1+numer
  subd rem+1,s
  std num+1,s ; overwrite 1+numer
  ldab num,s ; load numer msbyte
  sbcb #0 ; subt carry bit
  stab num,s

  clra ; accD = accA:accB = 0:(msb of numer)
  exg d,y ; accD => Y
  ldd num+1,s ; load 1+numer
  ldx den+1,s ; load 1+denom
  ediv ; Y:accD / X => Y ; rem =>accD (rem not to be used!)

  ; Test quotient to be max 8-bit
  ldx qp,s ; load address of quot
  cpy #255
  bcs no_ovf2 ; Y < 255
  beq no_ovf2 ; Y = 255
  ; store 0xFF to lobyte of quotient loword & quit
  ldab #255
  stab 3,x
  bra x_uldiv

no_ovf2:
  ; Zero rem scratch area
  clra
  clrb
  std rem,s
  std rem+2,s

  ; EDIV remainder unusable - EMUL required to rebuild quotient * denom
  exg d,y ; ediv quotient -> accD (accA = 0)
  stab 3,x ; store 8-bit quotient of EDIV to lobyte of quotient loword
  ldy den+1,s ; load 1+denom
  ; Build quotient * bc, placed one byte up in rem
  emul ; accD*Y => Y:accD
  std rem+1,s ; store to 1+rem
  exg d,y ; hiword of emul product in accD
  stab rem,s ; store to msbyte of rem

  ; Add quotient * d to complete quotient * bcd
  ldaa den+3,s ; load lsbyte of denom
  ldab 3,x ; load lsbyte of quotient
  mul ; accA*accB => accD
  addd rem+2,s ; add loword of rem
  std rem+2,s ; update loword of rem
  ldd rem,s ; load hiword of rem
  adcb #0 ; add carry bit
  adca #0
  std rem,s ; update hiword of rem

  ldd num+2,s ; load loword of temp numer
  subd rem+2,s ; subtract loword of rem
  ldd num,s ; load hiword of temp numer
  sbcb rem+1,s ; subtract hiword of rem (with carry)
  sbca rem,s
  bpl x_uldiv ; branch if positive

  ; decrement quotient if product larger than numer
  ldab 3,x ; load lsbyte of quotient
  decb
  stab 3,x ; update lsbyte of quotient

x_uldiv:
  ldd #1 ; return true
  leas 14,s ; restore stack pointer
  rts