/*---------------------------------------------------------------------------*\

    Project   : ARM Divide Emulation
    Author    : Henry Thomas <http://henri.net>
    Creation  : 12/7/2005
    Copyright 2005 Henry Thomas

\*---------------------------------------------------------------------------*/

#include "arm_version.h"

/* div() below is written in ARM inline assembly, so refuse to build for any
 * other target. */
#ifndef __arm__
#error "This implementation of div requires an arm cpu target"
#endif

/*
 * EARLY_OUT: when defined, div() measures how far the numerator's leading
 * bit is from the denominator's and jumps into the middle of its unrolled
 * 32-step divide loop, skipping the iterations that cannot produce quotient
 * bits. When not defined, all 32 iterations run.
 *
 * The macro is tested with #ifdef, so its value is irrelevant: defining it
 * as 0 does NOT disable the feature; remove or #undef it instead.
 */
#define EARLY_OUT 1

/*
 * USE_CLZ: defined when ARM_VERSION_ is greater than 4; selects the variant
 * of the EARLY_OUT code in div() that uses the `clz` instruction instead of
 * the compare-and-shift sequence.
 * ARM_VERSION_ is not defined in this file - presumably it comes from
 * arm_version.h (confirm there).
 */
#if (ARM_VERSION_ > 4)
#   define USE_CLZ 1
#endif

/*
 * Signed 32-bit integer division implemented in ARM inline assembly.
 *
 * Parameters:
 *   numerator   - dividend
 *   denominator - divisor
 *
 * Returns numerator / denominator, truncated toward zero (the magnitudes are
 * divided unsigned and the sign is applied afterwards).
 * A zero denominator does not trap: the .div0 path produces 0.
 * INT32_MIN / -1 is not representable; the negation wraps to INT32_MIN.
 *
 * Layout of the routine:
 *   1. special cases: den == 0, |den| == 1, |num| <= |den|, |den| power of 2
 *   2. (EARLY_OUT) align num with den and jump into the unrolled loop
 *   3. unrolled 32-step shift/subtract loop, then sign fix-up
 *
 * Register aliases set up with .req:
 *   num  - numerator register, also receives the quotient ("0" constraint)
 *   den  - denominator register (modified, hence declared "+r")
 *   mod  - r2,  running remainder / scratch
 *   cnt  - r3,  shift or skip count / scratch
 *   sign - r12, num ^ den; bit 31 is the sign of the quotient
 *
 * NLT and ARM_RETURN are macros that are not defined in this file
 * (presumably in arm_version.h): NLT appears to be the instruction
 * separator and ARM_RETURN a function-return sequence - confirm there.
 * NOTE(review): the early-exit paths leave through ARM_RETURN inside the asm
 * rather than through the C `return`; that appears to rely on the compiler
 * keeping num in the return register and emitting no epilogue - confirm.
 */
int32_t div(register int32_t numerator, register int32_t denominator)
{
    register int32_t quotient;

    asm("num     .req %[numerator]      @ map register equates" NLT
        "den     .req %[denominator]" NLT
        "mod     .req r2" NLT
        "cnt     .req r3" NLT
        "sign    .req r12" NLT

        /* Division by zero is not signalled; .div0 simply yields 0. */
        "cmp den, #0                    @ exceptioin if den == zero" NLT
        "beq .div0" NLT

        /* eor leaves the flags from the cmp above intact, so the rsbmi
         * still tests the sign of den. */
        "eor sign, num, den             @ sign = num ^ den" NLT
        "rsbmi den, den, #0             @ den = -den if den < 0" NLT

        /* mod = |den| - 1 is reused below for the power-of-two test. */
        "subs mod, den, #1              @ mod = den - 1" NLT
        "beq .div1                      @ return if den == 1" NLT

        "movs cnt, num                  @ num = -num if num < 0" NLT
        "rsbmi num, num, #0" NLT

        /* Unsigned compare: both operands are magnitudes at this point. */
        "cmp num, den                   @ return if num <= den" NLT
        "bls .numLeDen" NLT

        "tst den, mod                   @ if(den & (den - 1) == 0)" NLT
        "beq .powerOf2                  @ den is power of 2" NLT

#ifdef EARLY_OUT
#ifdef USE_CLZ

        /* mod = clz(den) - clz(num): how many bits longer num is than den.
         * cnt = 31 - mod is the number of loop iterations that can be
         * skipped; afterwards mod:num holds num << cnt as a 64-bit pair. */
        "clz mod, den" NLT
        "clz cnt, num" NLT
        "sub mod, mod, cnt" NLT
        "rsbs cnt, mod, #31" NLT
        "rsb mod, cnt, #32" NLT
        "mov mod, num, lsr mod" NLT
        "mov num, num, lsl cnt" NLT

#else /* For reasons I can't explain, this code is faster */

        /* Coarse search in steps of 16, 8 and 4 bits. Throughout,
         * mod == num >> (32 - cnt), so mod:num == num << cnt at the end. */
        "mov cnt, #28                   @ count difference in leading zeros" NLT
        "mov mod, num, lsr #4           @ between num and den" NLT

        "cmp den, mod, lsr #12" NLT
        "subls cnt, cnt, #16" NLT
        "movls mod, mod, lsr #16" NLT

        "cmp den, mod, lsr #4" NLT
        "subls cnt, cnt, #8" NLT
        "movls mod, mod, lsr #8" NLT

        "cmp den, mod" NLT
        "subls cnt, cnt, #4" NLT
        "movls mod, mod, lsr #4" NLT

        "mov num, num, lsl cnt          @ mod:num = num << cnt " NLT

#endif /* USE_CLZ */

        /* Each unrolled iteration is 3 instructions of 4 bytes, so the jump
         * offset is cnt * 3 * 4. Reading pc yields the address two
         * instructions ahead, which is why a single nop pads the gap. */
        "rsb den, den, #0               @ negate den for divide loop" NLT
        "adds num, num, num             @ start: num = mod:num / den" NLT
        "add cnt, cnt, cnt, lsl #1      @ cnt *= 3 " NLT
        "add pc, pc, cnt, lsl #2        @ skip cnt iterations" NLT
        "nop                            @ nop instruction to take care of pipelining" NLT

#else

        "mov mod, #0" NLT
        "rsb den, den, #0               @ negate den for divide loop" NLT
        "adds num, num, num             @ start: num = mod:num / den" NLT

#endif /* EARLY_OUT */

        /* One quotient bit per iteration. den holds -|den| here:
         *   adcs : mod = (mod << 1) + carry-in + den  (shift in a bit, trial subtract)
         *   subcc: undo the subtraction when it did not fit
         *   adcs : shift the resulting carry into num as the next quotient bit */
        ".rept 32                       @ inner loop x 32" NLT
        "    adcs mod, den, mod, lsl #1" NLT
        "    subcc mod, mod, den" NLT
        "    adcs num, num, num" NLT
        ".endr" NLT

        "cmp sign, #0                   @ negate quotient if sign < 0" NLT
        "rsblt num, num, #0" NLT
        ARM_RETURN

    "\n.div0:" NLT
        "mov num, #0" NLT
        ARM_RETURN

    /* |den| == 1. num has NOT been made absolute yet on this path, so the
     * quotient is num itself, negated only when the original denominator was
     * negative. Testing `sign` here would be wrong (it gave div(-5, 1) == 5);
     * sign ^ num reconstructs the original den, whose sign bit is needed. */
    "\n.div1:" NLT
        "teq sign, num                  @ N = sign bit of original den" NLT
        "rsbmi num, num, #0             @ num = -num if den was negative" NLT
        ARM_RETURN

    /* Flags still come from `cmp num, den`: EQ means |num| == |den|. */
    "\n.numLeDen:" NLT
        "mov num, #0                    @ quotient = 0 if num < den" NLT
        "moveq num, sign, asr #31       @ negate quotient if sign < 0" NLT
        "orreq num, num, #1             @ quotient = 1 if num == den" NLT
        ARM_RETURN


    /* |den| is a power of two (and >= 2): divide by shifting |num| right by
     * log2(|den|), then apply the sign. */
    "\n.powerOf2:" NLT

#if (ARM_VERSION_ > 4)

        "clz cnt, den                   @ count bits in den" NLT
        "rsb cnt, cnt, #31" NLT

#else

        /* Binary search for the set bit: 16, 8, 4, then the last nibble,
         * where den is 1, 2, 4 (add den >> 1) or 8 (add 3). */
        "mov cnt, #0                    @ count bits in den" NLT

        "cmp den, #(1 << 16)" NLT
        "movhs cnt, #16" NLT
        "movhs den, den, lsr #16" NLT

        "cmp den, #(1 << 8)" NLT
        "addhs cnt, cnt, #8" NLT
        "movhs den, den, lsr #8" NLT

        "cmp den, #(1 << 4)" NLT
        "addhs cnt, cnt, #4" NLT
        "movhs den, den, lsr #4" NLT

        "cmp den, #(1 << 2)" NLT
        "addhi cnt, cnt, #3" NLT
        "addls cnt, cnt, den, lsr #1" NLT

#endif /* ARM_VERSION_ > 4 */

        "mov num, num, lsr cnt          @ num >>= cnt" NLT
        "cmp sign, #0" NLT
        "rsbmi num, num, #0             @ negate quotient if sign < 0"

        /* output registers: den is overwritten by the asm, so it has to be a
         * read-write operand rather than an input-only one */
        : [quotient] "=r" (quotient), [denominator] "+r" (denominator)
        /* input registers: numerator shares the register of operand 0 */
        : [numerator] "0" (numerator)
        /* clobbered registers, plus the condition flags */
        : "r2" /* mod */, "r3" /* cnt */, "r12" /* sign */, "cc" /* flags */);
    return quotient;
}
