;;; Assert that all odd divisors less than 32769 have an optimized
;;; remainder coefficient using 32-bit operations.
;;; Symbol hashsets with fewer than that many cells can use fast remainder.
;;; (I could/should implement the 64-bit algorithm I suppose)
(with-test (:name :optimized-remainder-exists)
  (loop for divisor from 3 below 32769 by 2
        do
     (multiple-value-bind (mask c) (sb-impl::optimized-symtbl-remainder-params divisor)
       (assert (and (plusp c) (plusp mask))))))

#+interpreter (invoke-restart 'run-tests::skip-file)

(defvar *rs* (make-random-state t)) ; get a random random-state

#+x86-64
(with-test (:name :udiv-magic)
  (dotimes (i 1000)
    (let ((divisor (+ 5 (random #xfffffff *rs*))))
      (multiple-value-bind (m a s) (sb-c:compute-udiv32-magic divisor)
        (dotimes (i 10000)
          (let* ((dividend (random (ash 1 32) *rs*))
                 (quotient (sb-vm::udiv32-via-multiply dividend m a s))
                 (expected (truncate dividend divisor)))
            (assert (= quotient expected))))))))

#+x86-64
(with-test (:name :urem-magic)
  (dotimes (i 1000)
    (let ((divisor (+ 5 (random #xfffffff *rs*))))
      (multiple-value-bind (m a s) (sb-c:compute-udiv32-magic divisor)
        (dotimes (i 10000)
          (let* ((dividend (random (ash 1 32) *rs*))
                 (remainder (sb-vm::urem32-via-multiply dividend divisor m a s))
                 (expected (rem dividend divisor)))
            (assert (= remainder expected))))))))

(with-test (:name :urem32-ultrafast)
  (dotimes (i 1000)
    ;; The 32-bit algorithm can only handle random 16-bit inputs (or some inputs
    ;; with more than 16 bits, but it's random which inputs are OK)
    ;; A 64-bit variant would handle all 32-bit inputs, but I don't need it.
    (let* ((divisor (+ 5 (random #xffff *rs*)))
           (c (sb-c:compute-fastrem-coefficient divisor 16 32)))
      (dotimes (i 20000)
        (let* ((dividend (random (ash 1 16) *rs*))
               (remainder (sb-vm::fastrem-32 dividend c divisor))
               (expected (rem dividend divisor)))
          (assert (= remainder expected)))))))

#+64-bit
(with-test (:name :urem64-ultrafast)
  (dotimes (i 1000)
    (let* ((divisor (+ 5 (random #xfffffff *rs*)))
           (c (sb-c:compute-fastrem-coefficient divisor 32 64)))
      (dotimes (i 20000)
        (let* ((dividend (random (ash 1 32) *rs*))
               (remainder (sb-vm::fastrem-64 dividend c divisor))
               (expected (rem dividend divisor)))
          (assert (= remainder expected)))))))

;;;; The assembly code for the benchmarks contains no full calls.
;;;; Therefore we're measuring the speed of the vops as accurately as possible.

;;; These loops show the advantage of division by using multiplication
;;; even when the CPU directly implements division.
;;; BASIC-WAY takes about 4 to 6 times as many CPU cycles as TRICKY-WAY.
;;; But these loops are too time-consuming for a regression test.
(defun div-basic-way (&aux (result 0))
  (declare (fixnum result))
  (loop for divisor from 3 to 1000
        do (dotimes (dividend #xfffff)
             (let ((ans (truncate dividend divisor)))
               (setq result (logxor result ans)))))
  result)

#+x86-64
(defun div-tricky-way (&aux (result 0))
  (declare (fixnum result))
  (loop for divisor from 3 to 1000
        do (multiple-value-bind (m a s) (sb-c:compute-udiv32-magic divisor)
             (dotimes (dividend #xfffff)
               (let ((ans (sb-vm::udiv32-via-multiply dividend m a s)))
                 (setq result (logxor result ans))))))
  result)

;;; * (time (rem-basic-way))
;;; Evaluation took:
;;;   30.123 seconds of real time
;;;   30.112920 seconds of total run time (30.112919 user, 0.000001 system)
;;;   99.97% CPU
;;;   84,142,880,093 processor cycles
(defun rem-basic-way (&aux (result 0)) ; about 32 CPU cycles per operation
  (declare (fixnum result))
  (loop for divisor from 3 to 10000
        do (dotimes (dividend #x3ffff)
             (let ((ans (rem dividend divisor)))
               (setq result (logxor result ans)))))
  result)

;;; * (time (rem-tricky-way))
;;; Evaluation took:
;;;   9.999 seconds of real time
;;;   9.998409 seconds of total run time (9.994458 user, 0.003951 system)
;;;   99.99% CPU
;;;   27,938,111,662 processor cycles
#+x86-64
(defun rem-tricky-way (&aux (result 0)) ; about 11 CPU cycles per operation
  (declare (fixnum result))
  (loop for divisor from 3 to 10000
        do (multiple-value-bind (m a s) (sb-c:compute-udiv32-magic divisor)
             (dotimes (dividend #x3ffff)
               (let ((ans (sb-vm::urem32-via-multiply dividend divisor m a s)))
                 (setq result (logxor result ans))))))
  result)

;;; * (time (ultrafastrem-way))
;;; Evaluation took:
;;;   6.175 seconds of real time
;;;   6.171059 seconds of total run time (6.171059 user, 0.000000 system)
;;;   99.94% CPU
;;;   17,243,611,816 processor cycles
(defun ultrafastrem-way (&aux (result 0)) ; about 7 CPU cycles per operation
  (declare (fixnum result))
  (loop for divisor from 3 to 10000
        do (let ((c (sb-c:compute-fastrem-coefficient divisor 18 32)))
             (dotimes (dividend #x3ffff)
               (let ((ans (sb-vm::fastrem-32 dividend c divisor)))
                 (setq result (logxor result ans))))))
  result)

;;; This is the generalized code for the ultrafast way, which isn't actually
;;; ultrafast, but it shows the correctness of the algorithm at various precisions.
(defun try-fastrem (divisor nbits)  ;; nbits = number of bits in numerator
  (macrolet ((try (fraction-bits)
               `(multiple-value-bind (fraction-bits c)
                    (sb-c:compute-fastrem-coefficient divisor nbits ,fraction-bits)
                  ;;(format t "~&F=~D~%" fraction-bits)
                  (dotimes (numerator (ash 1 nbits)) ; exhaustively test all possible numerators
                    (let* ((frac (ldb (byte fraction-bits 0) (* c numerator)))
                           (answer (ash (* frac divisor) (- fraction-bits)))
                           (expect  (mod numerator divisor)))
                      (unless (= answer expect)
                        (format t "~12,'0b ~12,'0b  ~3d ~3d~%"
                                numerator frac answer (mod numerator divisor))))))))
    (try :variable)
    ;(try 16)
    (try 32)))

(defun try-fastrem-bigly ()
  (loop for divisor from 3 to 10000
        do (format t "divisor=~d~%" divisor)
            (try-fastrem divisor 18)))