Unhandled SIMPLE-ERROR: The assertion (< (LENGTH HISTOGRAM) 10) failed with
(LENGTH HISTOGRAM) = 10.
On Fri, Nov 4, 2022 at 8:41 PM Douglas Katzman via Sbcl-commits
<
sbcl-c...@lists.sourceforge.net> wrote:
>
> The branch "master" has been updated in SBCL:
> via 0492f5bc2c807448d9d6cbd5d22af37bb9fcfae7 (commit)
> from cf5c2d6d9647c99ee4b8569227122a8c5b671220 (commit)
>
> - Log -----------------------------------------------------------------
> commit 0492f5bc2c807448d9d6cbd5d22af37bb9fcfae7
> Author: Douglas Katzman <
do...@google.com>
> Date: Fri Nov 4 13:27:19 2022 -0400
>
> Implement a general hashset, use it for pathname cache
>
> This change resulted in a huge time saving in INTERN-PATHNAME. Tested on
> the same operations, the former hash algorithm required as many as 120 probes
> worst case, where this hashset needed at most 8 probes, with the mean
> being between 2 and 3 probes.
>
> Also, it's threadsafe for reading (but not truly concurrent- just not
> unsafe), so it does not use a mutex for lookups except if not found,
> in which case it retries with a lock and possibly insertion.
> The memory use is typically under half that of a hash-table having the
> same keys, and it also measurably outperforms a weak hash-table.
> ---
> float-math.lisp-expr | 6 +
> src/code/cold-init.lisp | 7 +-
> src/code/gc.lisp | 2 -
> src/code/hashset.lisp | 406 +++++++++++++++++++++++++++++++++++++++++
> src/code/target-pathname.lisp | 229 ++++++-----------------
> src/cold/build-order.lisp-expr | 1 +
> src/pcl/defs.lisp | 2 +
> tests/hashset.pure.lisp | 146 +++++++++++++++
> tests/pathnames.impure.lisp | 96 +---------
> tests/run-tests.lisp | 5 +-
> 10 files changed, 628 insertions(+), 272 deletions(-)
>
> diff --git a/float-math.lisp-expr b/float-math.lisp-expr
> index 54176dba4..0cdd4d7de 100644
> --- a/float-math.lisp-expr
> +++ b/float-math.lisp-expr
> @@ -281,11 +281,13 @@
> (/ (#x0 #.(MAKE-SINGLE-FLOAT #x3089705F)) #.(MAKE-SINGLE-FLOAT #x0))
> (/ (#x0 #.(MAKE-SINGLE-FLOAT #x3F800000)) #.(MAKE-SINGLE-FLOAT #x0))
> (/ (#x0 #.(MAKE-SINGLE-FLOAT #x49742400)) #.(MAKE-SINGLE-FLOAT #x0))
> +(/ (#x0 #.(MAKE-SINGLE-FLOAT #x5E800000)) #.(MAKE-SINGLE-FLOAT #x0))
> (/ (#x0 #.(MAKE-DOUBLE-FLOAT #x3FE62E42 #xFEFA39EF)) #.(MAKE-DOUBLE-FLOAT #x0 #x0))
> (/ (#x0 #.(MAKE-DOUBLE-FLOAT #x3FF00000 #x0)) #.(MAKE-DOUBLE-FLOAT #x0 #x0))
> (/ (#x0 #.(MAKE-DOUBLE-FLOAT #x3FF71547 #x652B82FE)) #.(MAKE-DOUBLE-FLOAT #x0 #x0))
> (/ (#x0 #.(MAKE-DOUBLE-FLOAT #x40000000 #x0)) #.(MAKE-DOUBLE-FLOAT #x0 #x0))
> (/ (#x0 #.(MAKE-DOUBLE-FLOAT #x40240000 #x0)) #.(MAKE-DOUBLE-FLOAT #x0 #x0))
> +(/ (#x0 #.(MAKE-DOUBLE-FLOAT #x43D00000 #x0)) #.(MAKE-DOUBLE-FLOAT #x0 #x0))
> (/ (#x1 #.(MAKE-SINGLE-FLOAT #x3089705F)) #.(MAKE-SINGLE-FLOAT #x4E6E6B28))
> (/ (#xF423F #.(MAKE-SINGLE-FLOAT #x49742400)) #.(MAKE-SINGLE-FLOAT #x3F7FFFEF))
> (/ (#xFFFFFFFF #.(MAKE-SINGLE-FLOAT #x3F000000)) #.(MAKE-SINGLE-FLOAT #x50000000))
> @@ -540,6 +542,7 @@
> (< (#.(MAKE-SINGLE-FLOAT #x7F7FFFFF) #xFFFFFFFFFFFFFFC) NIL)
> (< (#.(MAKE-SINGLE-FLOAT #x7F7FFFFF) #xFFFFFFFFFFFFFFF) NIL)
> (< (#.(MAKE-SINGLE-FLOAT #x7F7FFFFF) #x3FFFFFC000000000) NIL)
> +(< (#.(MAKE-SINGLE-FLOAT #x7F7FFFFF) #x3FFFFFFFFFFFFFF9) NIL)
> (< (#.(MAKE-SINGLE-FLOAT #x7F7FFFFF) #x3FFFFFFFFFFFFFFB) NIL)
> (< (#.(MAKE-SINGLE-FLOAT #x7F7FFFFF) #x3FFFFFFFFFFFFFFC) NIL)
> (< (#.(MAKE-SINGLE-FLOAT #x7F7FFFFF) #x3FFFFFFFFFFFFFFF) NIL)
> @@ -741,6 +744,7 @@
> (< (#.(MAKE-DOUBLE-FLOAT #x7FEFFFFF #xFFFFFFFF) #xFFFFFFFFFFFFFFF) NIL)
> (< (#.(MAKE-DOUBLE-FLOAT #x7FEFFFFF #xFFFFFFFF) #x3FFFFFFFFFFFFE00) NIL)
> (< (#.(MAKE-DOUBLE-FLOAT #x7FEFFFFF #xFFFFFFFF) #x3FFFFFFFFFFFFFC0) NIL)
> +(< (#.(MAKE-DOUBLE-FLOAT #x7FEFFFFF #xFFFFFFFF) #x3FFFFFFFFFFFFFF9) NIL)
> (< (#.(MAKE-DOUBLE-FLOAT #x7FEFFFFF #xFFFFFFFF) #x3FFFFFFFFFFFFFFF) NIL)
> (< (#.(MAKE-DOUBLE-FLOAT #x7FEFFFFF #xFFFFFFFF) #x7FFFFFFFFFFFFFFF) NIL)
> (< (#.(MAKE-DOUBLE-FLOAT #x7FEFFFFF #xFFFFFFFF) #xFFFFFFFFFFFFFFFF) NIL)
> @@ -3160,6 +3164,8 @@
> (COERCE (#x3FFFFFC000000000 SINGLE-FLOAT) #.(MAKE-SINGLE-FLOAT #x5E7FFFFF))
> (COERCE (#x3FFFFFFFFFFFFE00 DOUBLE-FLOAT) #.(MAKE-DOUBLE-FLOAT #x43CFFFFF #xFFFFFFFF))
> (COERCE (#x3FFFFFFFFFFFFFC0 DOUBLE-FLOAT) #.(MAKE-DOUBLE-FLOAT #x43D00000 #x0))
> +(COERCE (#x3FFFFFFFFFFFFFF9 DOUBLE-FLOAT) #.(MAKE-DOUBLE-FLOAT #x43D00000 #x0))
> +(COERCE (#x3FFFFFFFFFFFFFF9 SINGLE-FLOAT) #.(MAKE-SINGLE-FLOAT #x5E800000))
> (COERCE (#x3FFFFFFFFFFFFFFB SINGLE-FLOAT) #.(MAKE-SINGLE-FLOAT #x5E800000))
> (COERCE (#x3FFFFFFFFFFFFFFC SINGLE-FLOAT) #.(MAKE-SINGLE-FLOAT #x5E800000))
> (COERCE (#x3FFFFFFFFFFFFFFF DOUBLE-FLOAT) #.(MAKE-DOUBLE-FLOAT #x43D00000 #x0))
> diff --git a/src/code/cold-init.lisp b/src/code/cold-init.lisp
> index 8348a50ab..05dd10dea 100644
> --- a/src/code/cold-init.lisp
> +++ b/src/code/cold-init.lisp
> @@ -169,6 +169,7 @@
> (/show0 "about to SHOW-AND-CALL !GLOBALDB-COLD-INIT")
> (show-and-call !globaldb-cold-init)
> (show-and-call !function-names-init)
> + (show-and-call !pathname-cold-init)
>
> ;; And now *CURRENT-THREAD*
> (sb-thread::init-main-thread)
> @@ -281,7 +282,7 @@
> (makunbound 'sb-c::*queued-proclaims*)
>
> (show-and-call os-cold-init-or-reinit)
> - (show-and-call !pathname-cold-init)
> + (show-and-call !lpn-cold-init)
>
> (show-and-call stream-cold-init-or-reset)
> (/show "Enabled buffered streams")
> @@ -403,12 +404,12 @@ process to continue normally."
> (when total ; newly started process, and not a failed save attempt
> (sb-thread::init-main-thread)
> #+x86-64 (sb-vm::validate-asm-routine-vector)
> - (rebuild-package-vector)
> - (rebuild-pathname-cache))
> + (rebuild-package-vector))
> ;; Initializing the standard streams calls ALLOC-BUFFER which calls FINALIZE
> (finalizers-reinit)
> ;; Initialize streams next, so that any errors can be printed
> (stream-reinit t)
> + (rebuild-pathname-cache)
> (os-cold-init-or-reinit)
> #-(and win32 (not sb-thread))
> (signal-cold-init-or-reinit)
> diff --git a/src/code/gc.lisp b/src/code/gc.lisp
> index f6b368cb5..ab5555d4e 100644
> --- a/src/code/gc.lisp
> +++ b/src/code/gc.lisp
> @@ -219,8 +219,6 @@ run in any thread.")
> (defun post-gc ()
> (sb-impl::finalizer-thread-notify)
> (alien-funcall (extern-alien "empty_thread_recyclebin" (function void)))
> - ;; This is probably the same as detecting a change in *GC-EPOCH*. Maybe remove?
> - (setq sb-impl::*pn-cache-force-recount* t)
> ;; Post-GC actions are invoked synchronously by the GCing thread,
> ;; which is an arbitrary one. If those actions aquire any locks, or are sensitive
> ;; to the state of *ALLOW-WITH-INTERRUPTS*, any deadlocks of what-have-you
> diff --git a/src/code/hashset.lisp b/src/code/hashset.lisp
> new file mode 100644
> index 000000000..e0ec1fa7e
> --- /dev/null
> +++ b/src/code/hashset.lisp
> @@ -0,0 +1,406 @@
> +;;;; Robinhood-hashing weak hashset
> +;;;; based on
https://cs.uwaterloo.ca/research/tr/1986/CS-86-14.pdf
> +
> +;;;; This software is part of the SBCL system. See the README file for
> +;;;; more information.
> +;;;;
> +;;;; This software is derived from the CMU CL system, which was
> +;;;; written at Carnegie Mellon University and released into the
> +;;;; public domain. The software is in the public domain and is
> +;;;; provided with absolutely no warranty. See the COPYING and CREDITS
> +;;;; files for more information.
> +
> +;;;; See also
https://dspace.mit.edu/bitstream/handle/1721.1/130693/1251799942-MIT.pdf
> +;;;; which has a concurrent implementation
> +
> +(in-package "SB-IMPL")
> +
> +(export '(make-hashset hashset-remove hashset-statistics))
> +
> +(eval-when ()
> + (pushnew :hashset-debug sb-xc:*features*))
> +
> +(define-load-time-global *hashset-print-statistics* nil)
> +
> +;;; TODO: teach GC about these hashsets so that we can do something
> +;;; with address-sensitive (EQ and EQL) hashing.
> +;;; Then we can implement XSET in terms of this hashset.
> +
> +(defstruct (robinhood-hashset-storage
> + (:constructor make-hashset-storage (cells psl-vector hash-vector hv-inexact))
> + (:conc-name hss-)
> + (:copier nil)
> + (:predicate nil))
> + (cells #() :type simple-vector :read-only t)
> + ;; We always store a hash-vector even if inexact (see below)
> + ;; so that we can avoid calling the comparator on definite mismatches.
> + (hash-vector (make-array 0 :element-type '(unsigned-byte 16))
> + :type (simple-array (unsigned-byte 16) (*))
> + :read-only t)
> + ;; Inexact hashes occur when the size of the hash vector exceeds 2^16
> + ;; in which case the stored hash has lost precision in terms of how
> + ;; many storage bins there are. So we need to call the hash function
> + ;; when moving keys around during insert, and when rehashing.
> + (hv-inexact nil :type boolean :read-only t)
> + (psl-vector (make-array 0 :element-type '(unsigned-byte 8))
> + :type (simple-array (unsigned-byte 8) (*))
> + :read-only t))
> +
> +(defstruct (robinhood-hashset (:conc-name hashset-))
> + ;; STORAGE can be swapped atomically so that readers can threadsafely read
> + ;; all the relevant vectors even if there is a writer.
> + ;; It _doesn't_ mean that FIND returns the right answer if writes occur while reading.
> + ;; It _does_ mean that the algorithm won't crash.
> + (storage (missing-arg) :type robinhood-hashset-storage)
> + (hash-function #'error :type (sfunction (t) fixnum))
> + (test-function #'error :type function)
> + (mutex nil :type (or null sb-thread:mutex)))
> +
> +;;; The last few elements in the cell vector are metadata.
> +(defconstant hs-storage-trailer-cells 3)
> +(defmacro hs-cells-capacity (v)
> + `(truly-the index (- (length ,v) ,hs-storage-trailer-cells)))
> +(defmacro hs-cells-mask (v)
> + `(truly-the index (- (length ,v) ,(1+ hs-storage-trailer-cells))))
> +(defmacro hs-cells-gc-epoch (v) `(aref ,v (- (length ,v) 3)))
> +;;; max probe sequence length for these cells
> +(defmacro hs-cells-max-psl (v) `(truly-the fixnum (aref ,v (- (length ,v) 2))))
> +(defmacro hs-cells-n-avail (v) `(truly-the fixnum (aref ,v (- (length ,v) 1))))
> +
> +(defun hashset-cells-load-factor (cells)
> + (let* ((cap (hs-cells-capacity cells))
> + (used (- cap (hs-cells-n-avail cells))))
> + (/ used cap)))
> +(defconstant +hashset-unused-cell+ 0)
> +(defmacro hs-chain-terminator-p (val) `(eq ,val 0))
> +
> +(defun allocate-hashset-storage (capacity weakp)
> + (declare (sb-c::tlab :system))
> + (let* ((vector-type (if weakp
> + (logior (ash sb-vm:vector-weak-flag sb-vm:array-flags-position)
> + sb-vm:simple-vector-widetag)
> + sb-vm:simple-vector-widetag))
> + (len (+ capacity hs-storage-trailer-cells))
> + (cells (fill (truly-the simple-vector
> + (allocate-vector #+ubsan nil vector-type len len))
> + +hashset-unused-cell+))
> + (psl-vector (make-array capacity :element-type '(unsigned-byte 8)
> + :initial-element 0))
> + (hash-vector (make-array capacity :element-type '(unsigned-byte 16))))
> + (setf (hs-cells-max-psl cells) 0)
> + (setf (hs-cells-n-avail cells) capacity)
> + ;; Capacity 65536 is the max for which stored hashes can represent all indices
> + ;; into the cell vector. Beyond that, the hashes don't have the required precision.
> + ;; I might instead want the hash-vector's type to
> + ;; (OR (SIMPLE-ARRAY (UNSIGNED-BYTE 16) (*)) (SIMPLE-ARRAY (UNSIGNED-BYTE 32) (*)))
> + ;; but I'm not going there yet.
> + (make-hashset-storage cells psl-vector hash-vector (> capacity 65536))))
> +
> +(defun hashset-weakp (hashset)
> + (let* ((storage (hashset-storage hashset))
> + (cells (hss-cells storage)))
> + (not (zerop (get-header-data cells)))))
> +
> +(defun make-hashset (log2size test-function hash-function &key weakness synchronized)
> + (declare (boolean weakness synchronized))
> + (let* ((capacity (ash 1 log2size))
> + (storage (allocate-hashset-storage capacity weakness)))
> + (make-robinhood-hashset :hash-function hash-function
> + :test-function test-function
> + :mutex (if synchronized (sb-thread:make-mutex))
> + :storage storage)))
> +
> +;;; The following terms are synonyms: "probe sequence position", "probe sequence index".
> +;;; Probe sequence length is the maximum index attained for a given probe sequence.
> +;;; (Note that probe sequence indices are 1-origin, not 0)
> +;;; The table's overall max-PSL is the maximum of any PSL for any key in the table.
> +(defmacro hs-aver-probe-sequence-len (storage index length)
> + (declare (ignorable storage index length))
> + ;; This assertion is not threadsafe, because an inserter could cause the
> + ;; value in the hash-vector to change by backshifting while we're reading.
> + ;; Readers are otherwise threadsafe. So this is only for debugging.
> + #+nil `(aver (= (aref (hss-psl-vector ,storage) ,index) ,length))
> + )
> +
> +;;; This definition is probably off-by-1 for what you think of as the triangular numbers,
> +;;; but it does what I want for the hashset probing algorithm:
> +;;; f(1)=0, f(2)=1, f(3)=3, f(4)=6, etc
> +;;; So the first probe occurs at index+0, then "skip 1", "skip 2", "skip 3", ...
> +(declaim (inline triangular-number))
> +(defun triangular-number (n)
> + (declare (fixnum n))
> + (/ (* n (1- n)) 2))
> +
> +(defun hss-backshift (hashset storage current-index desired-psp key hash)
> + ;; If we can grab the mutex, then shift this item backwards in its probe sequence.
> + ;; If mutex is already owned, or if after acquiring the mutex a change is detected,
> + ;; just skip this. If no mutex then back-shift is always OK.
> + (let* ((cells (hss-cells storage))
> + (desired-index (logand (+ hash (triangular-number desired-psp))
> + (hs-cells-mask cells)))
> + (mutex (hashset-mutex hashset)))
> + ;; this should probably be WITH-SYSTEM-MUTEX :WAIT NIL
> + ;; but it's hard to structure this exceptional case that way
> + (when (or (not mutex) (sb-thread::%try-mutex mutex))
> + (when (and (eq (hashset-storage hashset) storage)
> + (eq (aref cells current-index) key) ; key is still here
> + (null (aref cells desired-index))) ; tombstone is still there
> + #+nil
> + (format t "~&Hashset moved ~S from psp ~D to ~D~%"
> + key (aref (hss-psl-vector storage) current-index) desired-psp)
> + ;; Put this item into the tombstone's location and make this a tombstone
> + ;; and *not* +hashset-unused-cell+. Any other probing sequence for a different key
> + ;; might have previously tried to claim this cell and could not, so
> + ;; went to a later cell in its sequence. Therefore we can't interrupt that
> + ;; other (unknown) sequence of probes with a chain-terminating marker.
> + ;; Change the key's stored hash to indicate its new, shorter, probe sequence length
> + (setf (aref (hss-hash-vector storage) desired-index) (ldb (byte 16 0) hash)
> + (aref (hss-psl-vector storage) desired-index) desired-psp
> + (aref cells desired-index) key
> + (aref cells current-index) nil))
> + ;; (hashset-check-invariants hashset)
> + (when mutex (sb-thread:release-mutex mutex)))))
> +
> +;;; Algorithm from Figure 2.1 of the paper.
> +;;; TODO: there's a simple optimization to avoid calling TRIANGULAR-NUMBER
> +;;; but I want (for the time being) to be a little closer to the structure
> +;;; of the reference algorithm, though already this deviates somewhat
> +;;; substantially by:
> +;;; - not abstracting out the family of hash functions
> +;;; - not depending on a 'findposition'
> +;;; - storing hashes explicitly
> +(defun hashset-%insert (hashset storage key hash)
> + (flet ((triang (n)
> + (triangular-number (the (unsigned-byte 8) n))))
> + (declare (inline triang))
> + (let* ((probe-sequence-pos 0) ; called 'probeposition' in the paper
> + (cells (hss-cells storage))
> + (max-psl (hs-cells-max-psl cells))
> + (psl-vector (hss-psl-vector storage))
> + (hash-vector (hss-hash-vector storage))
> + (mask (hs-cells-mask cells))
> + (original-key key))
> + (loop
> + (incf probe-sequence-pos)
> + (let* ((location (logand (+ hash (triang probe-sequence-pos)) mask))
> + (probed-key (aref cells location))
> + ;; Get the position in its probe sequence of the key (if any)
> + ;; in the slot we're considering taking.
> + (probed-key-psp ; named 'recordposition' in the paper
> + ;; GC-smashed cells might have a nonzero value in the psl-vector
> + ;; which must be disregarded.
> + (if (null probed-key) 0 (aref psl-vector location))))
> + (when (> probe-sequence-pos probed-key-psp) ; KEY should get the slot
> + ;; Get the hash for the key we're stomping on (if any)
> + ;; before storing our hash
> + (let ((probed-key-hash (aref hash-vector location)))
> + (setf (aref hash-vector location) (ldb (byte 16 0) hash)
> + (aref psl-vector location) probe-sequence-pos
> + (aref cells location) key)
> + #+hashset-debug
> + (when (> probe-sequence-pos max-psl)
> + (format *error-output* "hashset-insert(~x): max_psl was ~D is ~D, LF=~F~%"
> + (get-lisp-obj-address cells)
> + max-psl probe-sequence-pos (hashset-cells-load-factor cells)))
> + (setf max-psl (max max-psl probe-sequence-pos))
> + (when (or (null probed-key) (hs-chain-terminator-p probed-key)) (return))
> + (setq probe-sequence-pos probed-key-psp
> + key probed-key
> + hash (if (hss-hv-inexact storage)
> + (funcall (hashset-hash-function hashset) key)
> + probed-key-hash))))))
> + (setf (hs-cells-max-psl cells) max-psl)
> + original-key)))
> +
> +(defun hashset-statistics (storage &aux (cells (hss-cells storage))
> + (psl-vector (hss-psl-vector storage)))
> + (flet ((validp (x) (and (not (hs-chain-terminator-p x)) x)))
> + (declare (inline validp))
> + (do ((i (hs-cells-mask cells) (1- i))
> + (histo (make-array (hs-cells-max-psl cells) :initial-element 0))
> + (sum-psl 0) ; sum of probe sequence lengths
> + (n-keys 0))
> + ((< i 0)
> + (values (if (plusp n-keys) (/ (float sum-psl) n-keys)) ; avg PSL
> + histo
> + (/ (float n-keys) (hs-cells-capacity cells)))) ; load factor
> + (let ((x (aref cells i)))
> + (when (validp x)
> + (let ((psl (aref psl-vector i)))
> + (incf (aref histo (1- psl)))
> + (incf sum-psl psl)
> + (incf n-keys)))))))
> +
> +(defun hashset-rehash (hashset count)
> + ;; (hashset-check-invariants hashset "begin rehash")
> + (flet ((validp (x) (and (not (hs-chain-terminator-p x)) x)))
> + (declare (inline validp))
> + (let* ((old-storage (hashset-storage hashset))
> + (old-cells (hss-cells old-storage))
> + (old-capacity (hs-cells-capacity old-cells))
> + (count (or count ; count is already known if just GC'ed
> + (count-if #'validp old-cells :end old-capacity)))
> + (new-capacity (max 64 (power-of-two-ceiling (* count 2))))
> + (new-storage
> + (allocate-hashset-storage new-capacity (hashset-weakp hashset))))
> +
> + ;; This can be removed
> + (when *hashset-print-statistics*
> + (multiple-value-bind (mean-psl histo) (hashset-statistics old-storage)
> + (let ((*print-length* nil)
> + (*print-pretty* nil))
> + (format *error-output* "~&rehash: size=(~D -> ~D), ~D avg=~f~%"
> + old-capacity new-capacity histo mean-psl))))
> +
> + (do ((i (1- old-capacity) (1- i))
> + (n-inserted 0)
> + (old-hash-vector (hss-hash-vector old-storage)))
> + ((< i 0)
> + (decf (hs-cells-n-avail (hss-cells new-storage)) n-inserted))
> + (declare (index-or-minus-1 i) (fixnum n-inserted))
> + (let ((key (aref old-cells i)))
> + (when (validp key)
> + (incf n-inserted)
> + ;; Test whether 16-bit hashes are good for the _new_ storage, not the old.
> + (hashset-%insert hashset new-storage key
> + (if (hss-hv-inexact new-storage)
> + (funcall (hashset-hash-function hashset) key)
> + (aref old-hash-vector i))))))
> + ;; Assign the new vectors
> + (setf (hashset-storage hashset) new-storage)
> + ;; Zap the old key vector
> + (fill old-cells 0)
> + (assign-vector-flags old-cells 0) ; old vector becomes non-weak, eliminating some GC overhead
> + ;; (hashset-check-invariants hashset "end rehash")
> + new-storage)))
> +
> +(defun hashset-insert (hashset key)
> + (let* ((storage (hashset-storage hashset))
> + (cells (hss-cells storage))
> + (capacity (hs-cells-capacity cells))
> + (min-avail (ash capacity -2)))
> + (cond ((hashset-weakp hashset)
> + (flet ((validp (x) (and (not (hs-chain-terminator-p x)) x)))
> + (declare (inline validp))
> + (let ((current-epoch sb-kernel::*gc-epoch*)
> + (n-live))
> + ;; First decide if the table occupancy needs to be recomputed after GC
> + (unless (eq (hs-cells-gc-epoch cells) current-epoch)
> + (setf n-live (count-if #'validp cells :end capacity)
> + (hs-cells-n-avail cells) (- capacity n-live)
> + (hs-cells-gc-epoch cells) current-epoch))
> + ;; Next decide if rehash should occur (LF exceeds 75%)
> + ;; TODO: also do the rehash if 50% of cells are NULL
> + (when (< (hs-cells-n-avail cells) min-avail)
> + ;; No big deal if GC culled some more after the counting-
> + ;; REHASH will only copy valid items.
> + (setf storage (hashset-rehash hashset n-live)
> + (hs-cells-gc-epoch (hss-cells storage))
> + current-epoch)))))
> + (t
> + ;; Just look at the occupancy, which has to be accurate
> + (let ((n-avail (hs-cells-n-avail cells)))
> + (when (< n-avail min-avail)
> + (setf storage (hashset-rehash hashset (- capacity n-avail)))))))
> + ;; Finally, insert
> + (decf (hs-cells-n-avail cells))
> + (hashset-%insert hashset storage key (funcall (hashset-hash-function hashset) key))))
> +
> +;;; This is the standard open-addressing algorithm using triangular numbers for successive
> +;;; probes, with early termination based on the observed maximum probe sequence length
> +;;; as maintained by the insertion algorithm.
> +(defun hashset-find (hashset key)
> + (declare (optimize (sb-c::insert-array-bounds-checks 0)))
> + (let* ((storage (hashset-storage hashset))
> + (cells (hss-cells storage))
> + (hash (funcall (hashset-hash-function hashset) key))
> + (mask (hs-cells-mask cells))
> + (hash-vector (hss-hash-vector storage))
> + (test (hashset-test-function hashset))
> + (max-psl-1 (1- (hs-cells-max-psl cells)))
> + ;; Filtering on LOWTAG rejects the unused cell marker as well as NIL
> + ;; stuffed in by GC, except for keys which are lists. It is assumed
> + ;; that the comparator can accept NIL if it accepts lists.
> + (lowtag (lowtag-of key))
> + (clipped-hash (ldb (byte 16 0) hash))
> + (index (logand hash mask))
> + (iteration 1))
> + (declare (fixnum iteration))
> + ;; Unroll by always fetching a pair of keys and hashes.
> + ;; Theory suggests that first probing the 2nd choice location should perform better
> + ;; than probing the 1st choice first, because the probability density function
> + ;; for key K mapping to its Nth-choice probe-sequence-position is more highly
> + ;; concentrated at 2 than 1. Despite that I have not observed that to be always true,
> + ;; in the unrolled loop, this tactic is performed by checking K2 before K1.
> + ;; (It's also not better for subsequent iterations, but it's good enough)
> + (loop
> + (let* ((next-index (logand (+ index iteration) mask))
> + (k1 (aref cells index))
> + (k2 (aref cells next-index))
> + (h1 (aref hash-vector index))
> + (h2 (aref hash-vector next-index)))
> + (when (and (= (lowtag-of k2) lowtag) (= h2 clipped-hash) (funcall test k2 key))
> + (hs-aver-probe-sequence-len storage next-index (1+ iteration))
> + (return k2))
> + (when (and (= (lowtag-of k1) lowtag) (= h1 clipped-hash) (funcall test k1 key))
> + (hs-aver-probe-sequence-len storage index iteration)
> + (return k1))
> + (when (or (hs-chain-terminator-p k1)
> + (hs-chain-terminator-p k2)
> + ;; We've tested through ITERATION+1. If that is >= MAX-PSL we're done.
> + ;; That's the same as checking ITERATION >= (1- MAX-PSL)
> + (>= iteration max-psl-1))
> + (return nil))
> + ;; this visits every cell.
> + ;; Proof at
https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
> + (setq index (logand (+ next-index iteration 1) mask))
> + (incf (truly-the fixnum iteration) 2)))))
> +
> +;;; This is basically FIND, storing NIL in the cell if found.
> +;;; Caller is responsible for guarding with the hashset-mutex if applicable.
> +;;; Return T if KEY was present, NIL otherwise.
> +(defun hashset-remove (hashset key &aux (storage (hashset-storage hashset))
> + (cells (hss-cells storage))
> + (test (hashset-test-function hashset)))
> + (declare (optimize (sb-c::insert-array-bounds-checks 0)))
> + (let* ((mask (hs-cells-mask cells))
> + (index (logand (funcall (hashset-hash-function hashset) key) mask))
> + (max-psl (hs-cells-max-psl cells))
> + (iteration 1))
> + (declare (fixnum iteration))
> + (loop
> + (let ((probed-value (aref cells index)))
> + (when (hs-chain-terminator-p probed-value) ; end of probe sequence
> + (return nil))
> + (when (and probed-value (funcall test probed-value key))
> + (hs-aver-probe-sequence-len storage index iteration)
> + (setf (aref cells index) nil) ; It's that simple
> + (return t))
> + (if (>= iteration max-psl) (return nil))
> + (setq index (logand (+ index iteration) mask))
> + (incf iteration)))))
> +
> +;;; Search for KEY in HASHSET and if found return the matching entry.
> +;;; If not found, call COPIER on KEY and insert that.
> +;;; This operation allows the supplied key to be dynamic-extent or possibly
> +;;; not in GC-managed memory.
> +;;; The hashset is single-reader safe without the mutex, but you might or might not
> +;;; get a hit even if KEY is logically present, because a concurrent INSERT is
> +;;; allowed to reorder the physical storage. So we rely on the double-check pattern.
> +(defun hashset-insert-if-absent (hashset key copier)
> + (or (hashset-find hashset key)
> + (if (not (hashset-mutex hashset))
> + (hashset-insert hashset (funcall copier key))
> + (with-system-mutex ((hashset-mutex hashset))
> + (or (hashset-find hashset key)
> + (hashset-insert hashset (funcall copier key)))))))
> +
> +(defun hashset-count (hashset &aux (cells (hss-cells (hashset-storage hashset))))
> + (count-if (lambda (x) (and x (not (hs-chain-terminator-p x))))
> + cells :end (hs-cells-capacity cells)))
> +(defmethod print-object ((self robinhood-hashset) stream)
> + (print-unreadable-object (self stream :type t :identity t)
> + (format stream "~S ~D/~D keys"
> + (hashset-test-function self)
> + (hashset-count self)
> + (hs-cells-capacity (hss-cells (hashset-storage self))))))
> diff --git a/src/code/target-pathname.lisp b/src/code/target-pathname.lisp
> index 2ed4fa26c..1217103e0 100644
> --- a/src/code/target-pathname.lisp
> +++ b/src/code/target-pathname.lisp
> @@ -272,15 +272,10 @@
> ;;; Also, on case-sensitive-case-preserving filesystems it's not possible
> ;;; to know which pathnames are equivalent without asking the filesystem.
> ;;;
> -;;; The storage portion of the table starts out as having 2^0 cells (a power of 2)
> -;;; The last elements in each vector is not logically a storage cell.
> -(define-load-time-global *pn-dir-cache* #(nil 1))
> -(define-load-time-global *pn-cache* #(nil 1))
> -(declaim (simple-vector *pn-dir-cache* *pn-cache*))
> -(define-load-time-global *pn-cache-force-recount* nil)
> -(define-load-time-global *pn-cache-lock* (sb-thread:make-mutex :name "pathnames"))
> -
> -(defmacro pn-cache-n-available-cells (v) `(aref ,v (1- (length ,v))))
> +;;: TODO: consider similarly interning the DEVICE and TYPE parts
> +(define-load-time-global *pn-dir-table* nil)
> +(define-load-time-global *pn-table* nil)
> +(declaim (type robinhood-hashset *pn-dir-table* *pn-table*))
>
> (defmacro compare-pathname-host/dev/dir/name/type (a b)
> `(and (eq (%pathname-host ,a) (%pathname-host ,b)) ; Interned
> @@ -291,143 +286,19 @@
> (compare-component (%pathname-name ,a) (%pathname-name ,b))
> (compare-component (%pathname-type ,a) (%pathname-type ,b))))
>
> -;;; pathname cache TODOs:
> -;;; (1) Give up on the dream of a lockless algorithm. Weakness is hard enough as it is.
> -;;; (2) Probe without acquiring the mutex. Only acquire the mutex if we see that
> -;;; insertion needs to happen. Then re-probe with the mutex, and if probing
> -;;; fails again, then first consider whether to rehash, and then insert.
> -;;; (the smashing loop on the "old" data should still be OK)
> -;;; (3) Change to Hopscotch hashing to bound the max probes
> -;;; (4) See if the algorithm can be made into a general-purpose hashset.
> -;;; We probably want to enhance the GC to count the number of smashed cells,
> -;;; otherwise we lack a decent strategy for deciding to resize very large tables.
> -;;; Counting while pseudo-atomic doesn't seem the best approach.
> +(defun pn-table-dir= (entry key)
> + (or (eq (car entry) (car key)) ; quick win if lists are EQ
> + (and (eq (cdr entry) (cdr key)) ; hashes match
> + (compare-component (car entry) (car key)))))
> +(defun pn-table-pn= (entry key)
> + (and (compare-pathname-host/dev/dir/name/type entry key)
> + (eql (%pathname-version entry) (%pathname-version key))))
>
> -;;; Current resizing strategy:
> -;;; - bound the load factor but not the number of probes. But do stop at the absolute
> -;;; limit of hitting every possible table cell.
> -;;; - if available cells are few, then rehash. Strive for 75% load factor.
> -;;; - after each GC, recompute the number of available cells.
> -;;; Delay rehash until actually inserting. Just get the available cells right.
> -;;; Available cells include those which were never claimed, as well as
> -;;; those nullfied by GC. The count can always be a little wrong because we're
> -;;; not mutually exclusive with GC while actually counting.
> -;;;
> -(declaim (ftype (function * *) intern-pathname rebuild-pathname-cache)) ; non-toplevel
> -(labels
> - ((update (table-holder thing hash-fn comparator-fn persist-fn)
> - (let* ((table (symbol-value table-holder))
> - (result (probe table thing hash-fn comparator-fn persist-fn)))
> - (or result
> - (probe (rehash table-holder table hash-fn)
> - thing hash-fn comparator-fn persist-fn))))
> - (probe (table thing hash-fn comparator-fn persist-fn)
> - (declare (function hash-fn comparator-fn persist-fn)
> - (simple-vector table))
> - (let* ((hash (funcall hash-fn thing))
> - ;; key storage capacity is 1 less than vector's physical length.
> - ;; The last cell holds the number of available cells.
> - (table-capacity (1- (length table)))
> - (mask (1- table-capacity))
> - (index (logand hash mask))
> - (tombstone)
> - (interval 1))
> - ;; Due to vector weakness, we're always racing with GC but it's OK.
> - (loop
> - (let ((probed-value (aref table index)))
> - ;; Open-addressing can decide that a key is absent only after hitting
> - ;; _all_ cells in its probing sequence, terminated by unbound-marker.
> - ;; Because of that we should try to ensure that there is always at least
> - ;; one unbound-marker in the table at all times. Or limit the max probes
> - ;; to the table capacity, which is done (below).
> - (when (unbound-marker-p probed-value) ; end of probe sequence
> - ;; First decide if we should rehash. We rehash when fewer than
> - ;; 25% of the table is available for use.
> - (when *pn-cache-force-recount*
> - (flet ((recount (table)
> - (declare (simple-vector table))
> - (let* ((capacity (1- (length table)))
> - (n (count-if #'validp table :end capacity)))
> - (setf (pn-cache-n-available-cells table) (- capacity n)))))
> - (recount *pn-dir-cache*)
> - (recount *pn-cache*))
> - (setq *pn-cache-force-recount* nil))
> - (let ((n-available-cells (pn-cache-n-available-cells table))
> - (min-threshold (ash table-capacity -2))) ; = capacity / 4
> - (return (cond ((> n-available-cells min-threshold)
> - ;; doesn't matter which kind of available cell we take
> - (decf (pn-cache-n-available-cells table))
> - (setf (aref table (or tombstone index))
> - (funcall persist-fn thing)))))))
> - (when (and probed-value (funcall comparator-fn probed-value thing))
> - (when tombstone
> - ;; Put this item into the tombstone's location and make this a tombstone
> - ;; and *not* unbound-marker. Any other probing sequence for a different key
> - ;; might have previously tried to claim this cell and could not, so
> - ;; went to a later cell in its sequence. Therefore we can't interrupt that
> - ;; other (unknown) sequence of probes with a chain-terminating marker.
> - (setf (aref table tombstone) probed-value
> - (aref table index) nil))
> - (return probed-value))
> - ;; Prevent infinite loop if the next index to probe is the same modulo N.
> - ;; I'd like to remove this extra test, and instead enforce that we don't
> - ;; clobber the last remaining unbound-marker.
> - ;; Or implement a better probing strategy as suggested in the TODOs.
> - (if (= interval table-capacity) (return nil))
> - (when (and (null probed-value) (not tombstone)) (setq tombstone index))
> - ;; this visits every cell.
> - ;; Proof at
https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
> - (setq index (logand (+ index interval) mask))
> - (incf interval)))))
> - ;; VALIDP is the same as CONSP on the vector of directories, or PATHNAMEP
> - ;; (or even just %INSTANCEP) on the vector of pathnames.
> - ;; This one predicate can work for either vector.
> - (validp (x) (and (not (unbound-marker-p x)) x))
> - (rehash (table-holder table hash-fn)
> - (declare (simple-vector table) (function hash-fn))
> - (declare (sb-c::tlab :system))
> - ;; up-size, aiming for 50% load
> - ;; No big deal if GC decides to cull more after the counting step.
> - (let* ((n-live (count-if #'validp table))
> - (logical-size (max 64 (power-of-two-ceiling (* n-live 2))))
> - (physical-size (1+ logical-size))
> - (new (let ((vector-type
> - (logior (ash sb-vm:vector-weak-flag sb-vm:array-flags-position)
> - sb-vm:simple-vector-widetag)))
> - (fill (allocate-vector #+ubsan nil vector-type physical-size physical-size)
> - (make-unbound-marker)))))
> - (setf (pn-cache-n-available-cells new) logical-size)
> - (do ((i (- (length table) 2) (1- i)))
> - ((< i 0))
> - (declare (index-or-minus-1 i))
> - (let ((x (aref table i)))
> - (when (validp x)
> - (probe new x hash-fn #'constantly-nil #'identity))))
> - (set table-holder new)
> - (assign-vector-flags table 0) ; old vector is non-weak, eliminating some GC overhead
> - (fill table 0)
> - new))
> - (dir-matchp (entry key)
> - (or (eq (car entry) (car key)) ; quick win if lists are EQ
> - (and (eq (cdr entry) (cdr key)) ; hashes match
> - (compare-component (car entry) (car key)))))
> - (pn-matchp (entry key)
> - (and (compare-pathname-host/dev/dir/name/type entry key)
> - (eql (%pathname-version entry) (%pathname-version key))))
> - (ensure-heap-string (part) ; return any non-string as-is
> - (declare (sb-c::tlab :system))
> - ;; FIXME: what about pattern pieces?
> - (cond ((or (not (stringp part)) (read-only-space-obj-p part)) part)
> - ((dynamic-space-obj-p part) (logically-readonlyize part))
> - ;; dynamic-extent strings and lisp strings in alien memory are acceptable.
> - ;; Copies must be made in that case, since we're holding on to them.
> - (t (let ((l (length part)))
> - (logically-readonlyize
> - (replace (typecase part
> - (base-string (make-string l :element-type 'base-char))
> - (t (make-string l)))
> - part)))))))
> - (declare (inline validp))
> +(defun !pathname-cold-init ()
> + (setq *pn-dir-table* (make-hashset 3 #'pn-table-dir= #'cdr
> + :synchronized t :weakness t)
> + *pn-table* (make-hashset 3 #'pn-table-pn= #'pathname-sxhash
> + :synchronized t :weakness t)))
>
> ;;; A pathname is logical if the host component is a logical host.
> ;;; This constructor is used to make an instance of the correct type
> @@ -438,6 +309,7 @@
> ;; but the arguments given in the X3J13 cleanup issue
> ;; PATHNAME-LOGICAL:ADD seem compelling: we should canonicalize the
> ;; case, and uppercase is the ordinary way to do that.
> + (declare (sb-c::tlab :system))
> (flet ((upcase-maybe (x) (typecase x (string (logical-word-or-lose x)) (t x))))
> (when (typep host 'logical-host)
> (setq device :unspecific
> @@ -445,50 +317,61 @@
> name (upcase-maybe name)
> type (upcase-maybe type))))
> (dx-let ((dir-key (cons directory (pathname-sxhash directory))))
> - (declare (sb-c::tlab :system))
> (declare (inline !allocate-pathname)) ; for DX-allocation
> - (with-system-mutex (*pn-cache-lock*)
> + (flet ((ensure-heap-string (part) ; return any non-string as-is
> + ;; FIXME: what about pattern pieces and (:HOME "user") ?
> + (cond ((or (not (stringp part)) (read-only-space-obj-p part)) part)
> + ((dynamic-space-obj-p part) (logically-readonlyize part))
> + ;; dynamic-extent strings and lisp strings in alien memory are acceptable.
> + ;; Copies must be made in that case, since we're holding on to them.
> + (t (let ((l (length part)))
> + (logically-readonlyize
> + (replace (typecase part
> + (base-string (make-string l :element-type 'base-char))
> + (t (make-string l)))
> + part)))))))
> (let* ((dir+hash
> (if directory ; find the interned dir-key
> - (update '*pn-dir-cache* dir-key #'cdr #'dir-matchp
> - (lambda (dir)
> - (cons (mapcar #'ensure-heap-string (car dir)) (cdr dir))))))
> + (hashset-insert-if-absent
> + *pn-dir-table* dir-key
> + (lambda (dir)
> + (cons (mapcar #'ensure-heap-string (car dir)) (cdr dir))))))
> (pn-key (!allocate-pathname host device dir+hash name type version)))
> (declare (truly-dynamic-extent pn-key))
> - (the pathname ; assert type-correct answer
> - (update '*pn-cache* pn-key #'pathname-sxhash #'pn-matchp
> - ;; don't capture the original args, so no extra closure consing
> - (lambda (tmp &aux (host (%pathname-host tmp)))
> - (let ((new (!allocate-pathname
> - host (%pathname-device tmp)
> - (%pathname-dir+hash tmp)
> - (ensure-heap-string (%pathname-name tmp))
> - (ensure-heap-string (%pathname-type tmp))
> - (%pathname-version tmp))))
> - (when (typep host 'logical-host)
> - (setf (%instance-wrapper new) #.(find-layout 'logical-pathname)))
> - new))))))))
> + (hashset-insert-if-absent
> + *pn-table* pn-key
> + (lambda (tmp &aux (host (%pathname-host tmp)))
> + (let ((new (!allocate-pathname
> + host (%pathname-device tmp)
> + (%pathname-dir+hash tmp)
> + (ensure-heap-string (%pathname-name tmp))
> + (ensure-heap-string (%pathname-type tmp))
> + (%pathname-version tmp))))
> + (when (typep host 'logical-host)
> + (setf (%instance-wrapper new) #.(find-layout 'logical-pathname)))
> + new)))))))
>
> ;;; Weak vectors don't work at all once rendered pseudo-static.
> ;;; so in order to weaken the pathname cache, the vectors are copied on restart.
> ;;; It may not achieve anything for saved pathnames, since the vector elements
> ;;; are themselves pseudo-static, but at least newly made ones aren't immortal.
> (defun rebuild-pathname-cache ()
> - (rehash '*pn-dir-cache* *pn-dir-cache* #'cdr)
> - (rehash '*pn-cache* *pn-cache* #'pathname-sxhash))
> -) ; end LABELS
> + (hashset-rehash *pn-dir-table* nil)
> + (hashset-rehash *pn-table* nil))
>
> (defun show-pn-cache (&aux (*print-pretty* nil) (*package* (find-package "CL-USER")))
> (without-gcing
> - (dolist (symbol '(*pn-dir-cache* *pn-cache*))
> - (let* ((v (symbol-value symbol))
> - (n (1- (length v))))
> + (dolist (symbol '(*pn-dir-table* *pn-table*))
> + (let* ((hashset (symbol-value symbol))
> + (dirs (hss-cells (hashset-storage *pn-dir-table*)))
> + (v (hss-cells (hashset-storage hashset)))
> + (n (hs-cells-capacity v)))
> (format t "~&~S: size=~D tombstones=~D unused=~D~%" symbol n
> (count nil v :end n) (count-if #'unbound-marker-p v :end n))
> (dotimes (i n)
> (let ((entry (aref v i)))
> (unless (or (unbound-marker-p entry) (null entry))
> - (if (eq symbol '*pn-dir-cache*)
> + (if (eq symbol '*pn-dir-table*)
> (format t "~4d ~3d ~x ~16x ~s~%" i
> (generation-of entry) (get-lisp-obj-address entry)
> (cdr entry) (car entry))
> @@ -502,7 +385,7 @@
> (t host)))
> (%pathname-device entry)
> (acond ((%pathname-dir+hash entry)
> - (format nil "@~D" (position it *pn-dir-cache*)))
> + (format nil "@~D" (position it dirs)))
> (t "-"))
> (%pathname-name entry)
> (%pathname-type entry)
> @@ -697,6 +580,8 @@
> (symbol (sxhash piece)) ; transformed
> (pattern (pattern-hash piece))
> ((cons (eql :home) (cons string null))
> + ;; :HOME has two representations- one is just '(:absolute :home ...)
> + ;; and the other '(:absolute (:home "user") ...)
> (sxhash (second piece))))))
> (etypecase x
> (pathname
> @@ -2111,7 +1996,7 @@ experimental and subject to change."
> :version :newest))
> (read lpt))))))
>
> -(defun !pathname-cold-init ()
> +(defun !lpn-cold-init ()
> (let* ((sys *default-pathname-defaults*)
> (src
> (merge-pathnames
> diff --git a/src/cold/build-order.lisp-expr b/src/cold/build-order.lisp-expr
> index 7740caf3a..be42095c4 100644
> --- a/src/cold/build-order.lisp-expr
> +++ b/src/cold/build-order.lisp-expr
> @@ -547,6 +547,7 @@
>
> ("src/code/target-defstruct" :not-host)
> ("src/code/target-stream" :not-host) ; needs WHITESPACEP from "code/reader"
> + ("src/code/hashset" :not-host)
> ("src/code/target-pathname" :not-host)
> #-win32 ("src/code/unix-pathname" :not-host)
> #+win32 ("src/code/win32-pathname" :not-host)
> diff --git a/src/pcl/defs.lisp b/src/pcl/defs.lisp
> index ba1f25bf2..c40d146ca 100644
> --- a/src/pcl/defs.lisp
> +++ b/src/pcl/defs.lisp
> @@ -364,6 +364,8 @@
> (defclass method-combination (metaobject)
> ((%documentation :initform nil :initarg :documentation)))
>
> +;;; TODO: we can express MAKE-GF-HASHSET in terms of the robinhood hashset
> +;;; using the same stable FSC-INSTANCE-HASH as this.
> (defun make-gf-hashset ()
> ;; Return what is logically a weak hashset, but physically a weak hash-table
> ;; because we don't implement hashsets.
> diff --git a/tests/hashset.pure.lisp b/tests/hashset.pure.lisp
> new file mode 100644
> index 000000000..2a581519e
> --- /dev/null
> +++ b/tests/hashset.pure.lisp
> @@ -0,0 +1,146 @@
> +
> +(import '(sb-impl::hashset-storage
> + sb-impl::hashset-hash-function
> + sb-impl::hashset-test-function
> + sb-impl::hss-cells
> + sb-impl::hss-hash-vector
> + sb-impl::hss-psl-vector
> + sb-impl::hashset-find))
> +
> +(defun hs-cells-mask (v) (- (length v) 4))
> +(defun hs-chain-terminator-p (x) (eq x 0))
> +
> +(defun hashset-probing-sequence (hashset key)
> + (let* ((storage (hashset-storage hashset))
> + (cells (hss-cells storage))
> + (mask (hs-cells-mask cells))
> + (index (logand (funcall (hashset-hash-function hashset) key) mask))
> + (interval 1)
> + (sequence))
> + (loop
> + (push index sequence)
> + (let ((probed-key (aref cells index)))
> + (assert (not (hs-chain-terminator-p probed-key)))
> + (when (and probed-key (funcall (hashset-test-function hashset) probed-key key))
> + (return (nreverse sequence)))
> + (setq index (logand (+ index interval) mask))
> + (incf interval)))))
> +
> +(defun hashset-check-invariants (hashset)
> + (let* ((storage (hashset-storage hashset))
> + (cells (hss-cells storage))
> + (hashes (hss-hash-vector storage))
> + (psl (hss-psl-vector storage)))
> + (assert (= (length hashes) (length psl)))
> + (when (> (length hashes) 65536)
> + (assert (sb-impl::hss-hv-inexact (hashset-storage hashset))))
> + (dotimes (i (length hashes))
> + (let ((key (aref cells i)))
> + (when (and key (not (hs-chain-terminator-p key)))
> + ;; For each key, the stored hash should be correct
> + (assert (= (ldb (byte 16 0) (funcall (hashset-hash-function hashset) key))
> + (aref hashes i)))
> + (let ((sequence (hashset-probing-sequence hashset key)))
> + ;; And the stored probing-sequence-length should match the actual
> + (unless (= (aref psl i) (length sequence))
> + (error "Wrong at ~S: sequence=~S stored=~S"
> + key sequence (aref psl i)))))))))
> +
> +(defun make-string-hashset (case-sensitive-p)
> + (if case-sensitive-p
> + (sb-impl::make-hashset 4 #'string= #'sb-kernel:%sxhash-simple-string)
> + (sb-impl::make-hashset 4 #'string-equal #'sb-impl::psxhash)))
> +
> +;; HASHSET DOES NOT ALLOW INSERTING AN EXISTING KEY.
> +;; IT WILL VIOLATE INVARIANTS, BUT IT DOES NOT CHECK FOR IT.
> +(defun insert-all-into-hashset (hashset strings existsp-check)
> + (let ((n 0))
> + (dolist (string strings)
> + (when (or (not existsp-check)
> + (not (hashset-find hashset string)))
> + (sb-impl::hashset-insert hashset string)
> + (when (zerop (mod (incf n) 1000))
> + (hashset-check-invariants hashset)
> + (multiple-value-bind (mean-psl histogram load-factor)
> + (sb-impl::hashset-statistics (hashset-storage hashset))
> + ;; (format t "~,4f ~7,4f ~s~%" load-factor mean-psl histogram)
> + (assert (<= load-factor .75))
> + (assert (< mean-psl 3))
> + ;; this is a bit of a "change detector" but I hope it remains correct for a while
> + (assert (< (length histogram) 10))))))
> + (hashset-check-invariants hashset)
> + hashset))
> +
> +(defparameter *lottastrings*
> + (let ((h (make-hash-table :test #'equal)))
> + (dolist (p (list-all-packages))
> + (flet ((add-symbols (table)
> + (sb-int:dovector (symbol (sb-impl::package-hashtable-cells table))
> + (when (symbolp symbol)
> + (setf (gethash (string symbol) h) t)
> + (setf (gethash (string-downcase symbol) h) t)
> + (setf (gethash (reverse (string symbol)) h) t)))))
> + (add-symbols (sb-impl::package-internal-symbols p))
> + (add-symbols (sb-impl::package-external-symbols p))))
> + (loop for k being each hash-key of h collect k)))
> +
> +(defun insert-all-into-hash-table (strings weakness &optional (test 'equal))
> + (let ((hash-table (make-hash-table :test test :weakness weakness)))
> + (dolist (string strings hash-table)
> + (setf (gethash string hash-table) t))))
> +
> +(defparameter *ht0* (insert-all-into-hash-table *lottastrings* nil))
> +;(defparameter *ht1* (insert-all-into-hash-table *lottastrings* :key))
> +;(defparameter *ht2* (insert-all-into-hash-table *lottastrings* :value))
> +
> +(defparameter *hs* (insert-all-into-hashset (make-string-hashset t) *lottastrings* nil))
> +
> +(defun read-all-from-hash-table (strings hash-table ntimes &aux (result 0))
> + (declare (fixnum ntimes result))
> + (dotimes (i ntimes result)
> + (dolist (string strings)
> + (when (gethash string hash-table) (incf result)))))
> +(defun read-all-from-hashset (strings hashset ntimes &aux (result 0))
> + (declare (fixnum ntimes result))
> + (dotimes (i ntimes result)
> + (dolist (string strings)
> + (when (hashset-find hashset string) (incf result)))))
> +
> +(with-test (:name :string-hashset)
> + (assert (= (read-all-from-hash-table *lottastrings* *ht0* 1)
> + (read-all-from-hashset *lottastrings* *hs* 1))))
> +
> +(with-test (:name :case-insensitive-string-hashset)
> + (let ((ht (insert-all-into-hash-table *lottastrings* nil 'equalp))
> + (hs (insert-all-into-hashset (make-string-hashset nil)
> + *lottastrings* t))) ; check existence before inserting
> + (assert (= (read-all-from-hash-table *lottastrings* ht 1)
> + (read-all-from-hashset *lottastrings* hs 1)))))
> +
> +;(format t "~&Timing weak hash-table, KEY weak:~%")
> +;(time (read-all-from-hash-table *lottastrings* *ht1* 20))
> +;(format t "~&Timing weak hash-table, VALUE weak:~%")
> +;(time (read-all-from-hash-table *lottastrings* *ht2* 20))
> +;(format t "~&Timing ordinary hash-table:~%")
> +;(time (read-all-from-hash-table *lottastrings* *ht3* 20))
> +;
> +;(format t "~&Timing hash-set:~%")
> +;(time (read-all-from-hashset *lottastrings* *hs* 20))
> +
> +
> +#|
> +(defun hs-check-loop-unroll (&optional (start-index 0) (mask #xff) (count 10))
> + (format t "~®ular way:~%")
> + (let ((index start-index) (iteration 1))
> + (dotimes (i count (terpri))
> + (format t " ~D" index)
> + (setq index (logand (+ index iteration) mask))
> + (incf iteration)))
> + (format t "~&Unrolled 2x:~%")
> + (let ((index start-index) (iteration 1))
> + (dotimes (i (ceiling count 2) (terpri))
> + (let ((next-index (logand (+ index iteration) mask)))
> + (format t " ~D ~D" index next-index)
> + (setq index (logand (+ next-index iteration 1) mask))
> + (incf iteration 2)))))
> +|#
> diff --git a/tests/pathnames.impure.lisp b/tests/pathnames.impure.lisp
> index a666d07dd..ce47dd25b 100644
> --- a/tests/pathnames.impure.lisp
> +++ b/tests/pathnames.impure.lisp
> @@ -17,11 +17,6 @@
>
> ;;;; Pathname accessors
>
> -(defun nuke-pathname-cache ()
> - ;; This is dangerous because it potentially breaks our invariant
> - ;; that EQUAL pathnames are EQ. Of course, that's not actually enforced yet.
> - (fill sb-impl::*pn-cache* nil))
> -
> (with-test (:name (pathname :accessors :stream-not-associated-to-file type-error))
> (let ((*stream* (make-string-output-stream)))
> (declare (special *stream*))
> @@ -930,7 +925,9 @@
> (((make-pathname) (make-pathname :version :newest)) t))
> (let ((s "#p\"sys:contrib/f[1-9].txt\""))
> (let ((a (read-from-string s)))
> - (nuke-pathname-cache) ; result shouldn't depend on the pathnames being EQ
> + ;; FIXME: maybe this test is bogus now?
> + ;; ("result shouldn't depend on the pathnames being EQ")
> + (sb-impl:hashset-remove sb-impl::*pn-table* a)
> (let ((b (read-from-string s)))
> (assert (not (eq a b)))
> (equalp a b)))))
> @@ -945,7 +942,7 @@
> (with-test (:name :wild-pathnames-string-based-hash)
> (let ((string "#p\"file[0-9].txt\""))
> (let ((a (read-from-string string)))
> - (nuke-pathname-cache)
> + (sb-impl:hashset-remove sb-impl::*pn-table* a)
> (let ((b (read-from-string string)))
> (assert (not (eq a b)))
> (assert (eql (sxhash a) (sxhash b)))))))
> @@ -976,88 +973,3 @@
>
> (with-test (:name :dx-pathname-parts-dont-crash)
> (prin1 (pathname-peristence-test)))
> -
> -(import '(sb-impl::%pathname-host
> - sb-impl::%pathname-dir+hash
> - sb-impl::%pathname-device
> - sb-impl::%pathname-name
> - sb-impl::%pathname-type
> - sb-impl::%pathname-version
> - sb-impl::compare-component))
> -
> -;;; This is the comparator used for the cache, which is _more_ _stringent_ than
> -;;; PATHNAME=. Obviously the regression test has to use the right comparator.
> -;;; e.g. version :NEWEST and version NIL are the same according to PATHNAME=
> -;;; but not the cache. Honestly though, should the cache distinguish entries that
> -;;; are by definition supposed to behave indistinguishably?
> -(defun pn-cache-line= (a b)
> - (and (eq (%pathname-host a) (%pathname-host b)) ; Interned
> - (eq (%pathname-dir+hash a) (%pathname-dir+hash b))
> - (compare-component (%pathname-device a) (%pathname-device b))
> - (compare-component (%pathname-name a) (%pathname-name b))
> - (compare-component (%pathname-type a) (%pathname-type b))
> - (eql (%pathname-version a) (%pathname-version b))))
> -
> -(defun probing-sequence (table pathname)
> - (let* ((mask (- (length table) 2))
> - (index (logand (sb-impl::pathname-sxhash pathname) mask))
> - (interval 1)
> - (sequence))
> - (loop
> - (push index sequence)
> - (let ((probed-value (aref table index)))
> - (when (sb-int:unbound-marker-p probed-value) (error "Can't hapepn"))
> - (when (and probed-value (pn-cache-line= probed-value pathname))
> - (return (nreverse sequence)))
> - (setq index (logand (+ index interval) mask))
> - (incf interval)))))
> -
> -;;; a GC here helps this test pass reliably in two ways:
> -;;; 1. by ensuring that the PN cache is reasonably devoid of entries
> -;;; so that it doesn't size up, and thereby reduce the chance of producing
> -;;; collisions (beacuse this tests *needs* collisions)
> -;;; 2. avoiding GCing just after inserting the 20 testfile pathnames
> -;;; which would blow them away, leaving nothing at all to test
> -(gc)
> -;; recall that MAKE-PATHNAME is unsafely-flushable
> -(dotimes (i 20) (opaque-identity (make-pathname :name (write-to-string i) :type "testfile")))
> -;; Only fake GC-nullifying an entry that was added specifically for the test
> -(defun can-safely-remove-entry (table index)
> - (equal (pathname-type (aref table index)) "testfile"))
> -(defun find-worst-pn-cache-entry (&optional print &aux (worst '(0)))
> - (let ((cache sb-impl::*pn-cache*))
> - (dotimes (i (length cache) (cdr worst))
> - (let ((entry (aref cache i)))
> - (when (pathnamep entry)
> - (let ((sequence (probing-sequence cache entry)))
> - (when print (format t "~s -> ~s~%" entry sequence))
> - (when (and (equal (pathname-type entry) "testfile")
> - (> (length sequence) (car worst))
> - (some (lambda (x) (can-safely-remove-entry cache x))
> - (butlast sequence)))
> - (setq worst (cons (length sequence) sequence)))))))))
> -
> -(with-test (:name :pn-cache-hit-sequence-shorten)
> - (let ((sequence (find-worst-pn-cache-entry)))
> - (unless sequence
> - (error "Expect at least 1 pn-cache collision. Can't test"))
> - (let* ((cache sb-impl::*pn-cache*)
> - (my-index (car (last sequence)))
> - (pathname (aref cache my-index))
> - (index-to-remove
> - (find-if (lambda (x) (can-safely-remove-entry cache x))
> - sequence))
> - (position-of-index (position index-to-remove sequence)))
> - #+nil
> - (format t "~&Probe for ~S -> ~S, will cull index ~D (~:R position)~%"
> - pathname sequence index-to-remove (1+ position-of-index))
> - (setf (aref cache index-to-remove) nil) ; pretend GC did this
> - ;; seek PATHNAME in the cache again
> - (opaque-identity (make-pathname :name (pathname-name pathname) :type "testfile"))
> - (let ((new-sequence (probing-sequence cache pathname)))
> - ;; I don't know a good black-box test on the probing algorithm, so just
> - ;; assert that the old location of PATHNAME now holds a tombstone.
> - (assert (null (aref cache my-index)))
> - ;; should have been found in fewer probes
> - (assert (equal (subseq sequence 0 (1+ position-of-index))
> - new-sequence))))))
> diff --git a/tests/run-tests.lisp b/tests/run-tests.lisp
> index 4bcdec12e..6cb50e8b2 100644
> --- a/tests/run-tests.lisp
> +++ b/tests/run-tests.lisp
> @@ -203,9 +203,8 @@
> sb-impl::*token-buf-pool*
> sb-impl::*user-hash-table-tests*
> sb-impl::**finalizer-store**
> - sb-impl::*pn-dir-cache*
> - sb-impl::*pn-cache*
> - sb-impl::*pn-cache-force-recount*
> + sb-impl::*pn-dir-table*
> + sb-impl::*pn-table*
> sb-vm::*immobile-codeblob-tree*
> sb-vm::*dynspace-codeblob-tree*
> ,(maybe "SB-KERNEL" "*EVAL-CALLS*")
>
> -----------------------------------------------------------------------
>
>
> hooks/post-receive
> --
> SBCL
>
>
> _______________________________________________
> Sbcl-commits mailing list
>
Sbcl-c...@lists.sourceforge.net
>
https://lists.sourceforge.net/lists/listinfo/sbcl-commits
_______________________________________________
Sbcl-devel mailing list
Sbcl-...@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/sbcl-devel