changeset 44: |
99d4ab4f8d53 |
parent: |
81b7333f27f8
|
author: |
Richard Westhaver <ellis@rwest.io> |
date: |
Sun, 11 Aug 2024 01:50:18 -0400 |
permissions: |
-rw-r--r-- |
description: |
update |
1 ;;; examples/mbdump.lisp --- Prepare a sampling of mbdump JSON data 7 ;; - considering sampling 'releases.json' only. could be a really good 8 ;; benchmark. For now we will sample all files. Soon, we may split 9 ;; releases.json into separate files here which is rather trivial 12 ;; - using uiop:read-file-line is NOT the right thing to do. This is 13 ;; too bad because I implemented a specialized stream class and then 14 ;; deleted it before committing. 16 ;; - there are two possible solutions I can think of: 18 ;; - single-pass :: for each file, read the first line and calculate 19 ;; the minimal space needed to store a json object in a single 20 ;; line. Instead of incrementing over every character to find the 21 ;; next line, we move the position once by the minimum space, then 22 ;; iterate over characters until we find a newline. We walk the 23 ;; entire file and pick up the random indexes. 25 ;; - double-pass :: for each file, read each line character by 26 ;; character, counting new lines. At each random index calculate 27 ;; and collect the file position. Do a second pass which sets the 28 ;; file position on each iteration before reading a line. 31 #-prelude (ql:quickload :prelude) 33 (:use :cl :std :log :sb-thread :sb-concurrency :dat/json :cli/clap :obj/time :sb-gray) 34 (:export :main :*target*)) 38 ;; Ultimately we dump the samples to this directory. It should be 39 ;; roughly 1/10th the original size. 40 #| (in-readtable :shell) 41 du -sh data/mbdump # 242G 42 du -sh /tmp/mbdump # 24G 44 (defvar *mbdump-directory* (pathname "/mnt/y/data/packy/data/mbdump-full/")) 46 (defun init-mbdump-files (&optional (dir *mbdump-directory*)) 47 "Count the total number of lines in each file under DIR. Return a 48 hash-table containing filenames->line counts. 50 This is single-threaded so it does take some time on the full mbdump 51 dataset. If you run this make sure to assign the resulting value to 52 *MBDUMP-FILES*, otherwise use the pre-compiled value." 53 (let ((files (find-files dir)) 54 (table (make-hash-table :test 'equal))) 56 (setf (gethash (file-namestring f) table) (count-file-lines f))) 60 (defvar *mbdump-files* (let ((pairs '(("area.json" . 119164) 61 ("artist.json" . 2345810) 62 ("event.json" . 78896) 63 ("instrument.json" . 1046) 64 ("label.json" . 271609) 65 ("place.json" . 63772) 66 ("recording.json" . 119575) 67 ("release-group.json" . 3204634) 68 ("release.json" . 4111554) 69 ("series.json" . 23376) 70 ("work.json" . 2078152))) 71 (table (make-hash-table :test 'equal))) 72 (dolist (pair pairs table) 73 (setf (gethash (car pair) table) (cdr pair))))) 75 (defvar *target-directory* (pathname (concatenate 'string "/tmp/mbdump-" (file-date) "/"))) 79 (defun random-line-indexes (max &optional (count 1000)) 80 (declare (fixnum max count)) 82 (labels ((%gen () (let ((int (random max))) 83 (when (zerop int) (setf int 1)) 88 (dotimes (i count ret) 89 (setf ret (cons (%gen) ret))) 92 (defun prep-json-file (file) 93 (let* ((in-path (merge-pathnames file *mbdump-directory*)) 94 (out-path (merge-pathnames file *target-directory*)) 95 (max (gethash (namestring file) *mbdump-files*)) 96 (count (floor max 10)) 97 (lines (random-line-indexes (gethash (namestring file) *mbdump-files*))) 98 (res (cons out-path count))) 99 (with-open-files ((out out-path :direction :output :external-format '(:utf-8 :replacement "?")) 100 (in in-path :direction :input :external-format '(:utf-8 :replacement "?"))) 102 with line = (uiop:read-file-line in :at i) 103 do (print (file-position in)) 104 do (write-line line out))) 105 (push res *target*))) 108 (ensure-directories-exist *target-directory*) 110 (dolist (file (hash-table-keys *mbdump-files*) workers) 111 (push (make-thread (lambda () (prep-json-file file)) :name (format nil "~A prep" file)) workers)) 112 (time (wait-for-threads workers)))) 114 ;; (prep-json-file "label.json")