changelog shortlog graph tags branches changeset files file revisions raw help

Mercurial > demo / annotate examples/mbdump.lisp

changeset 44: 99d4ab4f8d53
parent: 81b7333f27f8
author: Richard Westhaver <ellis@rwest.io>
date: Sun, 11 Aug 2024 01:50:18 -0400
permissions: -rw-r--r--
description: update
41
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
1
 ;;; examples/mbdump.lisp --- Prepare a sampling of mbdump JSON data
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
2
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
3
 ;; WIP
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
4
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
5
 ;;; Commentary:
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
6
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
7
 ;; - considering sampling 'release.json' only. could be a really good
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
8
 ;;   benchmark. For now we will sample all files. Soon, we may split
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
9
 ;;   release.json into separate files here which is rather trivial
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
10
 ;;   anyway.
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
11
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
12
 ;; - using uiop:read-file-line is NOT the right thing to do. This is
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
13
 ;;   too bad because I implemented a specialized stream class and then
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
14
 ;;   deleted it before committing.
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
15
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
16
 ;; - there are two possible solutions I can think of:
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
17
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
18
 ;;   - single-pass :: for each file, read the first line and calculate
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
19
 ;;     the minimal space needed to store a json object in a single
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
20
 ;;     line. Instead of incrementing over every character to find the
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
21
 ;;     next line, we move the position once by the minimum space, then
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
22
 ;;     iterate over characters until we find a newline. We walk the
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
23
 ;;     entire file and pick up the random indexes.
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
24
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
25
 ;;   - double-pass :: for each file, read each line character by
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
26
 ;;     character, counting new lines. At each random index calculate
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
27
 ;;     and collect the file position. Do a second pass which sets the
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
28
 ;;     file position on each iteration before reading a line.
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
29
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
30
 ;;; Code:
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
31
 #-prelude (ql:quickload :prelude)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
32
 (defpackage :mbdump
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
33
   (:use :cl :std :log :sb-thread :sb-concurrency :dat/json :cli/clap :obj/time :sb-gray)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
34
   (:export :main :*target*))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
35
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
36
 (in-package :mbdump)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
37
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
38
 ;; Ultimately we dump the samples to this directory. It should be
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
39
 ;; roughly 1/10th the original size.
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
40
 #| (in-readtable :shell)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
41
 du -sh data/mbdump # 242G
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
42
 du -sh /tmp/mbdump # 24G
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
43
 |#
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
44
 ;; Directory holding the full mbdump JSON files; input paths are merged against it.
 (defvar *mbdump-directory* (pathname "/mnt/y/data/packy/data/mbdump-full/"))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
45
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
46
 (defun init-mbdump-files (&optional (dir *mbdump-directory*))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
47
   "Count the total number of lines in each file under DIR. Return a
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
48
 hash-table containing filenames->line counts.
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
49
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
50
 This is single-threaded so it does take some time on the full mbdump
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
51
 dataset. If you run this make sure to assign the resulting value to
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
52
 *MBDUMP-FILES*, otherwise use the pre-compiled value."
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
53
   (let ((files (find-files dir))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
54
         (table (make-hash-table :test 'equal)))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
55
     (mapc (lambda (f)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
56
             (setf (gethash (file-namestring f) table) (count-file-lines f)))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
57
           files)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
58
     table))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
59
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
60
 (defvar *mbdump-files* (let ((pairs '(("area.json" . 119164)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
61
                                       ("artist.json" . 2345810)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
62
                                       ("event.json" . 78896)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
63
                                       ("instrument.json" . 1046)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
64
                                       ("label.json" . 271609)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
65
                                       ("place.json" . 63772)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
66
                                       ("recording.json" . 119575)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
67
                                       ("release-group.json" . 3204634)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
68
                                       ("release.json" . 4111554)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
69
                                       ("series.json" . 23376)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
70
                                       ("work.json" . 2078152)))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
71
                              (table (make-hash-table :test 'equal)))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
72
                          (dolist (pair pairs table)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
73
                            (setf (gethash (car pair) table) (cdr pair)))))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
74
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
75
 ;; Output directory for the sampled files: "/tmp/mbdump-" followed by the
 ;; string returned by (file-date), evaluated when this form is first loaded.
 (defvar *target-directory* (pathname (concatenate 'string "/tmp/mbdump-" (file-date) "/")))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
76
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
77
 ;; Accumulates one (output-path . sample-count) cons per PREP-JSON-FILE call.
 (defvar *target* nil)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
78
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
79
 (defun random-line-indexes (max &optional (count 1000))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
80
   (declare (fixnum max count))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
81
   (let ((ret))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
82
     (labels ((%gen () (let ((int (random max)))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
83
                         (when (zerop int) (setf int 1))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
84
                         (if (find int ret)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
85
                             (%gen)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
86
                             int))))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
87
       (sort 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
88
        (dotimes (i count ret)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
89
          (setf ret (cons (%gen) ret)))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
90
        #'<))))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
91
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
92
 (defun prep-json-file (file)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
93
   (let* ((in-path (merge-pathnames file *mbdump-directory*))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
94
          (out-path (merge-pathnames file *target-directory*))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
95
          (max (gethash (namestring file) *mbdump-files*))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
96
          (count (floor max 10))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
97
          (lines (random-line-indexes (gethash (namestring file) *mbdump-files*)))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
98
          (res (cons out-path count)))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
99
     (with-open-files ((out out-path :direction :output :external-format '(:utf-8 :replacement "?"))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
100
                       (in in-path :direction :input :external-format '(:utf-8 :replacement "?")))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
101
       (loop for i in lines
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
102
             with line = (uiop:read-file-line in :at i)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
103
             do (print (file-position in))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
104
             do (write-line line out)))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
105
     (push res *target*)))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
106
 
44
Richard Westhaver <ellis@rwest.io>
parents: 41
diff changeset
107
 (defmain ()
41
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
108
   (ensure-directories-exist *target-directory*)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
109
   (let ((workers))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
110
     (dolist (file (hash-table-keys *mbdump-files*) workers)
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
111
       (push (make-thread (lambda () (prep-json-file file)) :name (format nil "~A prep" file)) workers))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
112
     (time (wait-for-threads workers))))
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
113
 
81b7333f27f8 more examples
Richard Westhaver <ellis@rwest.io>
parents:
diff changeset
114
 ;; (prep-json-file "label.json")