changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > demo / examples/mbdump.lisp

changeset 44: 99d4ab4f8d53
parent: 81b7333f27f8
author: Richard Westhaver <ellis@rwest.io>
date: Sun, 11 Aug 2024 01:50:18 -0400
permissions: -rw-r--r--
description: update
1 ;;; examples/mbdump.lisp --- Prepare a sampling of mbdump JSON data
2 
3 ;; WIP
4 
5 ;;; Commentary:
6 
7 ;; - considering sampling 'releases.json' only. could be a really good
8 ;; benchmark. For now we will sample all files. Soon, we may split
9 ;; releases.json into separate files here which is rather trivial
10 ;; anyway.
11 
12 ;; - using uiop:read-file-line is NOT the right thing to do. This is
13 ;; too bad because I implemented a specialized stream class and then
14 ;; deleted it before committing.
15 
16 ;; - there are two possible solutions I can think of:
17 
18 ;; - single-pass :: for each file, read the first line and calculate
19 ;; the minimal space needed to store a json object in a single
20 ;; line. Instead of incrementing over every character to find the
21 ;; next line, we move the position once by the minimum space, then
22 ;; iterate over characters until we find a newline. We walk the
23 ;; entire file and pick up the random indexes.
24 
25 ;; - double-pass :: for each file, read each line character by
26 ;; character, counting new lines. At each random index calculate
27 ;; and collect the file position. Do a second pass which sets the
28 ;; file position on each iteration before reading a line.
29 
30 ;;; Code:
31 #-prelude (ql:quickload :prelude)
32 (defpackage :mbdump
33  (:use :cl :std :log :sb-thread :sb-concurrency :dat/json :cli/clap :obj/time :sb-gray)
34  (:export :main :*target*))
35 
36 (in-package :mbdump)
37 
38 ;; Ultimately we dump the samples to this directory. It should be
39 ;; roughly 1/10th the original size.
40 #| (in-readtable :shell)
41 du -sh data/mbdump # 242G
42 du -sh /tmp/mbdump # 24G
43 |#
44 (defvar *mbdump-directory* (pathname "/mnt/y/data/packy/data/mbdump-full/"))
45 
46 (defun init-mbdump-files (&optional (dir *mbdump-directory*))
47  "Count the total number of lines in each file under DIR. Return a
48 hash-table containing filenames->line counts.
49 
50 This is single-threaded so it does take some time on the full mbdump
51 dataset. If you run this make sure to assign the resulting value to
52 *MBDUMP-FILES*, otherwise use the pre-compiled value."
53  (let ((files (find-files dir))
54  (table (make-hash-table :test 'equal)))
55  (mapc (lambda (f)
56  (setf (gethash (file-namestring f) table) (count-file-lines f)))
57  files)
58  table))
59 
60 (defvar *mbdump-files* (let ((pairs '(("area.json" . 119164)
61  ("artist.json" . 2345810)
62  ("event.json" . 78896)
63  ("instrument.json" . 1046)
64  ("label.json" . 271609)
65  ("place.json" . 63772)
66  ("recording.json" . 119575)
67  ("release-group.json" . 3204634)
68  ("release.json" . 4111554)
69  ("series.json" . 23376)
70  ("work.json" . 2078152)))
71  (table (make-hash-table :test 'equal)))
72  (dolist (pair pairs table)
73  (setf (gethash (car pair) table) (cdr pair)))))
74 
75 (defvar *target-directory* (pathname (concatenate 'string "/tmp/mbdump-" (file-date) "/")))
76 
77 (defvar *target* nil)
78 
79 (defun random-line-indexes (max &optional (count 1000))
80  (declare (fixnum max count))
81  (let ((ret))
82  (labels ((%gen () (let ((int (random max)))
83  (when (zerop int) (setf int 1))
84  (if (find int ret)
85  (%gen)
86  int))))
87  (sort
88  (dotimes (i count ret)
89  (setf ret (cons (%gen) ret)))
90  #'<))))
91 
92 (defun prep-json-file (file)
93  (let* ((in-path (merge-pathnames file *mbdump-directory*))
94  (out-path (merge-pathnames file *target-directory*))
95  (max (gethash (namestring file) *mbdump-files*))
96  (count (floor max 10))
97  (lines (random-line-indexes (gethash (namestring file) *mbdump-files*)))
98  (res (cons out-path count)))
99  (with-open-files ((out out-path :direction :output :external-format '(:utf-8 :replacement "?"))
100  (in in-path :direction :input :external-format '(:utf-8 :replacement "?")))
101  (loop for i in lines
102  with line = (uiop:read-file-line in :at i)
103  do (print (file-position in))
104  do (write-line line out)))
105  (push res *target*)))
106 
107 (defmain ()
108  (ensure-directories-exist *target-directory*)
109  (let ((workers))
110  (dolist (file (hash-table-keys *mbdump-files*) workers)
111  (push (make-thread (lambda () (prep-json-file file)) :name (format nil "~A prep" file)) workers))
112  (time (wait-for-threads workers))))
113 
114 ;; (prep-json-file "label.json")