# HG changeset patch # User Richard Westhaver # Date 1722805103 14400 # Node ID 60c7b1c83c47fb3f0bbb5e35adc45481929c563b # Parent efb4a19ff530df95034453d6db3d9d913baf2190 more sql query updates diff -r efb4a19ff530 -r 60c7b1c83c47 lisp/lib/obj/pkg.lisp --- a/lisp/lib/obj/pkg.lisp Sun Aug 04 00:18:52 2024 -0400 +++ b/lisp/lib/obj/pkg.lisp Sun Aug 04 16:58:23 2024 -0400 @@ -424,7 +424,20 @@ :lt-expression :gt-expression :neq-expression - :eq-expression)) + :eq-expression + :lhs + :rhs + :logical-plan + :aggregate-expression-p + :df-col + :df-project + :df-filter + :df-aggregate + :df-select + :df-fields + :df-data + :limit + :make-df)) (defpackage :obj/secret (:nicknames :secret) diff -r efb4a19ff530 -r 60c7b1c83c47 lisp/lib/obj/query.lisp --- a/lisp/lib/obj/query.lisp Sun Aug 04 00:18:52 2024 -0400 +++ b/lisp/lib/obj/query.lisp Sun Aug 04 16:58:23 2024 -0400 @@ -171,18 +171,6 @@ (defgeneric scan-data-source (self projection) (:documentation "Scan the data source, selecting the specified columns.")) -;;; Dataframes -;; minimal data-frame abstraction. methods are prefixed with 'DF-'. -(defclass data-frame () - ((fields :initform #() :initarg :fields :accessor df-fields) - (data :initform #() :initarg :data :accessor df-data))) - -(defgeneric df-col (self)) - -(defgeneric df-project (&rest expr &key &allow-other-keys)) -(defgeneric df-filter (expr)) -(defgeneric df-aggregate (group-by agg-expr)) - ;;; Expressions (defclass query-expression () ()) @@ -221,12 +209,12 @@ ;;;;; Alias (defclass alias-expression (logical-expression) - ((expr :type logical-expression :initarg :expr) + ((expr :type logical-expression :initarg :expr :accessor expr) (alias :type string :initarg :alias))) ;;;;; Unary (defclass unary-expression (logical-expression) - ((expr :type logical-expression))) + ((expr :type logical-expression :accessor expr))) ;;;;; Binary (defclass binary-expression (logical-expression) @@ -294,7 +282,7 @@ ;; TODO 2024-08-03: ??? (defmethod to-field ((self math-expression) (input logical-plan)) (declare (ignorable input)) - (make-field :name "mult" :type (field-type (to-field (lhs self) input)))) + (make-field :name "*" :type (field-type (to-field (lhs self) input)))) (defclass add-expression (math-expression) () (:default-initargs @@ -328,7 +316,11 @@ (defclass aggregate-expression (logical-expression) ((name :type string) - (expr :type logical-expression))) + (expr :type logical-expression :accessor expr))) + +(defgeneric aggregate-expression-p (self) + (:method ((self aggregate-expression)) t) + (:method ((self alias-expression)) (aggregate-expression-p (expr self)))) (defmethod to-field ((self aggregate-expression) (input logical-plan)) (declare (ignorable input)) @@ -406,6 +398,114 @@ do (push (to-field a input) ret)) (make-schema :fields (coerce ret 'field-vector)))) +;;;;; Limit +(defclass limit (logical-plan) + ((input :type logical-plan :initarg :input) + (limit :type integer))) + +(defmethod schema ((self limit)) + (setf (slot-value self 'schema) + (schema (slot-value self 'input)))) + +(defmethod children ((self limit)) + (setf (slot-value self 'children) + (children (slot-value self 'input)))) + +;;;;; Joins +(defclass join (logical-plan) + ((left :accessor lhs) + (right :accessor rhs) + (on :accessor join-on))) + +(defclass inner-join (join) ()) +;; (defclass outer-join (join)) +(defclass left-join (join) ()) +(defclass right-join (join) ()) +;; left-outer-join +;; right-outer-join +;; semi-join +;; anti-join +;; cross-join + +(defmethod schema ((self join)) + ;; TODO 2024-08-04: test better dupe impl + (let ((dupes (mapcon #'(lambda (l) (when (eq (car l) (second l)) (list (car l)))) + (coerce (join-on self) 'list))) + (schema (make-instance 'schema))) + (setf (fields schema) + (typecase self + (right-join + (let ((l (remove-if (lambda (x) (member x dupes :test 'string-equal)) (fields (schema (lhs self))))) + (r (fields (schema (rhs self))))) + (merge 'vector l r (lambda (x y) (declare (ignore y)) x)))) + (inner-join + (let ((l (fields (schema (lhs self)))) + (r (remove-if (lambda (x) (member x dupes :test 'string-equal)) (fields (schema (rhs self)))))) + (merge 'vector l r (lambda (x y) (declare (ignore y)) x)))))) + schema)) + +(defmethod children ((self join)) + (vector (lhs self) (rhs self))) + +;;; Subqueries + +;; TODO 2024-08-02: + +;; subquery + +;; correlated-subquery + +;; SELECT id, name, (SELECT count(*) FROM orders WHERE customer_id = customer.id) AS num_orders FROM customers + +;; uncorrelated-subquery + +;; scalar-subquery + +;; SELECT * FROM orders WHERE total > (SELECT avg(total) FROM sales WHERE customer_state = 'CA') + +;; NOTE 2024-08-02: EXISTS, IN, NOT EXISTS, and NOT IN are also subqueries + +;;; Dataframes +;; minimal data-frame abstraction. methods are prefixed with 'DF-'. +(defstruct (data-frame (:constructor make-data-frame (&optional plan))) + (plan (make-instance 'logical-plan) :type logical-plan)) + +(defgeneric df-col (self)) +(defgeneric df-project (df exprs) + (:method ((df data-frame) (expr list)) + (df-project df (coerce expr 'vector))) + (:method ((df data-frame) (expr vector)) + (setf (data-frame-plan df) + (make-instance 'projection + :input (data-frame-plan df) + :expr expr)) + df)) + +(defgeneric df-filter (df expr) + (:method ((df data-frame) (expr logical-expression)) + (setf (data-frame-plan df) + (make-instance 'selection :input (data-frame-plan df) :expr expr)) + df)) + +(defgeneric df-aggregate (df group-by agg-expr) + (:method ((df data-frame) (group-by vector) (agg-expr vector)) + (setf (data-frame-plan df) + (make-instance 'aggregate :input (data-frame-plan df) + :group-expr group-by + :agg-expr agg-expr)) + df) + (:method ((df data-frame) (group-by list) (agg-expr list)) + (df-aggregate df (coerce group-by 'vector) (coerce agg-expr 'vector)))) + +(defgeneric make-df (&rest initargs &key &allow-other-keys)) + +(defmethod schema ((df data-frame)) + (schema (data-frame-plan df))) + +(defgeneric df-plan (df) + (:documentation "Return the logical plan associated with this data-frame.") + (:method ((df data-frame)) (data-frame-plan df))) + ;;; Physical Expression (defclass physical-expression (query-expression) ()) @@ -685,38 +785,6 @@ :group-expr (make-physical-expression (slot-value plan 'group-expr) (slot-value plan 'input)) :agg-expr (make-physical-expression (slot-value plan 'agg-expr) (slot-value plan 'input))))))) -;;; Joins - -;; TODO 2024-08-02: - -;; inner-join - -;; outer-join left-outer-join right-outer-join - -;; semi-join - -;; anti-join - -;; cross-join - -;;; Subqueries - -;; TODO 2024-08-02: - -;; subquery - -;; correlated-subquery - -;; SELECT id, name, (SELECT count(*) FROM orders WHERE customer_id = customer.id) AS num_orders FROM customers - -;; uncorrelated-subquery - -;; scalar-subquery - -;; SELECT * FROM orders WHERE total > (SELECT avg(total) FROM sales WHERE customer_state = 'CA') - -;; NOTE 2024-08-02: EXISTS, IN, NOT EXISTS, and NOT IN are also subqueries - ;;; Optimizer ;; The Query Optimizer is responsible for walking a QUERY-PLAN and returning a diff -r efb4a19ff530 -r 60c7b1c83c47 lisp/lib/q/sql.lisp --- a/lisp/lib/q/sql.lisp Sun Aug 04 00:18:52 2024 -0400 +++ b/lisp/lib/q/sql.lisp Sun Aug 04 16:58:23 2024 -0400 @@ -37,7 +37,7 @@ (defun illegal-sql-state (state) (error 'illegal-sql-state :state state)) -;;; Objects +;;; Logical Classes (defclass sql-query (query) ()) (defclass sql-data-source (data-source) () @@ -83,10 +83,6 @@ (having :type (or null sql-expression) :initarg :having) (table-name :type string :initarg :table-name))) -(defclass sql-planner (query-planner) ()) - -(defclass sql-optimizer (query-optimizer) ()) - ;;; Lexer (eval-always (defvar *sql-token-types* (list :ident :str :num :kw :op :sym t)) @@ -550,7 +546,7 @@ #.(get-sql-symbol :EQ) #.(get-sql-symbol :GT) #.(get-sql-symbol :LT)) :test 'string=) - ;; (pop tokens) ;; consume + (pop (sql-tokens self)) ;; consume (make-instance 'sql-math-expression :lhs left :op (sql-token-text token) @@ -670,6 +666,46 @@ `(with-sql-parser (,sym (read-sql-stream ,stream)) ,@body)) +;;; Planner +(defclass sql-logical-plan (logical-plan) ()) +(defclass sql-physical-plan (physical-plan) ()) + +(defmethod make-physical-expression ((expr sql-expression) (input sql-logical-plan))) +(defmethod make-physical-plan ((plan sql-logical-plan))) + +(defclass sql-planner (query-planner) ()) + +(defun make-sql-logical-expression ()) +(defun get-ref-columns ()) +(defun get-selection-ref-columns ()) +(defun plan-non-aggregate-query ()) +(defun plan-aggregate-query ()) + +(defun make-sql-data-frame (select tables) + (let* ((table (gethash (slot-value select 'table-name) + tables + (simple-sql-error "No table named ~A" (slot-value select 'table-name)))) + (proj (map 'vector + (lambda (x) (make-sql-logical-expression x table)) + (slot-value select 'projection))) + (cols-in-proj (get-ref-columns proj)) + (agg-count (count-if 'aggregate-expression-p proj))) + (when (and (zerop agg-count) (not (sequence:emptyp (slot-value select 'group-by)))) + (simple-sql-error "GROUP BY without aggregate expression is not supported")) + (let ((cols-in-sel (get-selection-ref-columns select table)) + (plan table)) + (if (zerop agg-count) + (plan-non-aggregate-query select plan proj cols-in-sel cols-in-proj) + (let ((pro) + (agg) + (n-group-cols 0) + (group-count 0)) + plan))))) + +;;; Optimizer +(defclass sql-optimizer (query-optimizer) ()) + +;;; Top-level Macros (defmacro with-sql ((sym input &key (parse t) optimize execute) &body body) (declare (ignore optimize execute)) `(with-sql-parser (,sym ,@(etypecase input diff -r efb4a19ff530 -r 60c7b1c83c47 lisp/lib/q/tests.lisp --- a/lisp/lib/q/tests.lisp Sun Aug 04 00:18:52 2024 -0400 +++ b/lisp/lib/q/tests.lisp Sun Aug 04 16:58:23 2024 -0400 @@ -4,7 +4,7 @@ ;;; Code: (defpackage :q/tests - (:use :cl :std :rt :q :log :parse/pratt)) + (:use :cl :std :rt :q :log :parse/pratt :obj/query)) (in-package :q/tests) @@ -15,16 +15,14 @@ (deftest sanity ()) (deftest sql-select () - (is (typep - (with-sql-parser (expr (read-sql-string "SELECT a,b,c FROM FOO")) - (parse expr)) - 'sql-select))) + (with-sql-parser (expr (read-sql-string "SELECT * FROM FOO")) + (is (typep (parse expr) 'sql-select)))) (deftest sql-math () - (is (typep - (with-sql (expr "1 + 2 * 3") - expr) - 'sql-math-expression))) + (with-sql (expr "1 + 2 * 3") + (is (typep expr 'sql-math-expression)) + (is (typep (rhs expr) 'sql-math-expression)) + (is (typep (lhs expr) 'sql-number)))) ;; https://www.cpp.edu/~jrfisher/www/prolog_tutorial/2_1.html (deftest dql ()