diff options
author | Hans Huebner <hans.huebner@gmail.com> | 2004-06-23 08:26:55 +0000 |
---|---|---|
committer | Hans Huebner <hans.huebner@gmail.com> | 2004-06-23 08:26:55 +0000 |
commit | 4122284075b3e259c691956f4a533292df9ffdfa (patch) | |
tree | f3344bac978fdd960ad822b8871527ba196b8d16 /convert.lisp |
Initial revision
git-svn-id: svn://bknr.net/svn/trunk/thirdparty/cl-ppcre@12 4281704c-cde7-0310-8518-8e2dc76b1ff0
Diffstat (limited to 'convert.lisp')
-rw-r--r-- | convert.lisp | 775 |
1 files changed, 775 insertions, 0 deletions
diff --git a/convert.lisp b/convert.lisp new file mode 100644 index 0000000..c0bbff7 --- /dev/null +++ b/convert.lisp @@ -0,0 +1,775 @@ +;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: CL-PPCRE; Base: 10 -*- +;;; $Header: /home/manuel/bknr-cvs/cvs/thirdparty/cl-ppcre/convert.lisp,v 1.1 2004/06/23 08:27:10 hans Exp $ + +;;; Here the parse tree is converted into its internal representation +;;; using REGEX objects. At the same time some optimizations are +;;; already applied. + +;;; Copyright (c) 2002-2003, Dr. Edmund Weitz. All rights reserved. + +;;; Redistribution and use in source and binary forms, with or without +;;; modification, are permitted provided that the following conditions +;;; are met: + +;;; * Redistributions of source code must retain the above copyright +;;; notice, this list of conditions and the following disclaimer. + +;;; * Redistributions in binary form must reproduce the above +;;; copyright notice, this list of conditions and the following +;;; disclaimer in the documentation and/or other materials +;;; provided with the distribution. + +;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED +;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY +;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE +;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +(in-package #:cl-ppcre) + +;;; The flags that represent the "ism" modifiers are always kept +;;; together in a three-element list. We use the following macros to +;;; access individual elements. + +(defmacro case-insensitive-mode-p (flags) + "Accessor macro to extract the first flag out of a three-element flag list." + `(first ,flags)) + +(defmacro multi-line-mode-p (flags) + "Accessor macro to extract the second flag out of a three-element flag list." + `(second ,flags)) + +(defmacro single-line-mode-p (flags) + "Accessor macro to extract the third flag out of a three-element flag list." + `(third ,flags)) + +(defun set-flag (token) + (declare (optimize speed + (safety 0) + (space 0) + (debug 0) + (compilation-speed 0) + #+:lispworks (hcl:fixnum-safety 0))) + (declare (special flags)) + "Reads a flag token and sets or unsets the corresponding entry in +the special FLAGS list." + (case token + ((:case-insensitive-p) + (setf (case-insensitive-mode-p flags) t)) + ((:case-sensitive-p) + (setf (case-insensitive-mode-p flags) nil)) + ((:multi-line-mode-p) + (setf (multi-line-mode-p flags) t)) + ((:not-multi-line-mode-p) + (setf (multi-line-mode-p flags) nil)) + ((:single-line-mode-p) + (setf (single-line-mode-p flags) t)) + ((:not-single-line-mode-p) + (setf (single-line-mode-p flags) nil)) + (otherwise + (signal-ppcre-syntax-error "Unknown flag token ~A" token)))) + +(defun add-range-to-hash (hash from to) + (declare (optimize speed + (safety 0) + (space 0) + (debug 0) + (compilation-speed 0) + #+:lispworks (hcl:fixnum-safety 0))) + (declare (special flags)) + "Adds all characters from character FROM to character TO (inclusive) +to the char class hash HASH. Does the right thing with respect to +case-(in)sensitivity as specified by the special variable FLAGS." + (let ((from-code (char-code from)) + (to-code (char-code to))) + (when (> from-code to-code) + (signal-ppcre-syntax-error "Invalid range from ~A to ~A in char-class" + from to)) + (cond ((case-insensitive-mode-p flags) + (loop for code from from-code to to-code + for chr = (code-char code) + do (setf (gethash (char-upcase chr) hash) t + (gethash (char-downcase chr) hash) t))) + (t + (loop for code from from-code to to-code + do (setf (gethash (code-char code) hash) t)))) + hash)) + +(defun convert-char-class-to-hash (list) + (declare (optimize speed + (safety 0) + (space 0) + (debug 0) + (compilation-speed 0) + #+:lispworks (hcl:fixnum-safety 0))) + "Combines all items in LIST into one char class hash and returns it. +Items can be single characters, character ranges like \(:RANGE #\\A +#\\E), or special character classes like :DIGIT-CLASS. Does the right +thing with respect to case-\(in)sensitivity as specified by the +special variable FLAGS." + (loop with hash = (make-hash-table :size (ceiling (expt *regex-char-code-limit* (/ 1 4))) + :rehash-size (float (expt *regex-char-code-limit* (/ 1 4))) + :rehash-threshold 1.0) + for item in list + if (characterp item) + ;; treat a single character C like a range (:RANGE C C) + do (add-range-to-hash hash item item) + else if (symbolp item) + ;; special character classes + do (setq hash + (case item + ((:digit-class) + (merge-hash hash +digit-hash+)) + ((:non-digit-class) + (merge-inverted-hash hash +digit-hash+)) + ((:whitespace-char-class) + (merge-hash hash +whitespace-char-hash+)) + ((:non-whitespace-char-class) + (merge-inverted-hash hash +whitespace-char-hash+)) + ((:word-char-class) + (merge-hash hash +word-char-hash+)) + ((:non-word-char-class) + (merge-inverted-hash hash +word-char-hash+)) + (otherwise + (signal-ppcre-syntax-error + "Unknown symbol ~A in character class" + item)))) + else if (and (consp item) + (eq (car item) :range)) + ;; proper ranges + do (add-range-to-hash hash + (second item) + (third item)) + else do (signal-ppcre-syntax-error "Unknown item ~A in char-class list" + item) + finally (return hash))) + +(defun maybe-split-repetition (regex + greedyp + minimum + maximum + min-len + length + reg-seen) + (declare (optimize speed + (safety 0) + (space 0) + (debug 0) + (compilation-speed 0) + #+:lispworks (hcl:fixnum-safety 0))) + (declare (type fixnum minimum) + (type (or fixnum null) maximum)) + "Splits a REPETITION object into a constant and a varying part if +applicable, i.e. something like + a{3,} -> a{3}a* +The arguments to this function correspond to the REPETITION slots of +the same name." + ;; note the usage of COPY-REGEX here; we can't use the same REGEX + ;; object in both REPETITIONS because they will have different + ;; offsets + (when maximum + (when (zerop maximum) + ;; trivial case: don't repeat at all + (return-from maybe-split-repetition + (make-instance 'void))) + (when (= 1 minimum maximum) + ;; another trivial case: "repeat" exactly once + (return-from maybe-split-repetition + regex))) + ;; first set up the constant part of the repetition + ;; maybe that's all we need + (let ((constant-repetition (if (plusp minimum) + (make-instance 'repetition + :regex (copy-regex regex) + :greedyp greedyp + :minimum minimum + :maximum minimum + :min-len min-len + :len length + :contains-register-p reg-seen) + ;; don't create garbage if minimum is 0 + nil))) + (when (and maximum + (= maximum minimum)) + (return-from maybe-split-repetition + ;; no varying part needed because min = max + constant-repetition)) + ;; now construct the varying part + (let ((varying-repetition + (make-instance 'repetition + :regex regex + :greedyp greedyp + :minimum 0 + :maximum (if maximum (- maximum minimum) nil) + :min-len min-len + :len length + :contains-register-p reg-seen))) + (cond ((zerop minimum) + ;; min = 0, no constant part needed + varying-repetition) + ((= 1 minimum) + ;; min = 1, constant part needs no REPETITION wrapped around + (make-instance 'seq + :elements (list (copy-regex regex) + varying-repetition))) + (t + ;; general case + (make-instance 'seq + :elements (list constant-repetition + varying-repetition))))))) + +;; During the conversion of the parse tree we keep track of the start +;; of the parse tree in the special variable STARTS-WITH which'll +;; either hold a STR object or an EVERYTHING object. The latter is the +;; case if the regex starts with ".*" which implicitely anchors the +;; regex at the start (perhaps modulo #\Newline). + +(defmethod maybe-accumulate ((str str)) + (declare (optimize speed + (safety 0) + (space 0) + (debug 0) + (compilation-speed 0) + #+:lispworks (hcl:fixnum-safety 0))) + (declare (special accumulate-start-p starts-with)) + (declare (ftype (function (t) fixnum) len)) + "Accumulate STR into the special variable STARTS-WITH if +ACCUMULATE-START-P (also special) is true and STARTS-WITH is either +NIL or a STR object of the same case mode. Always returns NIL." + (when accumulate-start-p + (etypecase starts-with + (str + ;; STARTS-WITH already holds a STR, so we check if we can + ;; concatenate + (cond ((eq (case-insensitive-p starts-with) + (case-insensitive-p str)) + ;; we modify STARTS-WITH in place + (setf (len starts-with) + (+ (len starts-with) (len str))) + ;; note that we use SLOT-VALUE because the accessor + ;; STR has a declared FTYPE which doesn't fit here + (adjust-array (slot-value starts-with 'str) + (len starts-with) + :fill-pointer t) + (setf (subseq (slot-value starts-with 'str) + (- (len starts-with) (len str))) + (str str) + ;; STR objects that are parts of STARTS-WITH + ;; always have their SKIP slot set to true + ;; because the SCAN function will take care of + ;; them, i.e. the matcher can ignore them + (skip str) t)) + (t (setq accumulate-start-p nil)))) + (null + ;; STARTS-WITH is still empty, so we create a new STR object + (setf starts-with + (make-instance 'str + :str "" + :case-insensitive-p (case-insensitive-p str)) + ;; INITIALIZE-INSTANCE will coerce the STR to a simple + ;; string, so we have to fill it afterwards + (slot-value starts-with 'str) + (make-array (len str) + :initial-contents (str str) + :element-type 'character + :fill-pointer t + :adjustable t) + (len starts-with) + (len str) + ;; see remark about SKIP above + (skip str) t)) + (everything + ;; STARTS-WITH already holds an EVERYTHING object - we can't + ;; concatenate + (setq accumulate-start-p nil)))) + nil) + +(defun convert-aux (parse-tree) + (declare (optimize speed + (safety 0) + (space 0) + (debug 0) + (compilation-speed 0) + #+:lispworks (hcl:fixnum-safety 0))) + (declare (special flags reg-num accumulate-start-p starts-with max-back-ref)) + "Converts the parse tree PARSE-TREE into a REGEX object and returns it. + +Will also + - split and optimize repetitions, + - accumulate strings or EVERYTHING objects into the special variable + STARTS-WITH, + - keep track of all registers seen in the special variable REG-NUM, + - keep track of the highest backreference seen in the special + variable MAX-BACK-REF, + - maintain and adher to the currently applicable modifiers in the special + variable FLAGS, and + - maybe even wash your car..." + (cond ((consp parse-tree) + (case (first parse-tree) + ;; (:SEQUENCE {<regex>}*) + ((:sequence) + (cond ((cddr parse-tree) + ;; this is essentially like + ;; (MAPCAR 'CONVERT-AUX (REST PARSE-TREE)) + ;; but we don't cons a new list + (loop for parse-tree-rest on (rest parse-tree) + while parse-tree-rest + do (setf (car parse-tree-rest) + (convert-aux (car parse-tree-rest)))) + (make-instance 'seq + :elements (rest parse-tree))) + (t (convert-aux (second parse-tree))))) + ;; (:GROUP {<regex>}*) + ;; this is a syntactical construct equivalent to :SEQUENCE + ;; intended to keep the effect of modifiers local + ((:group) + ;; make a local copy of FLAGS and shadow the global + ;; value while we descend into the enclosed regexes + (let ((flags (copy-list flags))) + (declare (special flags)) + (cond ((cddr parse-tree) + (loop for parse-tree-rest on (rest parse-tree) + while parse-tree-rest + do (setf (car parse-tree-rest) + (convert-aux (car parse-tree-rest)))) + (make-instance 'seq + :elements (rest parse-tree))) + (t (convert-aux (second parse-tree)))))) + ;; (:ALTERNATION {<regex>}*) + ((:alternation) + ;; we must stop accumulating objects into STARTS-WITH + ;; once we reach an alternation + (setq accumulate-start-p nil) + (loop for parse-tree-rest on (rest parse-tree) + while parse-tree-rest + do (setf (car parse-tree-rest) + (convert-aux (car parse-tree-rest)))) + (make-instance 'alternation + :choices (rest parse-tree))) + ;; (:BRANCH <test> <regex>) + ;; <test> must be look-ahead, look-behind or number; + ;; if <regex> is an alternation it must have one or two + ;; choices + ((:branch) + (setq accumulate-start-p nil) + (let* ((test-candidate (second parse-tree)) + (test (cond ((numberp test-candidate) + (when (zerop (the fixnum test-candidate)) + (signal-ppcre-syntax-error + "Register 0 doesn't exist: ~S" + parse-tree)) + (1- (the fixnum test-candidate))) + (t (convert-aux test-candidate)))) + (alternations (convert-aux (third parse-tree)))) + (when (and (not (numberp test)) + (not (typep test 'lookahead)) + (not (typep test 'lookbehind))) + (signal-ppcre-syntax-error + "Branch test must be look-ahead, look-behind or number: ~S" + parse-tree)) + (typecase alternations + (alternation + (case (length (choices alternations)) + ((0) + (signal-ppcre-syntax-error "No choices in branch: ~S" + parse-tree)) + ((1) + (make-instance 'branch + :test test + :then-regex (first + (choices alternations)))) + ((2) + (make-instance 'branch + :test test + :then-regex (first + (choices alternations)) + :else-regex (second + (choices alternations)))) + (otherwise + (signal-ppcre-syntax-error + "Too much choices in branch: ~S" + parse-tree)))) + (t + (make-instance 'branch + :test test + :then-regex alternations))))) + ;; (:POSITIVE-LOOKAHEAD|:NEGATIVE-LOOKAHEAD <regex>) + ((:positive-lookahead :negative-lookahead) + ;; keep the effect of modifiers local to the enclosed + ;; regex and stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + (let ((flags (copy-list flags))) + (declare (special flags)) + (make-instance 'lookahead + :regex (convert-aux (second parse-tree)) + :positivep (eq (first parse-tree) + :positive-lookahead)))) + ;; (:POSITIVE-LOOKBEHIND|:NEGATIVE-LOOKBEHIND <regex>) + ((:positive-lookbehind :negative-lookbehind) + ;; keep the effect of modifiers local to the enclosed + ;; regex and stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + (let* ((flags (copy-list flags)) + (regex (convert-aux (second parse-tree))) + (len (regex-length regex))) + (declare (special flags)) + ;; lookbehind assertions must be of fixed length + (unless len + (signal-ppcre-syntax-error + "Variable length look-behind not implemented (yet): ~S" + parse-tree)) + (make-instance 'lookbehind + :regex regex + :positivep (eq (first parse-tree) + :positive-lookbehind) + :len len))) + ;; (:GREEDY-REPETITION|:NON-GREEDY-REPETITION <min> <max> <regex>) + ((:greedy-repetition :non-greedy-repetition) + ;; remember the value of ACCUMULATE-START-P upon entering + (let ((local-accumulate-start-p accumulate-start-p)) + (let ((minimum (second parse-tree)) + (maximum (third parse-tree))) + (declare (type fixnum minimum)) + (declare (type (or null fixnum) maximum)) + (unless (and maximum + (= 1 minimum maximum)) + ;; set ACCUMULATE-START-P to NIL for the rest of + ;; the conversion because we can't continue to + ;; accumulate inside as well as after a proper + ;; repetition + (setq accumulate-start-p nil)) + (let* (reg-seen + (regex (convert-aux (fourth parse-tree))) + (min-len (regex-min-length regex)) + (greedyp (eq (first parse-tree) :greedy-repetition)) + (length (regex-length regex))) + ;; note that this declaration already applies to + ;; the call to CONVERT-AUX above + (declare (special reg-seen)) + (when (and local-accumulate-start-p + (not starts-with) + (zerop minimum) + (not maximum)) + ;; if this repetition is (equivalent to) ".*" + ;; and if we're at the start of the regex we + ;; remember it for ADVANCE-FN (see the SCAN + ;; function) + (setq starts-with (everythingp regex))) + (if (or (not reg-seen) + (not greedyp) + (not length) + (zerop length) + (and maximum (= minimum maximum))) + ;; the repetition doesn't enclose a register, or + ;; it's not greedy, or we can't determine it's + ;; (inner) length, or the length is zero, or the + ;; number of repetitions is fixed; in all of + ;; these cases we don't bother to optimize + (maybe-split-repetition regex + greedyp + minimum + maximum + min-len + length + reg-seen) + ;; otherwise we make a transformation that looks + ;; roughly like one of + ;; <regex>* -> (?:<regex'>*<regex>)? + ;; <regex>+ -> <regex'>*<regex> + ;; where the trick is that as much as possible + ;; registers from <regex> are removed in + ;; <regex'> + (let* (reg-seen ; new instance for REMOVE-REGISTERS + (remove-registers-p t) + (inner-regex (remove-registers regex)) + (inner-repetition + ;; this is the "<regex'>" part + (maybe-split-repetition inner-regex + ;; always greedy + t + ;; reduce minimum by 1 + ;; unless it's already 0 + (if (zerop minimum) + 0 + (1- minimum)) + ;; reduce maximum by 1 + ;; unless it's NIL + (and maximum + (1- maximum)) + min-len + length + reg-seen)) + (inner-seq + ;; this is the "<regex'>*<regex>" part + (make-instance 'seq + :elements (list inner-repetition + regex)))) + ;; note that this declaration already applies + ;; to the call to REMOVE-REGISTERS above + (declare (special remove-registers-p reg-seen)) + ;; wrap INNER-SEQ with a greedy + ;; {0,1}-repetition (i.e. "?") if necessary + (if (plusp minimum) + inner-seq + (maybe-split-repetition inner-seq + t + 0 + 1 + min-len + nil + t)))))))) + ;; (:REGISTER <regex>) + ((:register) + ;; keep the effect of modifiers local to the enclosed + ;; regex; also, assign the current value of REG-NUM to + ;; the corresponding slot of the REGISTER object and + ;; increase this counter afterwards + (let ((flags (copy-list flags)) + (stored-reg-num reg-num)) + (declare (special flags reg-seen)) + (setq reg-seen t) + (incf (the fixnum reg-num)) + (make-instance 'register + :regex (convert-aux (second parse-tree)) + :num stored-reg-num))) + ;; (:STANDALONE <regex>) + ((:standalone) + ;; keep the effect of modifiers local to the enclosed + ;; regex + (let ((flags (copy-list flags))) + (declare (special flags)) + (make-instance 'standalone + :regex (convert-aux (second parse-tree))))) + ;; (:BACK-REFERENCE <number>) + ((:back-reference) + (let ((backref-number (second parse-tree))) + (declare (type fixnum backref-number)) + (when (or (not (typep backref-number 'fixnum)) + (<= backref-number 0)) + (signal-ppcre-syntax-error + "Illegal back-reference: ~S" + parse-tree)) + ;; stop accumulating into STARTS-WITH and increase + ;; MAX-BACK-REF if necessary + (setq accumulate-start-p nil + max-back-ref (max (the fixnum max-back-ref) + backref-number)) + (make-instance 'back-reference + ;; we start counting from 0 internally + :num (1- backref-number) + :case-insensitive-p (case-insensitive-mode-p + flags)))) + ;; (:CHAR-CLASS|:INVERTED-CHAR-CLASS {<item>}*) + ;; where item is one of + ;; - a character + ;; - a character range: (:RANGE <char1> <char2>) + ;; - a special char class symbol like :DIGIT-CHAR-CLASS + ((:char-class :inverted-char-class) + ;; first create the hash-table and some auxiliary values + (let* (hash + hash-keys + (count most-positive-fixnum) + (item-list (rest parse-tree)) + (invertedp (eq (first parse-tree) :inverted-char-class)) + word-char-class-p) + (cond ((every (lambda (item) (eq item :word-char-class)) + item-list) + ;; treat "[\\w]" like "\\w" + (setq word-char-class-p t)) + ((every (lambda (item) (eq item :non-word-char-class)) + item-list) + ;; treat "[\\W]" like "\\W" + (setq word-char-class-p t) + (setq invertedp (not invertedp))) + (t + (setq hash (convert-char-class-to-hash item-list) + count (hash-table-count hash)) + (when (<= count 2) + ;; collect the hash-table keys into a list if + ;; COUNT is smaller than 3 + (setq hash-keys + (loop for chr being the hash-keys of hash + collect chr))))) + (cond ((and (not invertedp) + (= count 1)) + ;; convert one-element hash table into a STR + ;; object and try to accumulate into + ;; STARTS-WITH + (let ((str (make-instance 'str + :str (string + (first hash-keys)) + :case-insensitive-p nil))) + (maybe-accumulate str) + str)) + ((and (not invertedp) + (= count 2) + (char-equal (first hash-keys) (second hash-keys))) + ;; convert two-element hash table into a + ;; case-insensitive STR object and try to + ;; accumulate into STARTS-WITH if the two + ;; characters are CHAR-EQUAL + (let ((str (make-instance 'str + :str (string + (first hash-keys)) + :case-insensitive-p t))) + (maybe-accumulate str) + str)) + (t + ;; the general case; stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + (make-instance 'char-class + :hash hash + :case-insensitive-p + (case-insensitive-mode-p flags) + :invertedp invertedp + :word-char-class-p word-char-class-p))))) + ;; (:FLAGS {<flag>}*) + ;; where flag is a modifier symbol like :CASE-INSENSITIVE-P + ((:flags) + ;; set/unset the flags corresponding to the symbols + ;; following :FLAGS + (mapc #'set-flag (rest parse-tree)) + ;; we're only interested in the side effect of + ;; setting/unsetting the flags and turn this syntactical + ;; construct into a VOID object which'll be optimized + ;; away when creating the matcher + (make-instance 'void)) + (otherwise + (signal-ppcre-syntax-error + "Unknown token ~A in parse-tree" + (first parse-tree))))) + ((or (characterp parse-tree) (stringp parse-tree)) + ;; turn characters or strings into STR objects and try to + ;; accumulate into STARTS-WITH + (let ((str (make-instance 'str + :str (string parse-tree) + :case-insensitive-p + (case-insensitive-mode-p flags)))) + (maybe-accumulate str) + str)) + (t + ;; and now for the tokens which are symbols + (case parse-tree + ((:void) + (make-instance 'void)) + ((:word-boundary) + (make-instance 'word-boundary :negatedp nil)) + ((:non-word-boundary) + (make-instance 'word-boundary :negatedp t)) + ;; the special character classes + ((:digit-class + :non-digit-class + :word-char-class + :non-word-char-class + :whitespace-char-class + :non-whitespace-char-class) + ;; stop accumulating into STARTS-WITH + (setq accumulate-start-p nil) + (make-instance 'char-class + ;; use the constants defined in util.lisp + :hash (case parse-tree + ((:digit-class + :non-digit-class) + +digit-hash+) + ((:word-char-class + :non-word-char-class) + nil) + ((:whitespace-char-class + :non-whitespace-char-class) + +whitespace-char-hash+)) + ;; this value doesn't really matter but + ;; NIL should result in slightly faster + ;; matchers + :case-insensitive-p nil + :invertedp (member parse-tree + '(:non-digit-class + :non-word-char-class + :non-whitespace-char-class) + :test #'eq) + :word-char-class-p (member parse-tree + '(:word-char-class + :non-word-char-class) + :test #'eq))) + ((:start-anchor ; Perl's "^" + :end-anchor ; Perl's "$" + :modeless-end-anchor-no-newline + ; Perl's "\z" + :modeless-start-anchor ; Perl's "\A" + :modeless-end-anchor) ; Perl's "\Z" + (make-instance 'anchor + :startp (member parse-tree + '(:start-anchor + :modeless-start-anchor) + :test #'eq) + ;; set this value according to the + ;; current settings of FLAGS (unless it's + ;; a modeless anchor) + :multi-line-p + (and (multi-line-mode-p flags) + (not (member parse-tree + '(:modeless-start-anchor + :modeless-end-anchor + :modeless-end-anchor-no-newline) + :test #'eq))) + :no-newline-p + (eq parse-tree + :modeless-end-anchor-no-newline))) + ((:everything) + ;; stop accumulating into STARTS-WITHS + (setq accumulate-start-p nil) + (make-instance 'everything + :single-line-p (single-line-mode-p flags))) + ;; special tokens corresponding to Perl's "ism" modifiers + ((:case-insensitive-p + :case-sensitive-p + :multi-line-mode-p + :not-multi-line-mode-p + :single-line-mode-p + :not-single-line-mode-p) + ;; we're only interested in the side effect of + ;; setting/unsetting the flags and turn these tokens + ;; into VOID objects which'll be optimized away when + ;; creating the matcher + (set-flag parse-tree) + (make-instance 'void)) + (otherwise + (signal-ppcre-syntax-error "Unknown token ~A in parse-tree" + parse-tree)))))) + +(defun convert (parse-tree) + (declare (optimize speed + (safety 0) + (space 0) + (debug 0) + (compilation-speed 0) + #+:lispworks (hcl:fixnum-safety 0))) + "Converts the parse tree PARSE-TREE into an equivalent REGEX object +and returns three values: the REGEX object, the number of registers +seen and an object the regex starts with which is either a STR object +or an EVERYTHING object (if the regex starts with something like +\".*\") or NIL." + ;; this function basically just initializes the special variables + ;; and then calls CONVERT-AUX to do all the work + (let* ((flags (list nil nil nil)) + (reg-num 0) + (accumulate-start-p t) + starts-with + (max-back-ref 0) + (converted-parse-tree (convert-aux parse-tree))) + (declare (special flags reg-num accumulate-start-p starts-with max-back-ref)) + ;; make sure we don't reference registers which aren't there + (when (> (the fixnum max-back-ref) + (the fixnum reg-num)) + (signal-ppcre-syntax-error + "Backreference to register ~A which has not been defined" + max-back-ref)) + (when (typep starts-with 'str) + (setf (slot-value starts-with 'str) + (coerce (slot-value starts-with 'str) 'simple-string))) + (values converted-parse-tree reg-num starts-with))) |