core:lisp/lib/nlp/stem/porter.lisp

changelog shortlog graph tags branches changeset files revisions annotate raw help
Mercurial > core / lisp/lib/nlp/stem/porter.lisp

changeset 698:	96958d3eb5b0
parent:	daad2b8bb63f
author:	Richard Westhaver <ellis@rwest.io>
date:	Fri, 04 Oct 2024 22:04:59 -0400
permissions:	-rw-r--r--
description:	fixes
     1 ;;; stem/porter.lisp --- Porter Stemming Algorithm
     2 
     3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
     4 ;; The software is completely free for any purpose, unless notes at
     5 ;; the head of the program text indicates otherwise (which is
     6 ;; rare). In any case, the notes about licensing are never more
     7 ;; restrictive than the BSD License.
     8 ;
     9 ;; In every case where the software is not written by me (Martin
    10 ;; Porter), this licensing arrangement has been endorsed by the
    11 ;; contributor, and it is therefore unnecessary to ask the contributor
    12 ;; again to confirm it.
    13 ;
    14 ;; The Porter Stemming Algorithm, somewhat mechanically hand translated to Common Lisp by
    15 ;; Steven M. Haflich smh@franz.com Feb 2002.  Most of the inline comments refer to the
    16 ;; original C code.  At the time of this translation the code passes the associated Porter
    17 ;; test files.  See the function test at the end of this file.
    18 
    19 ;; This port is intended to be portable ANSI Common Lisp.  However, it has only been
    20 ;; compiled and tested with Allegro Common Lisp.  This code is offered in the hope it will
    21 ;; be useful, but with no warranty of correctness, suitability, usability, or anything
    22 ;; else.  The C implementation from which this code was derived was not reentrant, relying
    23 ;; on global variables.  This implementation corrects that.  It is intended that a word to
    24 ;; be stemmed will be in a string with fill-pointer, as this is a natural result when
    25 ;; parsing user input, web scraping, whatever.  If not, a string with fill-pointer is
    26 ;; created, but this is an efficiency hit and is here intended only for lightweight use or
    27 ;; testing.  Using some resource mechanism on these strings would be a useful improvement,
    28 ;; whether here or in the calling code.
    29 
    30 ;; This is the Porter stemming algorithm, coded up in ANSI C by the
    31 ;; author. It may be be regarded as cononical, in that it follows the
    32 ;; algorithm presented in
    33 
    34 ;; Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
    35 ;; no. 3, pp 130-137,
    36 
    37 ;; only differing from it at the points maked --DEPARTURE-- below.
    38 
    39 ;; See also http://www.tartarus.org/~martin/PorterStemmer
    40 
    41 ;; The algorithm as described in the paper could be exactly replicated
    42 ;; by adjusting the points of DEPARTURE, but this is barely necessary,
    43 ;; because (a) the points of DEPARTURE are definitely improvements, and
    44 ;; (b) no encoding of the Porter stemmer I have seen is anything like
    45 ;; as exact as this version, even with the points of DEPARTURE!
    46 
    47 ;; You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
    48 ;; 'stem' takes a list of inputs and sends the stemmed equivalent to
    49 ;; stdout.
    50 
    51 ;; The algorithm as encoded here is particularly fast.
    52 
    53 ;; Release 1
    54 
    55 ;; The main part of the stemming algorithm starts here. b is a buffer
    56 ;; holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
    57 ;; ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
    58 ;; downwards as the stemming progresses. Zero termination is not in fact
    59 ;; used in the algorithm.
    60 
    61 ;; Note that only lower case sequences are stemmed. Forcing to lower case
    62 ;; should be done before stem(...) is called.
    63 
    64 ;; cons(i) is TRUE <=> b[i] is a consonant.
    65 
    66 ;;; Common Lisp port Version 1.01
    67 
    68 ;;;
    69 ;;; Common Lisp port Version history
    70 ;;;
    71 ;;; 1.0  -- smh@franz.com Feb 2002
    72 ;;;         initial release
    73 ;;;
    74 ;;; 1.01 -- smh@franz.com 25 Apr 2004
    75 ;;;         step4 signalled error for "ion" "ions".  Thanks to Jeff Heard
    76 ;;;         for detecting this and suggesting the fix.
    77 
    78 ;;; Code:
    79 (defpackage :nlp/stem/porter
    80   (:use :cl :std :rdb)
    81   (:export :stem))
    82 
    83 (in-package :nlp/stem/porter)
    84 
    85 (defun consonantp (str i)
    86   (let ((char (char str i)))
    87     (cond ((member char '(#\a #\e #\i #\o #\u)) nil)
    88 	  ((eql char #\y)
    89 	   (if (= i 0) t (not (consonantp str (1- i)))))
    90 	  (t t))))
    91 
    92 ;; m() measures the number of consonant sequences between k0 and j. if c is
    93 ;; a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
    94 ;; presence,
    95 
    96 ;;    <c><v>       gives 0
    97 ;;    <c>vc<v>     gives 1
    98 ;;    <c>vcvc<v>   gives 2
    99 ;;    <c>vcvcvc<v> gives 3
   100 ;;    ....
   101 
   102 (defun m (str lim)
   103   (let ((n 0)
   104 	(i 0))
   105     (loop
   106       (when (>= i lim) (return-from m n))
   107       (if (not (consonantp str i)) (return nil))
   108       (incf i))
   109     (incf i)
   110     (loop
   111       (loop
   112 	(if (>= i lim) (return-from m n))
   113 	(if (consonantp str i) (return nil))
   114 	(incf i))
   115       (incf i)
   116       (incf n)
   117       (loop
   118 	(if (>= i lim) (return-from m n))
   119 	(if (not (consonantp str i)) (return nil))
   120 	(incf i))
   121       (incf i))))
   122 
   123 ;; vowelinstem() is TRUE <=> k0,...j contains a vowel
   124 
   125 (defun vowelinstem (str)
   126   (loop for i from 0 below (fill-pointer str)
   127       unless (consonantp str i) return t))
   128 
   129 ;; doublec(j) is TRUE <=> j,(j-1) contain a double consonant.
   130 
   131 (defun doublec (str i)
   132   (cond ((< i 1) nil)
   133 	((not (eql (char str i) (char str (1- i)))) nil)
   134 	(t (consonantp str i))))
   135 
   136 ;; cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
   137 ;; and also if the second c is not w,x or y. this is used when trying to
   138 ;; restore an e at the end of a short word. e.g.
   139 
   140 ;;    cav(e), lov(e), hop(e), crim(e), but
   141 ;;    snow, box, tray.
   142 
   143 (defun cvc (str lim)
   144   (decf lim)
   145   (if (or (< lim 2)
   146 	  (not (consonantp str lim))
   147 	  (consonantp str (1- lim))
   148 	  (not (consonantp str (- lim 2))))
   149       (return-from cvc nil))
   150   (if (member (char str lim) '(#\w #\x #\y)) (return-from cvc nil))
   151   t)
   152 
   153 ;; ends(s) is TRUE <=> k0,...k ends with the string s.
   154 
   155 (defun ends (str ending)
   156   (declare (string str) (simple-string ending))
   157   (let ((len1 (length str)) (len2 (length ending)))
   158     (loop
   159 	for pa downfrom (1- len1) to 0
   160 	and pb downfrom (1- len2) to 0
   161 	unless (eql (char str pa) (char ending pb))
   162 	return nil
   163 	finally (return (when (< pb 0)
   164 			  (decf (fill-pointer str) len2)
   165 			  t)))))
   166 
   167 ;; setto(s) sets (j+1),...k to the characters in the string s, readjusting k.
   168 
   169 (defun setto (str suffix)
   170   (declare (string str) (simple-string suffix))
   171   (loop for char across suffix
   172       do (vector-push-extend char str)))
   173 
   174 ;; r(s) is used further down.
   175 
   176 (defun r (str s sfp)
   177   (if (> (m str (fill-pointer str)) 0)
   178       (setto str s)
   179     (setf (fill-pointer str) sfp)))
   180 
   181 ;; step1ab() gets rid of plurals and -ed or -ing. e.g.
   182 
   183 ;;     caresses  ->  caress
   184 ;;     ponies    ->  poni
   185 ;;     ties      ->  ti
   186 ;;     caress    ->  caress
   187 ;;     cats      ->  cat
   188 
   189 ;;     feed      ->  feed
   190 ;;     agreed    ->  agree
   191 ;;     disabled  ->  disable
   192 
   193 ;;     matting   ->  mat
   194 ;;     mating    ->  mate
   195 ;;     meeting   ->  meet
   196 ;;     milling   ->  mill
   197 ;;     messing   ->  mess
   198 
   199 ;;     meetings  ->  meet
   200 
   201 (defun step1ab (str)
   202   (when (eql (char str (1- (fill-pointer str))) #\s)
   203     (cond ((ends str "sses") (incf (fill-pointer str) 2))
   204 	  ((ends str "ies")  (setto str "i"))
   205 	  ((not (eql (char str (- (fill-pointer str) 2)) #\s)) (decf (fill-pointer str)))))
   206   (cond ((ends str "eed") (if (> (m str (fill-pointer str)) 0)
   207 			      (incf (fill-pointer str) 2)
   208 			    (incf (fill-pointer str) 3)))
   209 	((let ((sfp (fill-pointer str)))
   210 	   (if (or (ends str "ed")
   211 		   (ends str "ing"))
   212 	       (if (vowelinstem str)
   213 		   t
   214 		 (progn (setf (fill-pointer str) sfp)
   215 			nil))))
   216 	 (cond ((ends str "at") (setto str "ate"))
   217 	       ((ends str "bl") (setto str "ble"))
   218 	       ((ends str "iz") (setto str "ize"))
   219 	       ((doublec str (1- (fill-pointer str)))
   220 		(unless (member (char str (1- (fill-pointer str))) '(#\l #\s #\z))
   221 		  (decf (fill-pointer str))))
   222 	       (t (if (and (= (m str (fill-pointer str)) 1)
   223 			   (cvc str (fill-pointer str)))
   224 		      (setto str "e"))))))
   225   str)
   226 
   227 ;; step1c() turns terminal y to i when there is another vowel in the stem.
   228 
   229 (defun step1c (str)
   230   (let ((saved-fill-pointer (fill-pointer str)))
   231     (when (and (ends str "y")
   232 	       (vowelinstem str))
   233 	(setf (char str (fill-pointer str)) #\i))
   234     (setf (fill-pointer str) saved-fill-pointer))
   235   str)
   236 
   237 ;; step2() maps double suffices to single ones. so -ization ( = -ize plus
   238 ;; -ation) maps to -ize etc. note that the string before the suffix must give
   239 ;; m() > 0.
   240 
   241 (defun step2 (str)
   242   (let ((sfp (fill-pointer str)))
   243     (when (> sfp 2)
   244       (block nil
   245 	(case (char str (- (length str) 2))
   246 	  (#\a (when (ends str "ational") (r str "ate"  sfp)  (return))
   247 	       (when (ends str "tional")  (r str "tion" sfp) (return)))
   248 	  (#\c (when (ends str "enci")    (r str "ence" sfp) (return))
   249 	       (when (ends str "anci")    (r str "ance" sfp) (return)))
   250 	  (#\e (when (ends str "izer")    (r str "ize"  sfp)  (return)))
   251 	  (#\l (when (ends str "bli")     (r str "ble"  sfp)  (return))
   252 	       ;; -DEPARTURE-
   253 	       ;; To match the published algorithm, replace prev line with
   254 	       ;; ((when (ends str "abli")    (r str "able" sfp) (return))
   255 	       (when (ends str "alli")    (r str "al"  sfp)   (return))
   256 	       (when (ends str "entli")   (r str "ent" sfp)  (return))
   257 	       (when (ends str "eli")     (r str "e"   sfp)    (return))
   258 	       (when (ends str "ousli")   (r str "ous" sfp)  (return)))
   259 	  (#\o (when (ends str "ization") (r str "ize" sfp)  (return))
   260 	       (when (ends str "ation")   (r str "ate" sfp)  (return))
   261 	       (when (ends str "ator")    (r str "ate" sfp)  (return)))
   262 	  (#\s (when (ends str "alism")   (r str "al"  sfp)   (return))
   263 	       (when (ends str "iveness") (r str "ive" sfp)  (return))
   264 	       (when (ends str "fulness") (r str "ful" sfp)  (return))
   265 	       (when (ends str "ousness") (r str "ous" sfp)  (return)))
   266 	  (#\t (when (ends str "aliti")   (r str "al"  sfp)   (return))
   267 	       (when (ends str "iviti")   (r str "ive" sfp)  (return))
   268 	       (when (ends str "biliti")  (r str "ble" sfp)  (return)))
   269 	  ;; -DEPARTURE-
   270 	  ;; To match the published algorithm, delete next line.
   271 	  (#\g (when (ends str "logi")    (r str "log" sfp)  (return)))))))
   272   str)
   273 
   274 ;; step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
   275 
   276 (defun step3 (str)
   277   (let ((sfp (fill-pointer str)))
   278     (block nil
   279       (case (char str (1- (length str)))
   280 	(#\e (when (ends str "icate") (r str "ic" sfp) (return))
   281 	     (when (ends str "ative") (r str "" sfp)   (return)) ; huh?
   282 	     (when (ends str "alize") (r str "al" sfp) (return)))
   283 	(#\i (when (ends str "iciti") (r str "ic" sfp) (return)))
   284 	(#\l (when (ends str "ical")  (r str "ic" sfp) (return))
   285 	     (when (ends str "ful")   (r str "" sfp)   (return))) ; huh?
   286 	(#\s (when (ends str "ness")  (r str "" sfp)   (return))) ; huh?
   287 	)))
   288   str)
   289 
   290 ;; step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
   291 
   292 (defun step4 (str)
   293   (let ((sfp (fill-pointer str)))
   294     (when (> sfp 2)			; Unnecessary?
   295       (block nil
   296 	(case (char str (- sfp 2))
   297 	  (#\a (if (ends str "al")    (return)))
   298 	  (#\c (if (ends str "ance")  (return))
   299 	       (if (ends str "ence")  (return)))
   300 	  (#\e (if (ends str "er")    (return)))
   301 	  (#\i (if (ends str "ic")    (return)))
   302 	  (#\l (if (ends str "able")  (return))
   303 	       (if (ends str "ible")  (return)))
   304 	  (#\n (if (ends str "ant")   (return))
   305 	       (if (ends str "ement") (return))
   306 	       (if (ends str "ment")  (return))
   307 	       (if (ends str "ent")   (return)))
   308 	  (#\o (if (ends str "ion")
   309 		   (let ((len (length str)))
   310 		     (if (and (> len 0)
   311 			      (let ((c (char str (1- len))))
   312 				(or (eql c #\s) (eql c #\t))))
   313 			 (return)
   314 		       (setf (fill-pointer str) sfp))))
   315 	       (if (ends str "ou")    (return))) ; takes care of -ous
   316 	  (#\s (if (ends str "ism")   (return)))
   317 	  (#\t (if (ends str "ate")   (return))
   318 	       (if (ends str "iti")   (return)))
   319 	  (#\u (if (ends str "ous")   (return)))
   320 	  (#\v (if (ends str "ive")   (return)))
   321 	  (#\z (if (ends str "ize")   (return))))
   322 	(return-from step4 str))
   323       (unless (> (m str (fill-pointer str)) 1)
   324 	(setf (fill-pointer str) sfp)))
   325     str))
   326 
   327 ;; step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
   328 
   329 (defun step5 (str)
   330   (let ((len (fill-pointer str)))
   331     (if (eql (char str (1- len)) #\e)
   332 	(let ((a (m str len)))
   333 	  (if (or (> a 1)
   334 		  (and (= a 1)
   335 		       (not (cvc str (1- len)))))
   336 	      (decf (fill-pointer str))))))
   337   (let ((len (fill-pointer str)))
   338     (if (and (eql (char str (1- len)) #\l)
   339 	     (doublec str (1- len))
   340 	     (> (m str len) 1))
   341 	(decf (fill-pointer str))))
   342   str)
   343 
   344 ;; In stem(p,i,j), p is a char pointer, and the string to be stemmed is from p[i] to p[j]
   345 ;; inclusive. Typically i is zero and j is the offset to the last character of a string,
   346 ;; (p[j+1] == '\0'). The stemmer adjusts the characters p[i] ... p[j] and returns the new
   347 ;; end-point of the string, k.  Stemming never increases word length, so i <= k <= j. To
   348 ;; turn the stemmer into a module, declare 'stem' as extern, and delete the remainder of
   349 ;; this file.
   350 
   351 (defun stem (str)
   352   (let ((len (length str)))
   353     ;; With this line, strings of length 1 or 2 don't go through the
   354     ;; stemming process, although no mention is made of this in the
   355     ;; published algorithm. Remove the line to match the published
   356     ;; algorithm.
   357     (if (<= len 2) (return-from stem str)) ; /*-DEPARTURE-*/
   358     (if (typep str 'simple-string)	; Primarily for testing.
   359 	(setf str
   360 	  (make-array len :element-type 'character
   361 		      :fill-pointer len :initial-contents str)))
   362     (step1ab str) (step1c str) (step2 str) (step3 str) (step4 str) (step5 str)
   363     str))
   364 
   365 #+never
   366 (trace step1ab step1c step2 step3 step4 step5)
   367 
   368 #+never
   369 (defun test ()				; Run against the distributed test files.
   370   (with-open-file (f1 "voc.txt")
   371     (with-open-file (f2 "output.txt")
   372       (loop as w1 = (read-line f1 nil nil)
   373 	  while w1
   374 	  as w2 = (read-line f2 nil nil)
   375 	  as w3 = (stem w1)
   376 	  if (equal w2 w3)
   377 	  count t into successes
   378 	  else count t into failures
   379 	  and do (format t "(stem ~s) => ~s wanted ~s~%" w1 w3 w2)
   380 	  finally (progn (format t "sucesses ~d failures ~d~%" successes failures)
   381 			 (return failures))))))