changelog shortlog graph tags branches changeset files revisions annotate raw help

Mercurial > core / lisp/lib/nlp/stem/porter.lisp

changeset 698: 96958d3eb5b0
parent: daad2b8bb63f
author: Richard Westhaver <ellis@rwest.io>
date: Fri, 04 Oct 2024 22:04:59 -0400
permissions: -rw-r--r--
description: fixes
1 ;;; stem/porter.lisp --- Porter Stemming Algorithm
2 
3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4 ;; The software is completely free for any purpose, unless notes at
5 ;; the head of the program text indicates otherwise (which is
6 ;; rare). In any case, the notes about licensing are never more
7 ;; restrictive than the BSD License.
8 ;
9 ;; In every case where the software is not written by me (Martin
10 ;; Porter), this licensing arrangement has been endorsed by the
11 ;; contributor, and it is therefore unnecessary to ask the contributor
12 ;; again to confirm it.
13 ;
14 ;; The Porter Stemming Algorithm, somewhat mechanically hand translated to Common Lisp by
15 ;; Steven M. Haflich smh@franz.com Feb 2002. Most of the inline comments refer to the
16 ;; original C code. At the time of this translation the code passes the associated Porter
17 ;; test files. See the function test at the end of this file.
18 
19 ;; This port is intended to be portable ANSI Common Lisp. However, it has only been
20 ;; compiled and tested with Allegro Common Lisp. This code is offered in the hope it will
21 ;; be useful, but with no warranty of correctness, suitability, usability, or anything
22 ;; else. The C implementation from which this code was derived was not reentrant, relying
23 ;; on global variables. This implementation corrects that. It is intended that a word to
24 ;; be stemmed will be in a string with fill-pointer, as this is a natural result when
25 ;; parsing user input, web scraping, whatever. If not, a string with fill-pointer is
26 ;; created, but this is an efficiency hit and is here intended only for lightweight use or
27 ;; testing. Using some resource mechanism on these strings would be a useful improvement,
28 ;; whether here or in the calling code.
29 
30 ;; This is the Porter stemming algorithm, coded up in ANSI C by the
31 ;; author. It may be be regarded as cononical, in that it follows the
32 ;; algorithm presented in
33 
34 ;; Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
35 ;; no. 3, pp 130-137,
36 
37 ;; only differing from it at the points maked --DEPARTURE-- below.
38 
39 ;; See also http://www.tartarus.org/~martin/PorterStemmer
40 
41 ;; The algorithm as described in the paper could be exactly replicated
42 ;; by adjusting the points of DEPARTURE, but this is barely necessary,
43 ;; because (a) the points of DEPARTURE are definitely improvements, and
44 ;; (b) no encoding of the Porter stemmer I have seen is anything like
45 ;; as exact as this version, even with the points of DEPARTURE!
46 
47 ;; You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
48 ;; 'stem' takes a list of inputs and sends the stemmed equivalent to
49 ;; stdout.
50 
51 ;; The algorithm as encoded here is particularly fast.
52 
53 ;; Release 1
54 
55 ;; The main part of the stemming algorithm starts here. b is a buffer
56 ;; holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
57 ;; ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
58 ;; downwards as the stemming progresses. Zero termination is not in fact
59 ;; used in the algorithm.
60 
61 ;; Note that only lower case sequences are stemmed. Forcing to lower case
62 ;; should be done before stem(...) is called.
63 
64 ;; cons(i) is TRUE <=> b[i] is a consonant.
65 
66 ;;; Common Lisp port Version 1.01
67 
68 ;;;
69 ;;; Common Lisp port Version history
70 ;;;
71 ;;; 1.0 -- smh@franz.com Feb 2002
72 ;;; initial release
73 ;;;
74 ;;; 1.01 -- smh@franz.com 25 Apr 2004
75 ;;; step4 signalled error for "ion" "ions". Thanks to Jeff Heard
76 ;;; for detecting this and suggesting the fix.
77 
78 ;;; Code:
79 (defpackage :nlp/stem/porter
80  (:use :cl :std :rdb)
81  (:export :stem))
82 
83 (in-package :nlp/stem/porter)
84 
85 (defun consonantp (str i)
86  (let ((char (char str i)))
87  (cond ((member char '(#\a #\e #\i #\o #\u)) nil)
88  ((eql char #\y)
89  (if (= i 0) t (not (consonantp str (1- i)))))
90  (t t))))
91 
92 ;; m() measures the number of consonant sequences between k0 and j. if c is
93 ;; a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
94 ;; presence,
95 
96 ;; <c><v> gives 0
97 ;; <c>vc<v> gives 1
98 ;; <c>vcvc<v> gives 2
99 ;; <c>vcvcvc<v> gives 3
100 ;; ....
101 
102 (defun m (str lim)
103  (let ((n 0)
104  (i 0))
105  (loop
106  (when (>= i lim) (return-from m n))
107  (if (not (consonantp str i)) (return nil))
108  (incf i))
109  (incf i)
110  (loop
111  (loop
112  (if (>= i lim) (return-from m n))
113  (if (consonantp str i) (return nil))
114  (incf i))
115  (incf i)
116  (incf n)
117  (loop
118  (if (>= i lim) (return-from m n))
119  (if (not (consonantp str i)) (return nil))
120  (incf i))
121  (incf i))))
122 
123 ;; vowelinstem() is TRUE <=> k0,...j contains a vowel
124 
125 (defun vowelinstem (str)
126  (loop for i from 0 below (fill-pointer str)
127  unless (consonantp str i) return t))
128 
129 ;; doublec(j) is TRUE <=> j,(j-1) contain a double consonant.
130 
131 (defun doublec (str i)
132  (cond ((< i 1) nil)
133  ((not (eql (char str i) (char str (1- i)))) nil)
134  (t (consonantp str i))))
135 
136 ;; cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
137 ;; and also if the second c is not w,x or y. this is used when trying to
138 ;; restore an e at the end of a short word. e.g.
139 
140 ;; cav(e), lov(e), hop(e), crim(e), but
141 ;; snow, box, tray.
142 
143 (defun cvc (str lim)
144  (decf lim)
145  (if (or (< lim 2)
146  (not (consonantp str lim))
147  (consonantp str (1- lim))
148  (not (consonantp str (- lim 2))))
149  (return-from cvc nil))
150  (if (member (char str lim) '(#\w #\x #\y)) (return-from cvc nil))
151  t)
152 
153 ;; ends(s) is TRUE <=> k0,...k ends with the string s.
154 
155 (defun ends (str ending)
156  (declare (string str) (simple-string ending))
157  (let ((len1 (length str)) (len2 (length ending)))
158  (loop
159  for pa downfrom (1- len1) to 0
160  and pb downfrom (1- len2) to 0
161  unless (eql (char str pa) (char ending pb))
162  return nil
163  finally (return (when (< pb 0)
164  (decf (fill-pointer str) len2)
165  t)))))
166 
167 ;; setto(s) sets (j+1),...k to the characters in the string s, readjusting k.
168 
169 (defun setto (str suffix)
170  (declare (string str) (simple-string suffix))
171  (loop for char across suffix
172  do (vector-push-extend char str)))
173 
174 ;; r(s) is used further down.
175 
176 (defun r (str s sfp)
177  (if (> (m str (fill-pointer str)) 0)
178  (setto str s)
179  (setf (fill-pointer str) sfp)))
180 
181 ;; step1ab() gets rid of plurals and -ed or -ing. e.g.
182 
183 ;; caresses -> caress
184 ;; ponies -> poni
185 ;; ties -> ti
186 ;; caress -> caress
187 ;; cats -> cat
188 
189 ;; feed -> feed
190 ;; agreed -> agree
191 ;; disabled -> disable
192 
193 ;; matting -> mat
194 ;; mating -> mate
195 ;; meeting -> meet
196 ;; milling -> mill
197 ;; messing -> mess
198 
199 ;; meetings -> meet
200 
201 (defun step1ab (str)
202  (when (eql (char str (1- (fill-pointer str))) #\s)
203  (cond ((ends str "sses") (incf (fill-pointer str) 2))
204  ((ends str "ies") (setto str "i"))
205  ((not (eql (char str (- (fill-pointer str) 2)) #\s)) (decf (fill-pointer str)))))
206  (cond ((ends str "eed") (if (> (m str (fill-pointer str)) 0)
207  (incf (fill-pointer str) 2)
208  (incf (fill-pointer str) 3)))
209  ((let ((sfp (fill-pointer str)))
210  (if (or (ends str "ed")
211  (ends str "ing"))
212  (if (vowelinstem str)
213  t
214  (progn (setf (fill-pointer str) sfp)
215  nil))))
216  (cond ((ends str "at") (setto str "ate"))
217  ((ends str "bl") (setto str "ble"))
218  ((ends str "iz") (setto str "ize"))
219  ((doublec str (1- (fill-pointer str)))
220  (unless (member (char str (1- (fill-pointer str))) '(#\l #\s #\z))
221  (decf (fill-pointer str))))
222  (t (if (and (= (m str (fill-pointer str)) 1)
223  (cvc str (fill-pointer str)))
224  (setto str "e"))))))
225  str)
226 
227 ;; step1c() turns terminal y to i when there is another vowel in the stem.
228 
229 (defun step1c (str)
230  (let ((saved-fill-pointer (fill-pointer str)))
231  (when (and (ends str "y")
232  (vowelinstem str))
233  (setf (char str (fill-pointer str)) #\i))
234  (setf (fill-pointer str) saved-fill-pointer))
235  str)
236 
237 ;; step2() maps double suffices to single ones. so -ization ( = -ize plus
238 ;; -ation) maps to -ize etc. note that the string before the suffix must give
239 ;; m() > 0.
240 
241 (defun step2 (str)
242  (let ((sfp (fill-pointer str)))
243  (when (> sfp 2)
244  (block nil
245  (case (char str (- (length str) 2))
246  (#\a (when (ends str "ational") (r str "ate" sfp) (return))
247  (when (ends str "tional") (r str "tion" sfp) (return)))
248  (#\c (when (ends str "enci") (r str "ence" sfp) (return))
249  (when (ends str "anci") (r str "ance" sfp) (return)))
250  (#\e (when (ends str "izer") (r str "ize" sfp) (return)))
251  (#\l (when (ends str "bli") (r str "ble" sfp) (return))
252  ;; -DEPARTURE-
253  ;; To match the published algorithm, replace prev line with
254  ;; ((when (ends str "abli") (r str "able" sfp) (return))
255  (when (ends str "alli") (r str "al" sfp) (return))
256  (when (ends str "entli") (r str "ent" sfp) (return))
257  (when (ends str "eli") (r str "e" sfp) (return))
258  (when (ends str "ousli") (r str "ous" sfp) (return)))
259  (#\o (when (ends str "ization") (r str "ize" sfp) (return))
260  (when (ends str "ation") (r str "ate" sfp) (return))
261  (when (ends str "ator") (r str "ate" sfp) (return)))
262  (#\s (when (ends str "alism") (r str "al" sfp) (return))
263  (when (ends str "iveness") (r str "ive" sfp) (return))
264  (when (ends str "fulness") (r str "ful" sfp) (return))
265  (when (ends str "ousness") (r str "ous" sfp) (return)))
266  (#\t (when (ends str "aliti") (r str "al" sfp) (return))
267  (when (ends str "iviti") (r str "ive" sfp) (return))
268  (when (ends str "biliti") (r str "ble" sfp) (return)))
269  ;; -DEPARTURE-
270  ;; To match the published algorithm, delete next line.
271  (#\g (when (ends str "logi") (r str "log" sfp) (return)))))))
272  str)
273 
274 ;; step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
275 
276 (defun step3 (str)
277  (let ((sfp (fill-pointer str)))
278  (block nil
279  (case (char str (1- (length str)))
280  (#\e (when (ends str "icate") (r str "ic" sfp) (return))
281  (when (ends str "ative") (r str "" sfp) (return)) ; huh?
282  (when (ends str "alize") (r str "al" sfp) (return)))
283  (#\i (when (ends str "iciti") (r str "ic" sfp) (return)))
284  (#\l (when (ends str "ical") (r str "ic" sfp) (return))
285  (when (ends str "ful") (r str "" sfp) (return))) ; huh?
286  (#\s (when (ends str "ness") (r str "" sfp) (return))) ; huh?
287  )))
288  str)
289 
290 ;; step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
291 
292 (defun step4 (str)
293  (let ((sfp (fill-pointer str)))
294  (when (> sfp 2) ; Unnecessary?
295  (block nil
296  (case (char str (- sfp 2))
297  (#\a (if (ends str "al") (return)))
298  (#\c (if (ends str "ance") (return))
299  (if (ends str "ence") (return)))
300  (#\e (if (ends str "er") (return)))
301  (#\i (if (ends str "ic") (return)))
302  (#\l (if (ends str "able") (return))
303  (if (ends str "ible") (return)))
304  (#\n (if (ends str "ant") (return))
305  (if (ends str "ement") (return))
306  (if (ends str "ment") (return))
307  (if (ends str "ent") (return)))
308  (#\o (if (ends str "ion")
309  (let ((len (length str)))
310  (if (and (> len 0)
311  (let ((c (char str (1- len))))
312  (or (eql c #\s) (eql c #\t))))
313  (return)
314  (setf (fill-pointer str) sfp))))
315  (if (ends str "ou") (return))) ; takes care of -ous
316  (#\s (if (ends str "ism") (return)))
317  (#\t (if (ends str "ate") (return))
318  (if (ends str "iti") (return)))
319  (#\u (if (ends str "ous") (return)))
320  (#\v (if (ends str "ive") (return)))
321  (#\z (if (ends str "ize") (return))))
322  (return-from step4 str))
323  (unless (> (m str (fill-pointer str)) 1)
324  (setf (fill-pointer str) sfp)))
325  str))
326 
327 ;; step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
328 
329 (defun step5 (str)
330  (let ((len (fill-pointer str)))
331  (if (eql (char str (1- len)) #\e)
332  (let ((a (m str len)))
333  (if (or (> a 1)
334  (and (= a 1)
335  (not (cvc str (1- len)))))
336  (decf (fill-pointer str))))))
337  (let ((len (fill-pointer str)))
338  (if (and (eql (char str (1- len)) #\l)
339  (doublec str (1- len))
340  (> (m str len) 1))
341  (decf (fill-pointer str))))
342  str)
343 
344 ;; In stem(p,i,j), p is a char pointer, and the string to be stemmed is from p[i] to p[j]
345 ;; inclusive. Typically i is zero and j is the offset to the last character of a string,
346 ;; (p[j+1] == '\0'). The stemmer adjusts the characters p[i] ... p[j] and returns the new
347 ;; end-point of the string, k. Stemming never increases word length, so i <= k <= j. To
348 ;; turn the stemmer into a module, declare 'stem' as extern, and delete the remainder of
349 ;; this file.
350 
351 (defun stem (str)
352  (let ((len (length str)))
353  ;; With this line, strings of length 1 or 2 don't go through the
354  ;; stemming process, although no mention is made of this in the
355  ;; published algorithm. Remove the line to match the published
356  ;; algorithm.
357  (if (<= len 2) (return-from stem str)) ; /*-DEPARTURE-*/
358  (if (typep str 'simple-string) ; Primarily for testing.
359  (setf str
360  (make-array len :element-type 'character
361  :fill-pointer len :initial-contents str)))
362  (step1ab str) (step1c str) (step2 str) (step3 str) (step4 str) (step5 str)
363  str))
364 
365 #+never
366 (trace step1ab step1c step2 step3 step4 step5)
367 
368 #+never
369 (defun test () ; Run against the distributed test files.
370  (with-open-file (f1 "voc.txt")
371  (with-open-file (f2 "output.txt")
372  (loop as w1 = (read-line f1 nil nil)
373  while w1
374  as w2 = (read-line f2 nil nil)
375  as w3 = (stem w1)
376  if (equal w2 w3)
377  count t into successes
378  else count t into failures
379  and do (format t "(stem ~s) => ~s wanted ~s~%" w1 w3 w2)
380  finally (progn (format t "sucesses ~d failures ~d~%" successes failures)
381  (return failures))))))