; XEmacs: This file contains -*-Scheme-*- source code. ;;; John David Stone ;;; Department of Mathematics and Computer Science ;;; Grinnell College ;;; stone@cs.grinnell.edu ;;; created November 28, 1999 ;;; last revised November 29, 2000 ;;; Given a string, LAST-OCCURRENCE-FUNCTION returns a procedure that takes ;;; any character and returns the rightmost position at which that ;;; character occurs in the string, or -1 if the character does not occur ;;; at all in the string. (define (last-occurrence-function str) (let ((index-table (lower-ply-natural (string-length str) (lambda (index tabula) (table-put tabula (string-ref str index) index)) (null-table)))) (lambda (ch) (handle-exceptions signal-value -1 (table-get index-table ch))))) ;;; Given a string, two indices into the string (that is, natural numbers ;;; not greater than the length of the string), and the length of the ;;; string, EITHER-ENDS-OTHER? examines two substrings -- a prefix ending ;;; at the first given index and a suffix beginning at the second -- and ;;; determines whether either of them is a suffix of the other. (define (either-ends-other? str prefix-length index len) (letrec ((matches-from? (lambda (fore aft) (or (= aft len) (and (char=? (string-ref str fore) (string-ref str aft)) (matches-from? (successor fore) (successor aft))))))) (matches-from? (max 0 (- prefix-length (- len index))) (max (- len prefix-length) index)))) ;;; Given a string STR of LEN characters, GOOD-SUFFIX-VECTOR returns a ;;; vector of LEN + 1 elements. An index into this vector represents the ;;; number of characters of STR that are not successfully matched (during a ;;; character-by-character comparison, moving from right-to-left, beginning ;;; with the last character of STR) against some segment of a text string. ;;; The element at a given index indicates how much farther to shift the ;;; pattern along the text string before attempting the next match, ;;; avoiding any shift at which the text characters that matched this time ;;; cannot possibly match. ;;; This vector can be computed from STR alone, since the charcters that ;;; were successfully matched, in addition to being found in the text ;;; string, constitute a suffix of STR. The longest proper prefix of STR ;;; that matches this suffix (in the sense expressed by EITHER-ENDS-OTHER?) ;;; should be aligned with the matching text for the next ;;; character-by-character scan. So we compute the length of that longest ;;; prefix; the desired shift is the result of subtracting that length from ;;; the length of STR. ;;; Since the null string has no non-empty prefixes, GOOD-SUFFIX-VECTOR ;;; deals with it as a special case. (VECTOR 1) is returned, since we ;;; always want to advance one position in the text after matching a null ;;; string. ;;; If STR is sufficiently long, this is not the optimal way to compute the ;;; good-suffix vector. However, it is easy to understand, easy to code ;;; correctly, and fast enough for all but the most demanding ;;; applications. (define (good-suffix-vector str) (let ((len (string-length str))) (if (zero? len) (vector 1) (generate-vector (lambda (index) (- len (greatest-satisfying (predecessor len) (lambda (prefix-length) (either-ends-other? str prefix-length index len))))) (successor len))))) ;;; Given a pattern string and a text string, UNMATCHED-PREFIX-LENGTH ;;; returns a procedure that takes a shift in the text string as its ;;; argument and returns the number of characters that precede the longest ;;; matching suffix of the pattern that matches the text at the specified ;;; shift. (define (unmatched-prefix-length pattern text) (lambda (shift) (greatest-satisfying (string-length pattern) (lambda (prefix-length) (or (zero? prefix-length) (let ((position (predecessor prefix-length))) (not (char=? (string-ref pattern position) (string-ref text (+ shift position)))))))))) ;;; Given a pattern string and a text string, BOYER-MOORE-JUMP returns a ;;; procedure that takes a shift in the text string and the length of a ;;; prefix of the pattern that does not match the text string at the given ;;; shift and returns an increment to be added to the shift. The increment ;;; is the larger of two quantities: the shift prescribed in the ;;; precomputed good-suffix vector, which brings a new, shorter prefix of ;;; the pattern into alignment with as many as possible of the characters ;;; already matched, and the shift prescribed by the last-occurrence ;;; function, which aligns the text character that fails to match with the ;;; rightmost occurrence of that character in the pattern. (define (boyer-moore-jump pattern text) (let ((gsv (good-suffix-vector pattern)) (lof (last-occurrence-function pattern)) (pattern-length (string-length pattern))) (lambda (shift unmatched) (max (vector-ref gsv unmatched) (if (zero? unmatched) 0 (- (predecessor unmatched) (lof (string-ref text (+ shift (predecessor unmatched)))))))))) ;;; BOYER-MOORE-MATCHER takes two strings, TEXT and PATTERN, and returns a ;;; set of ``valid shifts,'' that is, positions in TEXT at which copies of ;;; PATTERN begin. The initial shift of 0 aligns PATTERN with the ;;; beginning of TEXT; subsequent adjustments advance the shift in jumps (define (Boyer-Moore-matcher text pattern) (let ((upl (unmatched-prefix-length pattern text)) (jumper (boyer-moore-jump pattern text)) (maximum-shift (- (string-length text) (string-length pattern)))) ((until (apply-to-next (initial-section < maximum-shift)) (lambda (valid-shifts shift) (let ((unmatched (upl shift))) (values (if (zero? unmatched) (adjoin-to-set shift valid-shifts) valid-shifts) (+ shift (jumper shift unmatched)))))) (set) 0)))