;;; @Package soundex.scm ;;; @Subtitle Soundex Index Keying in Scheme ;;; @HomePage http://www.neilvandyke.org/soundex-scm/ ;;; @Author Neil W. Van Dyke ;;; @AuthorEmail neil@@neilvandyke.org ;;; @Version 0.2 ;;; @Date 2 August 2004 (define soundex-internal:version "0.2") ;; $Id: soundex.scm,v 1.39 2004/08/02 23:26:46 neil Exp $ ;;; @legal ;;; Copyright @copyright{} 2004 Neil W. Van Dyke. This program is Free ;;; Software; you can redistribute it and/or modify it under the terms of the ;;; GNU Lesser General Public License as published by the Free Software ;;; Foundation; either version 2.1 of the License, or (at your option) any ;;; later version. This program is distributed in the hope that it will be ;;; useful, but without any warranty; without even the implied warranty of ;;; merchantability or fitness for a particular purpose. See the GNU Lesser ;;; General Public License [LGPL] for more details. ;;; @end legal ;;; @section Introduction ;;; This is an implementation in Scheme of the Soundex indexing hash function ;;; as specified somewhat loosely by US National Archives and Records ;;; Administration (NARA) publication [Soundex], and verified empirically ;;; against test cases from various sources. Both the current NARA function ;;; and the older version with different handling of `H' and `W' are supported. ;;; Additionally, a nonstandard prefix guessing function permits multiple ;;; Soundex keys to be generated from a string, increasing recall. ;;; ;;; This library should work under any R5RS Scheme implementation for which ;;; @code{char->integer} yields ASCII values. ;;; @section Characters, Ordinals, and Codes ;;; To facilitate possible future support of other input character sets, this ;;; library employs a @dfn{character ordinal} abstract representation of the ;;; letters used by Soundex. The ordinal value is an integer from 0 to ;;; 25---corresponding to the 26 letters `A' through `Z', respectively---and ;;; can be used for fast mapping via vectors. Most applications need not be ;;; aware of this. ;;; @defproc soundex-ordinal chr ;;; ;;; Yields the Soundex ordinal value of character @var{chr}, of @var{#f} if the ;;; character is not considered a letter. ;;; ;;; @lisp ;;; (soundex-ordinal #\a) @result{} 0 ;;; (soundex-ordinal #\A) @result{} 0 ;;; (soundex-ordinal #\Z) @result{} 25 ;;; (soundex-ordinal #\3) @result{} #f ;;; (soundex-ordinal #\.) @result{} #f ;;; @end lisp (define (soundex-ordinal chr) (let ((x (char->integer chr))) (cond ((< x 65) #f) ((< x 91) (- x 65)) ((< x 97) #f) ((< x 123) (- x 97)) (else #f)))) ;;; @defproc soundex-ordinal->char ord ;;; ;;; Yields the upper-case letter character that corresponds to the character ;;; ordinal value @var{ord}. For example: ;;; ;;; @lisp ;;; (soundex-ordinal->char (soundex-ordinal #\a)) @result{} #\A ;;; @end lisp ;;; ;;; Note that an @code{#f} value as a result of applying @code{soundex-ordinal} ;;; is @emph{not} an ordinal value, and is not mapped to a character by ;;; @code{soundex-ordinal->char}. For example: ;;; ;;; @lisp ;;; (soundex-ordinal->char (soundex-ordinal #\')) @error{} ;;; @end lisp (define soundex-ordinal->char (let ((letters '#(#\A #\B #\C #\D #\E #\F #\G #\H #\I #\J #\K #\L #\M #\N #\O #\P #\Q #\R #\S #\T #\U #\V #\W #\X #\Y #\Z))) (lambda (ord) (vector-ref letters ord)))) ;;; @defproc soundex-ordinal->soundex-code ord ;;; ;;; Yields a library-specific Soundex code for character ordinal @var{ord}. ;;; ;;; @lisp ;;; (soundex-ordinal->soundex-code (soundex-ordinal #\a)) @result{} aeiou ;;; (soundex-ordinal->soundex-code (soundex-ordinal #\c)) @result{} #\2 ;;; (soundex-ordinal->soundex-code (soundex-ordinal #\N)) @result{} #\5 ;;; (soundex-ordinal->soundex-code (soundex-ordinal #\w)) @result{} hw ;;; (soundex-ordinal->soundex-code (soundex-ordinal #\y)) @result{} y ;;; @end lisp (define soundex-ordinal->soundex-code (let ((code-vector '#(aeiou #\1 #\2 #\3 aeiou #\1 #\2 hw aeiou #\2 #\2 #\4 #\5 #\5 aeiou #\1 #\2 #\6 #\2 #\3 aeiou #\1 hw #\2 y #\2))) (lambda (ord) (if ord (vector-ref code-vector ord) #f)))) ;;; @defproc char->soundex-code chr ;;; ;;; Yields a library-specific Soundex code for character @var{chr}. This is ;;; equivalent to: @code{(soundex-ordinal->soundex-code (soundex-ordinal ;;; @var{chr}))}. (define (char->soundex-code chr) (soundex-ordinal->soundex-code (soundex-ordinal chr))) ;;; @section Hashing ;;; Soundex hashes of strings can be generated with @code{soundex-nara}, ;;; @code{soundex-old}, and @code{soundex}. ;;; @defproc soundex/narahw/start str narahw? start ;;; ;;; This is an internal procedure. ;;; ;;; @lisp ;;; (soundex/narahw/start "van Dam" #t 4) @result{} "D500" ;;; (soundex/narahw/start ".0,!" #t 0) @result{} #f ;;; @end lisp (define (soundex/narahw/start str narahw? start) (let ((len (string-length str))) (let find-first-alpha ((i start)) (if (>= i len) #f (let* ((ord (soundex-ordinal (string-ref str i)))) (if ord (let ((result (make-string 4 #\0)) (result-used 1)) (string-set! result 0 (soundex-ordinal->char ord)) (let scan ((i (+ 1 i)) (prior-code (soundex-ordinal->soundex-code ord))) (if (>= i len) result (let ((code (char->soundex-code (string-ref str i)))) (case code ((aeiou y) (scan (+ 1 i) code)) ((hw) (scan (+ 1 i) (if narahw? prior-code code))) ((#\1 #\2 #\3 #\4 #\5 #\6) (if (eqv? code prior-code) (scan (+ 1 i) prior-code) (begin (string-set! result result-used code) (if (= result-used 3) result (begin (set! result-used (+ 1 result-used)) (scan (+ 1 i) code)))))) (else (scan (+ 1 i) #f))))))) (find-first-alpha (+ 1 i)))))))) ;;; @defproc soundex-nara str ;;; @defprocx soundex-old str ;;; @defprocx soundex str ;;; ;;; Yields a Soundex hash key of string @var{str}, or @code{#f} if not even an ;;; initial letter could be found. @code{soundex-nara} generates NARA hashes, ;;; and @code{soundex-old} generates older-style hashes. @code{soundex} is an ;;; alias for @code{soundex-nara}. ;;; ;;; @lisp ;;; (soundex-nara "Ashcraft") @result{} "A261" ;;; (soundex-old "Ashcraft") @result{} "A226" ;;; (soundex "Ashcraft") @result{} "A261" ;;; (soundex "") @result{} #f ;;; @end lisp (define (soundex-nara str) (soundex/narahw/start str #t 0)) (define (soundex-old str) (soundex/narahw/start str #f 0)) (define soundex soundex-nara) ;;; @section Prefixing ;;; Multiple Soundex hashes from a single string can be generated by ;;; @code{soundex-nara/prefixing}, @code{soundex-old/prefixing}, and ;;; @code{soundex/p}, which consider the string with and without various common ;;; surname prefixes. ;;; @defproc soundex-prefix-starts str ;;; ;;; Yields a list of Soundex start points in string @var{str}, as character ;;; index integers, for making hash keys with and without prefixes. A prefix ;;; must be followed by at least two letters, although they can be interspersed ;;; with non-letter characters. The exact behavior of this function is subject ;;; to change in future versions of this library. ;;; ;;; @lisp ;;; (soundex-prefix-starts "Smith") @result{} (0) ;;; (soundex-prefix-starts " Jones") @result{} (2) ;;; (soundex-prefix-starts "vanderlinden") @result{} (0 3 6) ;;; (soundex-prefix-starts "van der linden") @result{} (0 3 7) ;;; (soundex-prefix-starts "") @result{} () ;;; (soundex-prefix-starts "123") @result{} () ;;; (soundex-prefix-starts "dea") @result{} (0) ;;; (soundex-prefix-starts "dea ") @result{} (0) ;;; (soundex-prefix-starts "dean") @result{} (0) ;;; (soundex-prefix-starts "delasol") @result{} (0 2 3 4) ;;; @end lisp (define (soundex-prefix-starts str) ;; TODO: Maybe someday find a really elegant way to integrate this into the ;; coding pass, or cache the ordinals. At the same time, make it ;; data-driven, so that it's easier to make a prefixing constructor ;; from a user-provided list of prefixes. (letrec ((len (string-length str)) (i 0) (ord #f) (next-ord (lambda () (if (= i len) 'end (begin (set! ord (soundex-ordinal (string-ref str i))) (set! i (+ 1 i)) (or ord (next-ord)))))) (trailed? (lambda () (let ((saved-i i) (result (let loop ((needed 2)) (if (> needed 0) (case (next-ord) ((end) #f) ((#f) (loop needed)) (else (loop (- needed 1)))) #t)))) (set! i saved-i) result)))) (let find-first () (case (next-ord) ((end) '()) ((#f) (find-first)) (else ;; A=0 B=1 C=2 D=3 E=4 F=5 G=6 H=7 I=8 J=9 K=10 L=11 M=12 ;; N=13 O=14 P=15 Q=16 R=17 S=18 T=19 U=20 V=21 W=22 X=23 Y=24 Z=25 (cons (- i 1) (case ord ((2) ;; C (if (and (eq? (next-ord) 14) ;; (C)O (eq? (next-ord) 13) ;; (CO)N (trailed?)) (list i) '())) ((3) ;; D (case (next-ord) ((4) ;; (D)E (if (trailed?) (cons i (case (next-ord) ((11) ;; (DE)L (if (trailed?) (cons i (if (and (eq? (next-ord) 0) ;; (DEL)A (trailed?)) (list i) '())) '())) ((18) ;; (DE)S (if (trailed?) (list i) '())) (else '()))) '())) ((8 20) ;; (D)I, (D)U (if (trailed?) (list i) '())) (else '()))) ((11) ;; L (case (next-ord) ((0 4) ;; (L)A, (L)E (if (trailed?) (list i) '())) (else '()))) ((21) ;; V (case (next-ord) ((0 14) ;; (V)A, (V)O (if (eq? (next-ord) 13) ;; (V*)N (cons i (if (and (eq? (next-ord) 3) ;; (V*N)D (eq? (next-ord) 4) ;; (V*ND)E (trailed?)) (case (next-ord) ((13 17) ;; (V*NDE)N, (V*NDE)R (if (trailed?) (list i) '())) (else '())) '())) '())) (else '()))) (else '())))))))) ;;; @defproc soundex/narahw str narahw? ;;; ;;; This is an internal procedure. (define (soundex/prefixing/narahw str narahw?) (let ((result '())) (for-each (lambda (start) (let ((sx (soundex/narahw/start str narahw? start))) (and sx (not (member sx result)) (set! result (cons sx result))))) (soundex-prefix-starts str)) (reverse result))) ;;; @defproc soundex-nara/prefixing str ;;; @defprocx soundex-old/prefixing str ;;; @defprocx soundex/p str ;;; ;;; Yields a list of zero or more Soundex hash keys from string @var{str}, ;;; based on the whole string and the string with various prefixes skipped. ;;; All elements of the list are mutually unique. ;;; @code{soundex-nara/prefixing} generates NARA hashes, and ;;; @code{soundex-old/prefixing} generates older-style hashes. ;;; @code{soundex/p} is an alias for @code{soundex-nara/prefixing}. ;;; ;;; @lisp ;;; (soundex/p "Van Damme") @result{} ("V535" "D500") ;;; (soundex/p "vanvoom") @result{} ("V515" "V500") ;;; (soundex/p "vanvanvan") @result{} ("V515") ;;; (soundex/p "DeLaSol") @result{} ("D424" "L240" "A240" "S400") ;;; (soundex/p "") @result{} () ;;; @end lisp (define (soundex-nara/prefixing str) (soundex/prefixing/narahw str #t)) (define (soundex-old/prefixing str) (soundex/prefixing/narahw str #f)) (define soundex/p soundex-nara/prefixing) ;; TODO: Use a portable test suite tool, and add a comprehensive set of test ;; cases. ;; ;; (letrec ((test (lambda (input expected) ;; (write (cons 'soundex/p (list input))) ;; (display " => ") ;; (let ((result (soundex/p input))) ;; (write result) ;; (if (equal? result expected) ;; (display " ; OK") ;; (begin (display " ; FAILED! ") ;; (write expected))) ;; (newline))))) ;; (newline) ;; (test "Allricht" '("A462")) ;; (test "Ashcraft" '("A261")) ;; (test "Auerbach" '("A612")) ;; (test "BALDRICK" '("B436")) ;; (test "BATEMAN" '("B355")) ;; (test "BLACKADDER" '("B423")) ;; (test "Cook" '("C200")) ;; (test "DEAN" '("D500")) ;; (test "DeSmet" '("D253")) ;; (test "Devanter" '("D153")) ;; (test "Eberhard" '("E166")) ;; (test "Ellery" '("E460")) ;; (test "Engebrethson" '("E521")) ;; (test "Euler" '("E460")) ;; (test "FRIEDLANDER" '("F634")) ;; (test "Gauss" '("G200")) ;; (test "Ghosh" '("G200")) ;; (test "Gutierrez" '("G362")) ;; (test "HOLMES" '("H452")) ;; (test "Hanselmann" '("H524")) ;; (test "Heilbronn" '("H416")) ;; (test "Heimbach" '("H512")) ;; (test "Henzelmann" '("H524")) ;; (test "Hilbert" '("H416")) ;; (test "Hildebrand" '("H431")) ;; (test "JOHNSON" '("J525")) ;; (test "Jackson" '("J250")) ;; (test "Jansen" '("J525")) ;; (test "Janzen" '("J525")) ;; (test "Johanson" '("J525")) ;; (test "Johnson" '("J525")) ;; (test "Kant" '("K530")) ;; (test "Kavanagh" '("K152")) ;; (test "Knuth" '("K530")) ;; (test "Ladd" '("L300")) ;; (test "Lee" '("L000")) ;; (test "Leigh" '("L200")) ;; (test "Lind" '("L530")) ;; (test "Lissajous" '("L222")) ;; (test "Lloyd" '("L300")) ;; (test "Lukaschowsky" '("L222")) ;; (test "Lukasiewicz" '("L222")) ;; (test "METSEKER" '("M322")) ;; (test "METSKER" '("M326")) ;; (test "MEYER" '("M600")) ;; (test "MUSTERMANN" '("M236")) ;; (test "McDonnell" '("M235")) ;; (test "McGee" '("M200")) ;; (test "Michael" '("M240")) ;; (test "Miller" '("M460")) ;; (test "Moskovitz" '("M213")) ;; (test "Moskowitz" '("M232")) ;; (test "O'Brien" '("O165")) ;; (test "Opnian" '("O155")) ;; (test "Oppenheimer" '("O155")) ;; (test "PALMER" '("P456")) ;; (test "Peters" '("P362")) ;; (test "Peterson" '("P362")) ;; (test "Pfister" '("P236")) ;; (test "RANDLE" '("R534")) ;; (test "Riedemanas" '("R355")) ;; (test "SINGLETON" '("S524")) ;; (test "SMITHSON" '("S532")) ;; (test "Smith" '("S530")) ;; (test "Swhgler" '("S460")) ;; (test "Tymczak" '("T522")) ;; (test "Uhrbach" '("U612")) ;; ;;(test "Van Devanter" '("V531")) ;; (test "VanDeusen" '("V532" "D250")) ;; (test "VanSmith" '("V525" "S530")) ;; (test "WINEDALE" '("W534")) ;; (test "WOLFE" '("W410")) ;; (test "Washington" '("W252")) ;; (test "Wu" '("W000")) ;; (test "Zita" '("Z300")) ;; (test "Zitzmeinn" '("Z325")) ;; (test "vanDevanter" '("V531" "D153")) ;; (test "vanvanvan" '("V515")) ;; ;; ;; TODO: Test old-style Soundex too. One difference is "Ashcraft". ;; ;; ;; TODO: Apparently there's some kind of Soundex implementation in SLIB. ;; ;; Monte-Carlo test against that? ;; ;; (newline)) ;;; @unnumberedsec History ;;; @table @asis ;;; ;;; @item Version 0.2 --- 2 August 2004 ;;; Minor documentation change. Version frozen for PLaneT packaging. ;;; ;;; @item Version 0.1 --- 10 May 2004 ;;; First release. ;;; ;;; @end table ;;; @unnumberedsec References ;;; @table @asis ;;; ;;; @item [GIL-55] ;;; US National Archives and Records Administration, ``Using the Census ;;; Soundex,'' General Information Leaflet 55, 1995. ;;; ;;; @item [LGPL] ;;; Free Software Foundation, ``GNU Lesser General Public License,'' Version ;;; 2.1, February 1999, 59 Temple Place, Suite 330, Boston, MA 02111-1307 ;;; USA.@* ;;; @uref{http://www.gnu.org/copyleft/lesser.html} ;;; ;;; @item [Soundex] ;;; US National Archives and Records Administration, ``The Soundex Indexing ;;; System,'' 19 February 2000.@* ;;; @uref{http://www.archives.gov/research_room/genealogy/census/soundex.html} ;;; ;;; @end table