Repository: antzucaro/matchr
Branch: master
Commit: 7bed6ef61ef9
Files: 28
Total size: 56.5 KB
Directory structure:
gitextract_yrde_zzu/
├── COPYING.txt
├── README.md
├── damerau_levenshtein.go
├── damerau_levenshtein_test.go
├── go.mod
├── hamming.go
├── hamming_test.go
├── jarowinkler.go
├── jarowinkler_test.go
├── levenshtein.go
├── levenshtein_test.go
├── longestcommonsubsequence.go
├── longestcommonsubsequence_test.go
├── metaphone.go
├── metaphone_test.go
├── nysiis.go
├── nysiis_test.go
├── osa.go
├── osa_test.go
├── phonex.go
├── phonex_test.go
├── runestring.go
├── smithwaterman.go
├── smithwaterman_test.go
├── soundex.go
├── soundex_test.go
├── utf8.go
└── util.go
================================================
FILE CONTENTS
================================================
================================================
FILE: COPYING.txt
================================================
Matchr: an approximate string matching library for the Go programming language
Copyright (C) 2013-2014 Ant Zucaro
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
You can contact Ant Zucaro at azucaro at gmail dot com.
================================================
FILE: README.md
================================================
# matchr
[Go Reference](https://pkg.go.dev/github.com/antzucaro/matchr)
An approximate string matching library for the [Go programming language](http://www.golang.org).
## Rationale
Data used in record linkage can often be of dubious quality. Typographical
errors or changing data elements (to name a few things) make establishing similarity between two sets of data
difficult. Rather than use exact string comparison in such situations, it is
vital to have a means to identify how similar two strings are. Similarity functions can cater
to certain data sets in order to make better matching decisions. The matchr library provides
several of these similarity functions.
================================================
FILE: damerau_levenshtein.go
================================================
package matchr
// DamerauLevenshtein computes the Damerau-Levenshtein distance between two
// strings. The returned value - distance - is the number of insertions,
// deletions, substitutions, and transpositions it takes to transform one
// string (s1) into another (s2). Each step in the transformation "costs"
// one distance point. It is similar to the Optimal String Alignment
// algorithm, but is more complex because it allows multiple edits on
// substrings.
//
// Both inputs are compared by code point (rune), not by byte. If either
// string is empty, the result is the rune count of the other string.
//
// This implementation is based off of the one found on Wikipedia at
// http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions
// as well as KevinStern's Java implementation found at
// https://github.com/KevinStern/software-and-algorithms.
func DamerauLevenshtein(s1 string, s2 string) (distance int) {
// index by code point, not byte
r1 := []rune(s1)
r2 := []rune(s2)
// the maximum possible distance; used below as the cost of a
// transposition when no transposition candidate exists
inf := len(r1) + len(r2)
// if one string is blank, we need insertions
// for all characters in the other one
if len(r1) == 0 {
return len(r2)
}
if len(r2) == 0 {
return len(r1)
}
// construct the edit-tracking matrix; matrix[i][j] holds the distance
// between the first i+1 runes of r1 and the first j+1 runes of r2
// (there is no extra row/column for the empty prefix)
matrix := make([][]int, len(r1))
for i := range matrix {
matrix[i] = make([]int, len(r2))
}
// seen characters: maps a rune of r1 to the most recent row index at
// which it has been recorded
seenRunes := make(map[rune]int)
// top-left cell: comparing the two first runes costs one substitution
// unless they are equal
if r1[0] != r2[0] {
matrix[0][0] = 1
}
seenRunes[r1[0]] = 0
// fill the first column: prefixes of r1 against the single rune r2[0]
for i := 1; i < len(r1); i++ {
deleteDist := matrix[i-1][0] + 1
insertDist := (i+1)*1 + 1
var matchDist int
if r1[i] == r2[0] {
matchDist = i
} else {
matchDist = i + 1
}
matrix[i][0] = min(min(deleteDist, insertDist), matchDist)
}
// fill the first row: the single rune r1[0] against prefixes of r2
for j := 1; j < len(r2); j++ {
deleteDist := (j + 1) * 2
insertDist := matrix[0][j-1] + 1
var matchDist int
if r1[0] == r2[j] {
matchDist = j
} else {
matchDist = j + 1
}
matrix[0][j] = min(min(deleteDist, insertDist), matchDist)
}
// fill the remaining cells row by row
for i := 1; i < len(r1); i++ {
// largest column seen so far in this row where r2[column] == r1[i],
// or -1 when there has been none
var maxSrcMatchIndex int
if r1[i] == r2[0] {
maxSrcMatchIndex = 0
} else {
maxSrcMatchIndex = -1
}
for j := 1; j < len(r2); j++ {
// swapIndex is the last earlier row whose rune equals r2[j];
// jSwap is the last earlier column whose rune equals r1[i]
swapIndex, ok := seenRunes[r2[j]]
jSwap := maxSrcMatchIndex
deleteDist := matrix[i-1][j] + 1
insertDist := matrix[i][j-1] + 1
matchDist := matrix[i-1][j-1]
if r1[i] != r2[j] {
matchDist += 1
} else {
maxSrcMatchIndex = j
}
// for transpositions
var swapDist int
if ok && jSwap != -1 {
iSwap := swapIndex
// cost of everything before the swapped pair
var preSwapCost int
if iSwap == 0 && jSwap == 0 {
preSwapCost = 0
} else {
preSwapCost = matrix[maxI(0, iSwap-1)][maxI(0, jSwap-1)]
}
// pre-swap cost, plus the runes skipped between the swapped
// positions on both sides, plus one for the swap itself
swapDist = i + j + preSwapCost - iSwap - jSwap - 1
} else {
swapDist = inf
}
matrix[i][j] = min(min(min(deleteDist, insertDist), matchDist), swapDist)
}
// record this row's rune only after the row is complete, so lookups
// within the row only see strictly earlier rows
seenRunes[r1[i]] = i
}
// bottom-right cell: distance between the full strings
return matrix[len(r1)-1][len(r2)-1]
}
================================================
FILE: damerau_levenshtein_test.go
================================================
package matchr
import "testing"
// damlevtests pairs two inputs with the Damerau-Levenshtein distance that is
// expected between them.
var damlevtests = []struct {
	s1   string
	s2   string
	dist int
}{
	// insertion
	{"car", "cars", 1},
	// substitution
	{"library", "librari", 1},
	// deletion
	{"library", "librar", 1},
	// transposition
	{"library", "librayr", 1},
	// one empty, left
	{"", "library", 7},
	// one empty, right
	{"library", "", 7},
	// two empties
	{"", "", 0},
	// unicode stuff!
	{"Schüßler", "Schübler", 1},
	{"Schüßler", "Schußler", 1},
	{"Schüßler", "Schüßler", 0},
	{"Schßüler", "Schüßler", 1},
	{"Schüßler", "Schüler", 1},
	{"Schüßler", "Schüßlers", 1},
	// difference between DL and OSA. This is DL, so it should be 2.
	{"ca", "abc", 2},
}

// TestDamerauLevenshtein runs DamerauLevenshtein over every entry of
// damlevtests and reports each mismatch.
func TestDamerauLevenshtein(t *testing.T) {
	for idx := range damlevtests {
		tc := damlevtests[idx]
		if got := DamerauLevenshtein(tc.s1, tc.s2); got != tc.dist {
			t.Errorf("DamerauLevenshtein('%s', '%s') = %v, want %v", tc.s1, tc.s2, got, tc.dist)
		}
	}
}
================================================
FILE: go.mod
================================================
module github.com/antzucaro/matchr
go 1.13
================================================
FILE: hamming.go
================================================
package matchr
import "errors"
// Hamming returns the Hamming distance between s1 and s2: the number of
// positions at which the two strings hold different characters. Positions
// are counted in code points, not bytes.
//
// The strings must contain the same number of code points; otherwise a
// non-nil error is returned together with a distance of zero. The algorithm
// follows the description at http://en.wikipedia.org/wiki/Hamming_distance.
func Hamming(s1 string, s2 string) (distance int, err error) {
	// compare code point by code point rather than byte by byte
	left, right := []rune(s1), []rune(s2)

	if len(left) != len(right) {
		return distance, errors.New("Hamming distance of different sized strings.")
	}

	for pos := range left {
		if left[pos] != right[pos] {
			distance++
		}
	}
	return distance, nil
}
================================================
FILE: hamming_test.go
================================================
package matchr
import "testing"
// hamtests pairs two inputs with the Hamming distance expected between them
// and whether Hamming is expected to return an error.
var hamtests = []struct {
	s1   string
	s2   string
	dist int
	err  bool
}{
	{"", "", 0, false},
	{"cat", "cat", 0, false},
	{"car", "cat", 1, false},
	{"tar", "car", 1, false},
	{"xyz", "zyx", 2, false},
	{"wxyz", "zyx", 0, true},
	{"Schüßler", "Schübler", 1, false},
	{"Schüßler", "Schußler", 1, false},
}

// TestHamming checks both the distance and the error status returned by
// Hamming for every entry of hamtests.
func TestHamming(t *testing.T) {
	for _, tt := range hamtests {
		dist, err := Hamming(tt.s1, tt.s2)
		if dist != tt.dist {
			t.Errorf("Hamming('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist)
		}
		if tt.err && err == nil {
			t.Errorf("Hamming('%s', '%s') should throw an error", tt.s1, tt.s2)
		}
		// An error on a case that should succeed must fail the test as well.
		if !tt.err && err != nil {
			t.Errorf("Hamming('%s', '%s') returned unexpected error: %v", tt.s1, tt.s2, err)
		}
	}
}
================================================
FILE: jarowinkler.go
================================================
package matchr
// jaroWinklerBase is the shared implementation behind Jaro and JaroWinkler.
// It returns a similarity between 0 and 1 for s1 and s2, compared by code
// point. An empty input, or inputs with no matching characters, yield 0.
//
// When winklerize is true and the plain Jaro score exceeds 0.7, the score is
// boosted for up to four leading characters the strings share. When
// longTolerance is also true, a further boost is applied to longer strings
// that agree on enough additional characters.
//
// The matching window is derived from the longer string, whereas the prefix
// cap and the long-string tolerance test are bounded by the shorter string,
// as in Winkler's strcmp95. Earlier revisions used the longer length for
// both, which made the long-string tolerance test too strict for inputs of
// different lengths; results with longTolerance == false are unaffected.
func jaroWinklerBase(s1 string, s2 string,
	longTolerance bool, winklerize bool) (distance float64) {
	// index by code point, not byte
	r1 := []rune(s1)
	r2 := []rune(s2)

	r1Length := len(r1)
	r2Length := len(r2)

	if r1Length == 0 || r2Length == 0 {
		return
	}

	minLength, maxLength := r1Length, r2Length
	if minLength > maxLength {
		minLength, maxLength = maxLength, minLength
	}

	// how far apart two equal characters may sit and still count as a match
	searchRange := (maxLength / 2) - 1
	if searchRange < 0 {
		searchRange = 0
	}

	// flags marking which characters of each string have been matched
	r1Flag := make([]bool, r1Length)
	r2Flag := make([]bool, r2Length)

	// find the common chars within the acceptable range
	commonChars := 0
	for i := range r1 {
		lowLim := 0
		if i >= searchRange {
			lowLim = i - searchRange
		}
		hiLim := r2Length - 1
		if i+searchRange <= r2Length-1 {
			hiLim = i + searchRange
		}
		for j := lowLim; j <= hiLim; j++ {
			if !r2Flag[j] && r2[j] == r1[i] {
				r2Flag[j] = true
				r1Flag[i] = true
				commonChars++
				break
			}
		}
	}

	// if we have nothing in common at this point, nothing else can be done
	if commonChars == 0 {
		return
	}

	// otherwise we count the transpositions: walk the matched characters of
	// both strings in order and count the pairs that differ
	k := 0
	transCount := 0
	for i := range r1 {
		if !r1Flag[i] {
			continue
		}
		j := k
		for ; j < r2Length; j++ {
			if r2Flag[j] {
				k = j + 1
				break
			}
		}
		if r1[i] != r2[j] {
			transCount++
		}
	}
	transCount /= 2

	// adjust for similarities in nonmatched characters
	distance = float64(commonChars)/float64(r1Length) +
		float64(commonChars)/float64(r2Length) +
		(float64(commonChars-transCount))/float64(commonChars)
	distance /= 3.0

	// give more weight to already-similar strings
	if winklerize && distance > 0.7 {
		// at most the first 4 characters in common, and never more than the
		// shorter string holds
		prefixCap := minLength
		if prefixCap > 4 {
			prefixCap = 4
		}
		prefix := 0
		for prefix < prefixCap && r1[prefix] == r2[prefix] && nan(r1[prefix]) {
			prefix++
		}
		if prefix > 0 {
			distance += float64(prefix) * 0.1 * (1.0 - distance)
		}

		// optionally adjust for long strings: after the agreeing prefix, at
		// least two more characters must agree and the agreeing characters
		// must make up at least half of the shorter string's remainder
		if longTolerance && (minLength > 4) && (commonChars > prefix+1) &&
			(2*commonChars >= minLength+prefix) {
			if nan(r1[0]) {
				distance += (1.0 - distance) * (float64(commonChars-prefix-1) /
					(float64(r1Length) + float64(r2Length) - float64(prefix*2) + 2))
			}
		}
	}

	return
}
// Jaro computes the Jaro measure for two strings. The result is a float64
// between 0 and 1 inclusive, where 0 means the two strings are not at all
// similar and 1 means they are exact matches.
//
// See http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance for a
// full description.
func Jaro(r1 string, r2 string) (distance float64) {
	// plain Jaro: neither the Winkler prefix boost nor long-string tolerance
	const (
		longTolerance = false
		winklerize    = false
	)
	distance = jaroWinklerBase(r1, r2, longTolerance, winklerize)
	return
}
// JaroWinkler computes the Jaro-Winkler measure for two strings. It is a
// modification of the Jaro algorithm that gives additional weight to prefix
// matches. longTolerance additionally enables the adjustment for long
// strings.
func JaroWinkler(r1 string, r2 string, longTolerance bool) (distance float64) {
	// always apply the Winkler prefix boost
	const winklerize = true
	distance = jaroWinklerBase(r1, r2, longTolerance, winklerize)
	return
}
================================================
FILE: jarowinkler_test.go
================================================
package matchr
import "testing"
// jarotests pairs two inputs with the exact Jaro value expected for them.
var jarotests = []struct {
	s1   string
	s2   string
	dist float64
}{
	{"", "cars", 0.0},
	{"cars", "", 0.0},
	{"car", "cars", 0.9166666666666666},
	{"dixon", "dicksonx", 0.7666666666666666},
	{"martha", "marhta", 0.9444444444444445},
	{"dwayne", "duane", 0.8222222222222223},
	{"martüa", "marüta", 0.9444444444444445},
	{"dr", "driveway", 0.75},
}

// TestJaro runs Jaro over every entry of jarotests and reports each value
// that is not exactly the expected one.
func TestJaro(t *testing.T) {
	for idx := range jarotests {
		tc := jarotests[idx]
		if got := Jaro(tc.s1, tc.s2); got != tc.dist {
			t.Errorf("Jaro('%s', '%s') = %v, want %v", tc.s1, tc.s2, got, tc.dist)
		}
	}
}
// jarowtests pairs two inputs with the exact Jaro-Winkler value expected for
// them when long-string tolerance is disabled.
var jarowtests = []struct {
	s1   string
	s2   string
	dist float64
}{
	{"", "cars", 0.0},
	{"cars", "", 0.0},
	{"dixon", "dicksonx", 0.8133333333333332},
	{"martha", "marhta", 0.9611111111111111},
	{"dwayne", "duane", 0.8400000000000001},
	{"dr", "driveway", 0.8},
}

// TestJaroWinkler runs JaroWinkler (longTolerance off) over every entry of
// jarowtests and reports each value that is not exactly the expected one.
func TestJaroWinkler(t *testing.T) {
	for idx := range jarowtests {
		tc := jarowtests[idx]
		if got := JaroWinkler(tc.s1, tc.s2, false); got != tc.dist {
			t.Errorf("JaroWinkler('%s', '%s') = %v, want %v", tc.s1, tc.s2, got, tc.dist)
		}
	}
}
================================================
FILE: levenshtein.go
================================================
package matchr
// Levenshtein returns the Levenshtein distance between s1 and s2: the number
// of single-character insertions, deletions, and substitutions needed to
// turn s1 into s2, each costing one point. Characters are code points, not
// bytes.
func Levenshtein(s1 string, s2 string) (distance int) {
	// work on code points rather than bytes
	src := []rune(s1)
	dst := []rune(s2)

	height := len(src) + 1
	width := len(dst) + 1

	// grid is a height x width matrix stored row-major in one slice;
	// the cell for (row, col) lives at row*width + col
	grid := make([]int, height*width)

	// first column: distance from each prefix of src to the empty string
	for row := 0; row < height; row++ {
		grid[row*width] = row
	}
	// first row: distance from the empty string to each prefix of dst
	for col := 0; col < width; col++ {
		grid[col] = col
	}

	for col := 1; col < width; col++ {
		for row := 1; row < height; row++ {
			cell := row*width + col
			diagonal := grid[cell-width-1]

			// equal characters carry the diagonal value over unchanged
			if src[row-1] == dst[col-1] {
				grid[cell] = diagonal
				continue
			}

			fromAbove := grid[cell-width] + 1
			fromLeft := grid[cell-1] + 1
			grid[cell] = min(fromAbove, min(fromLeft, diagonal+1))
		}
	}

	// the bottom-right cell compares the two complete strings
	return grid[len(grid)-1]
}
================================================
FILE: levenshtein_test.go
================================================
package matchr
import "testing"
// levtests pairs two inputs with the Levenshtein distance expected between
// them.
var levtests = []struct {
	s1   string
	s2   string
	dist int
}{
	// insertion
	{"car", "cars", 1},
	// substitution
	{"library", "librari", 1},
	// deletion
	{"library", "librar", 1},
	// one empty, left
	{"", "library", 7},
	// one empty, right
	{"library", "", 7},
	// two empties
	{"", "", 0},
	// unicode stuff!
	{"Schüßler", "Schübler", 1},
	{"Schüßler", "Schußler", 1},
	{"Schüßler", "Schüßler", 0},
	{"Schüßler", "Schüler", 1},
	{"Schüßler", "Schüßlers", 1},
}

// TestLevenshtein runs Levenshtein over every entry of levtests and reports
// each mismatch.
func TestLevenshtein(t *testing.T) {
	for idx := range levtests {
		tc := levtests[idx]
		if got := Levenshtein(tc.s1, tc.s2); got != tc.dist {
			t.Errorf("Levenshtein('%s', '%s') = %v, want %v", tc.s1, tc.s2, got, tc.dist)
		}
	}
}
================================================
FILE: longestcommonsubsequence.go
================================================
package matchr
// LongestCommonSubsequence returns the length of the longest common
// subsequence of s1 and s2, i.e. the longest run of characters that appear
// in both strings in the same relative order, though not necessarily next to
// each other. Characters are code points, not bytes. If either string is
// empty the result is 0.
func LongestCommonSubsequence(s1, s2 string) int {
	r1 := []rune(s1)
	r2 := []rune(s2)

	// table[i][j] is the answer for the suffixes r1[i:] and r2[j:]. It is
	// sized by rune count (not byte length) so multi-byte input does not
	// allocate unused rows and columns; the extra row and column stay zero
	// and represent an empty suffix.
	table := make([][]int, len(r1)+1)
	for i := range table {
		table[i] = make([]int, len(r2)+1)
	}

	// fill from the ends of both strings back towards the start
	for i := len(r1) - 1; i >= 0; i-- {
		for j := len(r2) - 1; j >= 0; j-- {
			if r1[i] == r2[j] {
				table[i][j] = 1 + table[i+1][j+1]
				continue
			}
			// otherwise keep the better of skipping a rune in either string
			best := table[i+1][j]
			if table[i][j+1] > best {
				best = table[i][j+1]
			}
			table[i][j] = best
		}
	}

	return table[0][0]
}
================================================
FILE: longestcommonsubsequence_test.go
================================================
package matchr
import "testing"
// lcstests pairs two inputs with the expected length of their longest common
// subsequence.
var lcstests = []struct {
	s1     string
	s2     string
	length int
}{
	// match beginning
	{"cans", "can", 3},
	// match end
	{"ebay", "bay", 3},
	// gap in the middle
	{"coins", "cons", 4},
	// one empty, left
	{"", "hello", 0},
	// one empty, right
	{"goodbye", "", 0},
	// two empties
	{"", "", 0},
	// unicode stuff!
	{"Schüßler", "Schüßler", 8},
}

// TestLongestCommonSubsequence runs LongestCommonSubsequence over every entry
// of lcstests and reports each mismatch.
func TestLongestCommonSubsequence(t *testing.T) {
	for idx := range lcstests {
		tc := lcstests[idx]
		if got := LongestCommonSubsequence(tc.s1, tc.s2); got != tc.length {
			t.Errorf("LongestCommonSubsequence('%s', '%s') = %v, want %v", tc.s1, tc.s2, got, tc.length)
		}
	}
}
================================================
FILE: metaphone.go
================================================
package matchr
import (
"bytes"
"strings"
)
// metaphoneresult accumulates the primary and alternate codes produced while
// an input string is being encoded.
type metaphoneresult struct {
	// the maximum number of code values to calculate
	maxLength int
	// whether to calculate an alternate
	calcAlternate bool
	// no direct modifications - only through add()
	primary   bytes.Buffer
	alternate bytes.Buffer
	// length of the private buffers
	PrimaryLength   int
	AlternateLength int
}

// newMetaphoneresult returns an empty result that keeps at most maxLength
// code values per encoding and records an alternate encoding only when
// calcAlternate is true.
func newMetaphoneresult(maxLength int, calcAlternate bool) (r *metaphoneresult) {
	r = &metaphoneresult{maxLength: maxLength, calcAlternate: calcAlternate}
	return
}

// add appends c1 to the primary encoding and, when an alternate is being
// calculated, c2 to the alternate encoding. Empty strings are ignored.
func (r *metaphoneresult) add(c1 string, c2 string) {
	if c1 != "" {
		r.primary.WriteString(c1)
		r.PrimaryLength += len(c1)
	}
	if c2 != "" && r.calcAlternate {
		r.alternate.WriteString(c2)
		r.AlternateLength += len(c2)
	}
}

// isComplete reports whether every encoding being calculated has reached
// maxLength. The alternate encoding is only considered when it is being
// calculated; it never grows otherwise, so requiring it unconditionally
// would keep this from ever reporting true.
func (r *metaphoneresult) isComplete() bool {
	if r.PrimaryLength < r.maxLength {
		return false
	}
	return !r.calcAlternate || r.AlternateLength >= r.maxLength
}

// result returns the primary and alternate encodings, each truncated to at
// most maxLength bytes.
func (r *metaphoneresult) result() (primary string, alternate string) {
	primary = r.primary.String()
	if len(primary) > r.maxLength {
		primary = primary[0:r.maxLength]
	}
	alternate = r.alternate.String()
	if len(alternate) > r.maxLength {
		alternate = alternate[0:r.maxLength]
	}
	return
}
// utility functions for checking things within a string
// isSlavoGermanic reports whether value contains any of the substrings
// "W", "K", "CZ" or "WITZ". The match is case-sensitive.
func isSlavoGermanic(value string) bool {
	for _, marker := range [...]string{"W", "K", "CZ", "WITZ"} {
		if strings.Contains(value, marker) {
			return true
		}
	}
	return false
}
// isSilentStart reports whether the first two characters of input are one of
// "GN", "KN", "PN", "WR" or "PS".
func isSilentStart(input runestring) bool {
	switch input.SafeSubstr(0, 2) {
	case "GN", "KN", "PN", "WR", "PS":
		return true
	}
	return false
}
// handleVowel encodes a vowel found at index. Only a vowel at the very start
// of the input contributes a code ("A" for both encodings); anywhere else it
// is skipped. It returns the index of the next character to process.
func handleVowel(result *metaphoneresult, index int) int {
	next := index + 1
	if index != 0 {
		return next
	}
	result.add("A", "A")
	return next
}
/******************************************************************************
* Entry handlers for letters.
*****************************************************************************/
// handleC encodes the 'C' found at index and returns the index of the next
// character to process. The cases are tried in order; the first one that
// applies decides both the codes added and how many characters are consumed.
func handleC(input runestring, result *metaphoneresult, index int) int {
	switch {
	case conditionC0(input, index):
		result.add("K", "K")
		index += 2

	case index == 0 && input.Contains(index, 6, "CAESAR"):
		result.add("S", "S")
		index += 2

	case input.Contains(index, 2, "CH"):
		index = handleCH(input, result, index)

	case input.Contains(index, 2, "CZ") &&
		!input.Contains(index-2, 4, "WICZ"):
		result.add("S", "X")
		index += 2

	case input.Contains(index+1, 3, "CIA"):
		result.add("X", "X")
		index += 3

	case input.Contains(index, 2, "CC") &&
		!(index == 1 && input.SafeAt(0) == 'M'):
		// double C, except directly after a leading 'M'
		return handleCC(input, result, index)

	case input.Contains(index, 2, "CK") ||
		input.Contains(index, 2, "CG") ||
		input.Contains(index, 2, "CQ"):
		result.add("K", "K")
		index += 2

	case input.Contains(index, 2, "CI") ||
		input.Contains(index, 2, "CE") ||
		input.Contains(index, 2, "CY"):
		if input.Contains(index, 3, "CIO") ||
			input.Contains(index, 3, "CIE") ||
			input.Contains(index, 3, "CIA") {
			result.add("S", "X")
		} else {
			result.add("S", "S")
		}
		index += 2

	default:
		result.add("K", "K")
		switch {
		case input.Contains(index+1, 2, " C") ||
			input.Contains(index+1, 2, " Q") ||
			input.Contains(index+1, 2, " G"):
			// a space followed by C, Q or G is consumed together with the C
			index += 3
		case (input.Contains(index+1, 1, "C") ||
			input.Contains(index+1, 1, "K") ||
			input.Contains(index+1, 1, "Q")) &&
			!(input.Contains(index+1, 2, "CE") ||
				input.Contains(index+1, 2, "CI")):
			index += 2
		default:
			index++
		}
	}
	return index
}
// handleCC encodes a "CC" starting at index and returns the index of the
// next character to process.
func handleCC(input runestring, result *metaphoneresult, index int) int {
	// unless the CC is followed by I, E or H (but not "HU"), it is a plain K
	if !input.Contains(index+2, 1, "I", "E", "H") ||
		input.Contains(index+2, 2, "HU") {
		result.add("K", "K")
		return index + 2
	}

	if (index == 1 && input.SafeAt(index-1) == 'A') ||
		(input.Contains(index-1, 5, "UCCEE", "UCCES")) {
		result.add("KS", "KS")
	} else {
		result.add("X", "X")
	}
	return index + 3
}
// handleCH encodes a "CH" starting at index. Every case consumes exactly the
// two characters, so the returned index is always index + 2.
func handleCH(input runestring, result *metaphoneresult, index int) int {
	switch {
	case index > 0 && input.Contains(index, 4, "CHAE"):
		result.add("K", "X")
	case conditionCH0(input, index) || conditionCH1(input, index):
		result.add("K", "K")
	case index <= 0:
		// CH at the very start of the input
		result.add("X", "X")
	case input.Contains(0, 2, "MC"):
		// input starts with "MC"
		result.add("K", "K")
	default:
		result.add("X", "K")
	}
	return index + 2
}
// handleD encodes the 'D' found at index and returns the index of the next
// character to process.
func handleD(input runestring, result *metaphoneresult, index int) int {
	switch {
	case input.Contains(index, 2, "DG"):
		// "DG" before I, E or Y is a J sound; otherwise T followed by K
		if input.Contains(index+2, 1, "I", "E", "Y") {
			result.add("J", "J")
			return index + 3
		}
		result.add("TK", "TK")
		return index + 2
	case input.Contains(index, 2, "DT", "DD"):
		result.add("T", "T")
		return index + 2
	default:
		result.add("T", "T")
		return index + 1
	}
}
// handleG encodes the 'G' found at index and returns the index of the next
// character to process. slavoGermanic is the result of isSlavoGermanic for
// the whole input. The cases are tried in order and the first match wins.
func handleG(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
	switch {
	case input.SafeAt(index+1) == 'H':
		index = handleGH(input, result, index)

	case input.SafeAt(index+1) == 'N':
		switch {
		case index == 1 && isVowel(input.SafeAt(0)) && !slavoGermanic:
			result.add("KN", "N")
		case !input.Contains(index+2, 2, "EY") && input.SafeAt(index+1) != 'Y' && !slavoGermanic:
			result.add("N", "KN")
		default:
			result.add("KN", "KN")
		}
		index += 2

	case input.Contains(index+1, 2, "LI") && !slavoGermanic:
		result.add("KL", "L")
		index += 2

	case index == 0 && (input.SafeAt(index+1) == 'Y' ||
		input.Contains(index+1, 2, "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER")):
		// leading G followed by Y or one of the listed pairs
		result.add("K", "J")
		index += 2

	case (input.Contains(index+1, 2, "ER") ||
		input.SafeAt(index+1) == 'Y') &&
		!input.Contains(0, 6, "DANGER", "RANGER", "MANGER") &&
		!input.Contains(index-1, 1, "E", "I") &&
		!input.Contains(index-1, 3, "RGY", "OGY"):
		result.add("K", "J")
		index += 2

	case input.Contains(index+1, 1, "E", "I", "Y") ||
		input.Contains(index-1, 4, "AGGI", "OGGI"):
		switch {
		case input.Contains(0, 4, "VAN ", "VON ") ||
			input.Contains(0, 3, "SCH") ||
			input.Contains(index+1, 2, "ET"):
			result.add("K", "K")
		case input.Contains(index+1, 3, "IER"):
			result.add("J", "J")
		default:
			result.add("J", "K")
		}
		index += 2

	case input.SafeAt(index+1) == 'G':
		// doubled G counts once
		result.add("K", "K")
		index += 2

	default:
		result.add("K", "K")
		index++
	}
	return index
}
// handleGH encodes a "GH" starting at index. Every case consumes exactly the
// two characters, so the returned index is always index + 2; the cases only
// differ in which codes, if any, are added.
func handleGH(input runestring, result *metaphoneresult, index int) int {
	switch {
	case index > 0 && !isVowel(input.SafeAt(index-1)):
		result.add("K", "K")

	case index == 0:
		if input.SafeAt(index+2) == 'I' {
			result.add("J", "J")
		} else {
			result.add("K", "K")
		}

	case (index > 1 && input.Contains(index-2, 1, "B", "H", "D")) ||
		(index > 2 && input.Contains(index-3, 1, "B", "H", "D")) ||
		(index > 3 && input.Contains(index-4, 1, "B", "H")):
		// nothing is added in this case

	case index > 2 && input.SafeAt(index-1) == 'U' &&
		input.Contains(index-3, 1, "C", "G", "L", "R", "T"):
		result.add("F", "F")

	case index > 0 && input.SafeAt(index-1) != 'I':
		result.add("K", "K")
	}
	return index + 2
}
// handleH encodes the 'H' found at index and returns the index of the next
// character to process. An H is only kept when it is followed by a vowel and
// is either the first character or itself preceded by a vowel.
func handleH(input runestring, result *metaphoneresult, index int) int {
	startsOrFollowsVowel := index == 0 || isVowel(input.SafeAt(index-1))
	if !startsOrFollowsVowel || !isVowel(input.SafeAt(index+1)) {
		return index + 1
	}
	result.add("H", "H")
	return index + 2
}
// handleJ encodes the 'J' found at index and returns the index of the next
// character to process. slavoGermanic is the result of isSlavoGermanic for
// the whole input.
func handleJ(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
	// "JOSE" at this position, or an input starting with "SAN "
	if input.Contains(index, 4, "JOSE") || input.Contains(0, 4, "SAN ") {
		if (index == 0 && input.SafeAt(index+4) == ' ') ||
			len(input) == 4 ||
			input.Contains(0, 4, "SAN ") {
			result.add("H", "H")
		} else {
			result.add("J", "H")
		}
		return index + 1
	}

	switch {
	case index == 0 && !input.Contains(index, 4, "JOSE"):
		result.add("J", "A")
	case isVowel(input.SafeAt(index-1)) && !slavoGermanic &&
		(input.SafeAt(index+1) == 'A' || input.SafeAt(index+1) == 'O'):
		result.add("J", "H")
	case index == (len(input) - 1):
		// final J: the alternate gets a single space
		result.add("J", " ")
	case !input.Contains(index+1, 1,
		"L", "T", "K", "S", "N", "M", "B", "Z") &&
		!input.Contains(index-1, 1, "S", "K", "L"):
		result.add("J", "J")
	}

	// a doubled J is consumed as one
	if input.SafeAt(index+1) == 'J' {
		return index + 2
	}
	return index + 1
}
// handleL encodes the 'L' found at index and returns the index of the next
// character to process. A doubled L is consumed as one; when conditionL0
// holds for it, only the primary encoding receives the "L".
func handleL(input runestring, result *metaphoneresult, index int) int {
	if input.SafeAt(index+1) != 'L' {
		result.add("L", "L")
		return index + 1
	}

	alternate := "L"
	if conditionL0(input, index) {
		alternate = ""
	}
	result.add("L", alternate)
	return index + 2
}
// handleP encodes the 'P' found at index and returns the index of the next
// character to process. "PH" becomes F; otherwise the P is kept and a
// directly following P or B is consumed with it.
func handleP(input runestring, result *metaphoneresult, index int) int {
	if input.SafeAt(index+1) == 'H' {
		result.add("F", "F")
		return index + 2
	}

	result.add("P", "P")
	if input.Contains(index+1, 1, "P", "B") {
		return index + 2
	}
	return index + 1
}
// handleR encodes the 'R' found at index and returns the index of the next
// character to process. slavoGermanic is the result of isSlavoGermanic for
// the whole input. A doubled R is consumed as one.
func handleR(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
	// A final R after "IE" (but not after "ME"/"MA" two characters earlier)
	// in a non-Slavo-Germanic input is left out of the primary encoding.
	primary := "R"
	if index == (len(input)-1) && !slavoGermanic &&
		input.Contains(index-2, 2, "IE") &&
		!input.Contains(index-4, 2, "ME", "MA") {
		primary = ""
	}
	result.add(primary, "R")

	if input.SafeAt(index+1) == 'R' {
		return index + 2
	}
	return index + 1
}
// handleS encodes the 'S' found at index and returns the index of the next
// character to process. slavoGermanic is the result of isSlavoGermanic for
// the whole input. The cases are tried in order and the first match wins.
func handleS(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
	switch {
	case input.Contains(index-1, 3, "ISL", "YSL"):
		// the S is skipped without adding a code
		index++

	case index == 0 && input.Contains(index, 5, "SUGAR"):
		result.add("X", "S")
		index++

	case input.Contains(index, 2, "SH"):
		if input.Contains(index+1, 4, "HEIM", "HOEK", "HOLM", "HOLZ") {
			result.add("S", "S")
		} else {
			result.add("X", "X")
		}
		index += 2

	case input.Contains(index, 3, "SIO", "SIA") ||
		input.Contains(index, 4, "SIAN"):
		if slavoGermanic {
			result.add("S", "S")
		} else {
			result.add("S", "X")
		}
		index += 3

	case (index == 0 && input.Contains(index+1, 1, "M", "N", "L", "W")) ||
		input.Contains(index+1, 1, "Z"):
		result.add("S", "X")
		// a following Z is consumed together with the S
		if input.Contains(index+1, 1, "Z") {
			index += 2
		} else {
			index++
		}

	case input.Contains(index, 2, "SC"):
		index = handleSC(input, result, index)

	default:
		// a final S after "AI" or "OI" only goes to the alternate encoding
		if index == len(input)-1 &&
			input.Contains(index-2, 2, "AI", "OI") {
			result.add("", "S")
		} else {
			result.add("S", "S")
		}
		if input.Contains(index+1, 1, "S", "Z") {
			index += 2
		} else {
			index++
		}
	}
	return index
}
// handleSC encodes an "SC" starting at index. Every case consumes three
// characters, so the returned index is always index + 3.
func handleSC(input runestring, result *metaphoneresult, index int) int {
	switch {
	case input.SafeAt(index+2) == 'H':
		// "SCH": the two characters after it decide the codes
		switch {
		case !input.Contains(index+3, 2, "OO", "ER", "EN", "UY", "ED", "EM"):
			if index == 0 && !isVowel(input.SafeAt(3)) && input.SafeAt(3) != 'W' {
				result.add("X", "S")
			} else {
				result.add("X", "X")
			}
		case input.Contains(index+3, 2, "ER", "EN"):
			result.add("X", "SK")
		default:
			result.add("SK", "SK")
		}
	case input.Contains(index+2, 1, "I", "E", "Y"):
		result.add("S", "S")
	default:
		result.add("SK", "SK")
	}
	return index + 3
}
// handleT encodes the 'T' found at index and returns the index of the next
// character to process.
func handleT(input runestring, result *metaphoneresult, index int) int {
	switch {
	case input.Contains(index, 4, "TION") ||
		input.Contains(index, 3, "TIA", "TCH"):
		result.add("X", "X")
		return index + 3

	case input.Contains(index, 2, "TH") || input.Contains(index, 3, "TTH"):
		// plain T before "OM"/"AM" or when the input starts with "VAN ",
		// "VON " or "SCH"; otherwise "0" in the primary encoding
		if input.Contains(index+2, 2, "OM", "AM") ||
			input.Contains(0, 4, "VAN ", "VON ") ||
			input.Contains(0, 3, "SCH") {
			result.add("T", "T")
		} else {
			result.add("0", "T")
		}
		return index + 2

	default:
		result.add("T", "T")
		// a directly following T or D is consumed with this T
		if input.Contains(index+1, 1, "T", "D") {
			return index + 2
		}
		return index + 1
	}
}
// handleW encodes the 'W' found at index and returns the index of the next
// character to process.
func handleW(input runestring, result *metaphoneresult, index int) int {
	if input.Contains(index, 2, "WR") {
		result.add("R", "R")
		return index + 2
	}

	switch {
	case index == 0 && (isVowel(input.SafeAt(index+1)) ||
		input.Contains(index, 2, "WH")):
		// leading W before a vowel, or leading "WH"
		if isVowel(input.SafeAt(index + 1)) {
			result.add("A", "F")
		} else {
			result.add("A", "A")
		}

	case (index == len(input)-1 && isVowel(input.SafeAt(index-1))) ||
		input.Contains(index-1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
		input.Contains(0, 3, "SCH"):
		// only the alternate encoding receives a code here
		result.add("", "F")

	case input.Contains(index, 4, "WICZ", "WITZ"):
		result.add("TS", "FX")
		return index + 4
	}
	return index + 1
}
// handleX encodes the 'X' found at index and returns the index of the next
// character to process. A leading X becomes S. Elsewhere it becomes KS,
// except for a final X that follows "IAU", "EAU", "AU" or "OU", which adds
// nothing. A directly following C or X is consumed with it.
func handleX(input runestring, result *metaphoneresult, index int) int {
	if index == 0 {
		result.add("S", "S")
		return index + 1
	}

	silentFinal := (index == len(input)-1) &&
		(input.Contains(index-3, 3, "IAU", "EAU") ||
			input.Contains(index-2, 2, "AU", "OU"))
	if !silentFinal {
		result.add("KS", "KS")
	}

	if input.Contains(index+1, 1, "C", "X") {
		return index + 2
	}
	return index + 1
}
// handleZ encodes the 'Z' found at index and returns the index of the next
// character to process. slavoGermanic is the result of isSlavoGermanic for
// the whole input. A doubled Z is consumed as one.
func handleZ(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
	switch {
	case input.SafeAt(index+1) == 'H':
		result.add("J", "J")
	case input.Contains(index+1, 2, "ZO", "ZI", "ZA") ||
		(slavoGermanic && (index > 0 && input.SafeAt(index-1) != 'T')):
		result.add("S", "TS")
	default:
		result.add("S", "S")
	}

	if input.SafeAt(index+1) == 'Z' {
		return index + 2
	}
	return index + 1
}
/******************************************************************************
* Complex conditional handlers for letters
*****************************************************************************/
// conditionC0 is the first test handleC applies to the 'C' at index. It
// holds when "CHIA" starts at index, or when the C is preceded by a
// non-vowel followed by 'A', is itself followed by 'H', and the character
// after that is neither 'I' nor 'E' (unless the surrounding six characters
// are "BACHER" or "MACHER").
func conditionC0(input runestring, index int) bool {
	if input.Contains(index, 4, "CHIA") {
		return true
	}
	if index <= 1 ||
		isVowel(input.SafeAt(index-2)) ||
		!input.Contains(index-1, 3, "ACH") {
		return false
	}

	following := input.SafeAt(index + 2)
	if following != 'I' && following != 'E' {
		return true
	}
	return input.Contains(index-2, 6, "BACHER") ||
		input.Contains(index-2, 6, "MACHER")
}
// conditionCH0 holds for a "CH" at the very start of the input that is
// followed by "ARAC", "ARIS", "OR", "YM", "IA" or "EM", unless the input
// starts with "CHORE".
func conditionCH0(input runestring, index int) bool {
	if index != 0 {
		return false
	}
	if !input.Contains(index+1, 5, "HARAC", "HARIS") &&
		!input.Contains(index+1, 3, "HOR", "HYM", "HIA", "HEM") {
		return false
	}
	return !input.Contains(0, 5, "CHORE")
}
// conditionCH1 is the second test handleCH applies to the "CH" at index. The
// individual alternatives are checked one after another, in order.
func conditionCH1(input runestring, index int) bool {
	// the input as a whole starts with "VAN ", "VON " or "SCH"
	if input.Contains(0, 4, "VAN ", "VON ") || input.Contains(0, 3, "SCH") {
		return true
	}
	// the CH sits inside "ORCHES", "ARCHIT" or "ORCHID"
	if input.Contains(index-2, 6, "ORCHES", "ARCHIT", "ORCHID") {
		return true
	}
	// the CH is followed by T or S
	if input.Contains(index+2, 1, "T", "S") {
		return true
	}
	// otherwise the CH must start the input or follow A, O, U or E ...
	if !(input.Contains(index-1, 1, "A", "O", "U", "E") || index == 0) {
		return false
	}
	// ... and be followed by one of the listed characters or end the input
	return input.Contains(index+2, 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ") ||
		index+1 == len(input)-1
}
// conditionL0 is the test handleL applies to a doubled L whose first L is at
// index. It holds when the "LL" is the middle of a trailing "ILLO", "ILLA"
// or "ALLE", or when it is part of "ALLE" and the input ends in "AS", "OS",
// "A" or "O".
func conditionL0(input runestring, index int) bool {
	if index == (len(input)-3) &&
		input.Contains(index-1, 4, "ILLO", "ILLA", "ALLE") {
		return true
	}

	endsInAO := input.Contains(len(input)-2, 2, "AS", "OS") ||
		input.Contains(len(input)-1, 1, "A", "O")
	return endsInAO && input.Contains(index-1, 4, "ALLE")
}
// conditionM0 reports whether the 'M' at index consumes the following
// character as well: either that character is another M, or the M is the
// middle of "UMB" and the B either ends the input or is followed by "ER".
func conditionM0(input runestring, index int) bool {
	if input.SafeAt(index+1) == 'M' {
		return true
	}
	if !input.Contains(index-1, 3, "UMB") {
		return false
	}
	return (index+1) == (len(input)-1) ||
		input.Contains(index+2, 2, "ER")
}
// DoubleMetaphone computes the Double-Metaphone value of the input string.
// This value is a phonetic representation of how the string sounds, with
// affordances for many different language dialects. It was originally
// developed by Lawrence Phillips in the 1990s.
//
// Two encodings are returned, the primary one first and the alternate one
// second, each at most four characters long.
//
// More information about this algorithm can be found on Wikipedia at
// http://en.wikipedia.org/wiki/Metaphone.
func DoubleMetaphone(s1 string) (string, string) {
	// normalize the input first (per the original note: trim and upper-case;
	// see cleanInput for the exact rules)
	s1 = cleanInput(s1)

	// structure to traverse the string by code point, not byte
	input := runestring(s1)

	slavoGermanic := isSlavoGermanic(s1)

	// position of the character currently being encoded; a silent start
	// skips the first character
	index := 0
	if isSilentStart(input) {
		index++
	}

	result := newMetaphoneresult(4, true)

	// addSkippingDouble adds code to both encodings for a letter with no
	// special rules and steps over an immediately following doubled letter.
	addSkippingDouble := func(code string, doubled rune) {
		result.add(code, code)
		if rune(input.SafeAt(index+1)) == doubled {
			index += 2
		} else {
			index++
		}
	}

	for !result.isComplete() && index <= len(input)-1 {
		switch c := rune(input.SafeAt(index)); c {
		case 'A', 'E', 'I', 'O', 'U', 'Y':
			index = handleVowel(result, index)

		// letters with a fixed code; a doubled letter counts once
		case 'B':
			addSkippingDouble("P", 'B')
		case 'F':
			addSkippingDouble("F", 'F')
		case 'K':
			addSkippingDouble("K", 'K')
		case 'N':
			addSkippingDouble("N", 'N')
		case 'Q':
			addSkippingDouble("K", 'Q')
		case 'V':
			addSkippingDouble("F", 'V')

		// letters with a fixed code and no doubling rule
		case 'Ç':
			result.add("S", "S")
			index++
		case 'Ñ':
			result.add("N", "N")
			index++

		case 'M':
			result.add("M", "M")
			if conditionM0(input, index) {
				index += 2
			} else {
				index++
			}

		// letters with their own handler
		case 'C':
			index = handleC(input, result, index)
		case 'D':
			index = handleD(input, result, index)
		case 'G':
			index = handleG(input, result, index, slavoGermanic)
		case 'H':
			index = handleH(input, result, index)
		case 'J':
			index = handleJ(input, result, index, slavoGermanic)
		case 'L':
			index = handleL(input, result, index)
		case 'P':
			index = handleP(input, result, index)
		case 'R':
			index = handleR(input, result, index, slavoGermanic)
		case 'S':
			index = handleS(input, result, index, slavoGermanic)
		case 'T':
			index = handleT(input, result, index)
		case 'W':
			index = handleW(input, result, index)
		case 'X':
			index = handleX(input, result, index)
		case 'Z':
			index = handleZ(input, result, index, slavoGermanic)

		default:
			// anything else contributes nothing
			index++
		}
	}

	return result.result()
}
================================================
FILE: metaphone_test.go
================================================
package matchr
import (
"bufio"
"compress/gzip"
"os"
"strings"
"testing"
)
// TestDoubleMetaphone checks DoubleMetaphone against a gzipped corpus in which
// every line has the form "input|primary|alternate".
//
// Problems with the corpus itself (missing file, bad gzip stream, read errors,
// malformed lines) fail the test through the testing framework instead of
// panicking, so the failure is attributed to this test.
func TestDoubleMetaphone(t *testing.T) {
	const corpus = "double_metaphone_corpus.txt.gz"

	f, err := os.Open(corpus)
	if err != nil {
		t.Fatalf("opening %s: %v", corpus, err)
	}
	defer f.Close()

	g, err := gzip.NewReader(f)
	if err != nil {
		t.Fatalf("reading %s as gzip: %v", corpus, err)
	}
	defer g.Close()

	// A Scanner also yields a final line that has no trailing newline, which
	// a ReadString('\n') loop conditioned on err == nil would skip.
	scanner := bufio.NewScanner(g)
	for lineNo := 1; scanner.Scan(); lineNo++ {
		line := scanner.Text()
		v := strings.Split(line, "|")
		if len(v) < 3 {
			t.Fatalf("%s:%d: malformed corpus line %q", corpus, lineNo, line)
		}

		metaphone, alternate := DoubleMetaphone(v[0])
		if metaphone != v[1] || alternate != v[2] {
			// Stop at the first mismatch, as the corpus is large.
			t.Fatalf("DoubleMetaphone('%s') = (%v, %v), want (%v, %v)", v[0], metaphone, alternate, v[1], v[2])
		}
	}
	if err := scanner.Err(); err != nil {
		t.Fatalf("reading %s: %v", corpus, err)
	}
}
================================================
FILE: nysiis.go
================================================
package matchr
// NYSIIS returns the NYSIIS phonetic key of s1. NYSIIS is a refinement of the
// classic Soundex scheme.
//
// The input is cleaned with cleanInput and then reduced to the letters A-Z;
// if nothing remains the result is "". The key is at most six characters
// long.
func NYSIIS(s1 string) string {
	cleaned := runestring(cleanInput(s1))

	// Step 0: only the upper-case letters A-Z take part in the encoding.
	// Digits, punctuation, spaces and non-ASCII letters are dropped here.
	input := runestring(make([]rune, 0, len(s1)))
	for _, r := range cleaned {
		if r >= 'A' && r <= 'Z' {
			input = append(input, r)
		}
	}
	if len(input) == 0 {
		return ""
	}

	// Step 1: rewrite well-known prefixes in place.
	switch input[0] {
	case 'M':
		// MAC -> MCC
		if input.SafeSubstr(0, 3) == "MAC" {
			input[1] = 'C'
		}
	case 'K':
		// KN -> NN, any other leading K -> C
		if input.SafeAt(1) == 'N' {
			input[0] = 'N'
		} else {
			input[0] = 'C'
		}
	case 'P':
		switch input.SafeAt(1) {
		case 'H':
			// PH -> FF
			input[0], input[1] = 'F', 'F'
		case 'F':
			// PF -> FF
			input[0] = 'F'
		}
	case 'S':
		// SCH -> SSS
		if input.SafeSubstr(0, 3) == "SCH" {
			input[1], input[2] = 'S', 'S'
		}
	}

	// Step 2: collapse well-known two-letter endings into a single letter.
	var ending rune
	switch input.SafeSubstr(len(input)-2, 2) {
	case "EE", "IE":
		ending = 'Y'
	case "DT", "RT", "RD", "NT", "ND":
		ending = 'D'
	}
	if ending != 0 {
		input.Del(len(input) - 2)
		input[len(input)-1] = ending
	}

	// Step 3: the key starts with the first letter of the (rewritten) name.
	// The key cannot be longer than the original string.
	key := runestring(make([]rune, 0, len(s1)))
	key = append(key, input[0])
	prev := input[0]

	// Step 4: translate the remaining letters in place, appending each
	// translated letter unless it repeats the one before it. len(input) is
	// re-read on every pass because the KN rule shortens the input.
	for i := 1; i < len(input); i++ {
		switch input[i] {
		case 'A', 'I', 'O', 'U':
			// every vowel becomes A (E is handled separately below)
			input[i] = 'A'
		case 'E':
			// EV -> AF, any other E -> A
			if input.SafeAt(i+1) == 'V' {
				input[i+1] = 'F'
			}
			input[i] = 'A'
		case 'Q':
			input[i] = 'G'
		case 'Z':
			input[i] = 'S'
		case 'M':
			input[i] = 'N'
		case 'K':
			// KN -> N (drop the K so the N takes its place), else K -> C
			if input.SafeAt(i+1) == 'N' {
				input.Del(i)
			} else {
				input[i] = 'C'
			}
		case 'S':
			// SCH -> SSS
			if input.SafeSubstr(i, 3) == "SCH" {
				input[i+1], input[i+2] = 'S', 'S'
			}
		case 'P':
			// PH -> FF
			if input.SafeAt(i+1) == 'H' {
				input[i], input[i+1] = 'F', 'F'
			}
		case 'H':
			// H copies the preceding letter unless it sits between two
			// vowels.
			before, after := input.SafeAt(i-1), input.SafeAt(i+1)
			if !(isVowelNoY(before) && isVowelNoY(after)) {
				input[i] = before
			}
		case 'W':
			// W copies a preceding vowel.
			if before := input.SafeAt(i - 1); isVowelNoY(before) {
				input[i] = before
			}
		}

		cur := input[i]
		if cur != 0 && cur != prev {
			key = append(key, cur)
		}
		prev = cur
	}

	// Step 5: tidy the end of the key. The first key character is never
	// removed, hence the length guard.
	if len(key) > 1 {
		// drop a trailing S
		if key.SafeAt(len(key)-1) == 'S' {
			key.Del(len(key) - 1)
		}
		// trailing AY -> Y
		if len(key) > 2 && key.SafeSubstr(len(key)-2, 2) == "AY" {
			key.Del(len(key) - 2)
		}
		// drop a trailing A
		if key.SafeAt(len(key)-1) == 'A' {
			key.Del(len(key) - 1)
		}
	}

	// Step 6: the key is truncated to six characters.
	if len(key) > 6 {
		return string(key[:6])
	}
	return string(key)
}
================================================
FILE: nysiis_test.go
================================================
package matchr
import "testing"
// nysiistests lists input names (s1) together with the key that TestNYIIS
// expects NYSIIS to return for them (nysiis). The last entries cover inputs
// that contain no letters at all, for which the expected key is empty.
var nysiistests = []struct {
s1 string
nysiis string
}{
{"knight", "NAGT"},
{"mitchell", "MATCAL"},
{"o'daniel", "ODANAL"},
{"brown sr", "BRANSR"},
{"browne III", "BRAN"},
{"browne IV", "BRANAV"},
{"O'Banion", "OBANAN"},
{"Mclaughlin", "MCLAGL"},
{"McCormack", "MCARNA"},
{"Chapman", "CAPNAN"},
{"Silva", "SALV"},
{"McDonald", "MCDANA"},
{"Lawson", "LASAN"},
{"Jacobs", "JACAB"},
{"Greene", "GRAN"},
{"O'Brien", "OBRAN"},
{"Morrison", "MARASA"},
{"Larson", "LARSAN"},
{"Willis", "WAL"},
{"Mackenzie", "MCANSY"},
{"Carr", "CAR"},
{"Lawrence", "LARANC"},
{"Matthews", "MAT"},
{"Richards", "RACARD"},
{"Bishop", "BASAP"},
{"Franklin", "FRANCL"},
{"McDaniel", "MCDANA"},
{"Harper", "HARPAR"},
{"Lynch", "LYNC"},
{"Watkins", "WATCAN"},
{"Carlson", "CARLSA"},
{"Wheeler", "WALAR"},
{"Louis XVI", "LASXV"},
{"2002", ""},
{"1/2", ""},
{"", ""},
}
// TestNYIIS runs NYSIIS over every entry of nysiistests and reports each
// input whose key differs from the expected one.
func TestNYIIS(t *testing.T) {
	for i := range nysiistests {
		tc := nysiistests[i]
		if got := NYSIIS(tc.s1); got != tc.nysiis {
			t.Errorf("NYSIIS('%s') = %v, want %v", tc.s1, got, tc.nysiis)
		}
	}
}
================================================
FILE: osa.go
================================================
package matchr
// OSA computes the Optimal String Alignment distance between two
// strings. The returned value - distance - is the number of insertions,
// deletions, substitutions, and transpositions of adjacent characters it
// takes to transform one string (s1) into another (s2). Each step in the
// transformation "costs" one distance point. It is similar to
// Damerau-Levenshtein, but is simpler because it does not allow multiple
// edits on any substring.
//
// Both strings are compared by code point, not by byte.
func OSA(s1 string, s2 string) (distance int) {
	// index by code point, not byte
	r1 := []rune(s1)
	r2 := []rune(s2)

	rows := len(r1) + 1
	cols := len(r2) + 1

	// dist is a rows x cols matrix stored row by row: dist[i*cols+j] is the
	// distance between the first i runes of s1 and the first j runes of s2.
	dist := make([]int, rows*cols)
	for i := 0; i < rows; i++ {
		dist[i*cols] = i // delete i runes to reach the empty prefix
	}
	for j := 0; j < cols; j++ {
		dist[j] = j // insert j runes starting from the empty prefix
	}

	for i := 1; i < rows; i++ {
		for j := 1; j < cols; j++ {
			cost := 1
			if r1[i-1] == r2[j-1] {
				cost = 0
			}

			// deletion
			best := dist[(i-1)*cols+j] + 1
			// insertion
			if ins := dist[i*cols+(j-1)] + 1; ins < best {
				best = ins
			}
			// match or substitution
			if sub := dist[(i-1)*cols+(j-1)] + cost; sub < best {
				best = sub
			}

			// Transposition of two adjacent runes. It needs two runes on
			// each side, i.e. i >= 2 and j >= 2; a stricter bound would miss
			// a swap of the very first two runes ("ab" vs "ba"). cost can
			// only be 0 here when all four runes are equal, in which case
			// two plain matches reach the same value.
			if i > 1 && j > 1 && r1[i-1] == r2[j-2] && r1[i-2] == r2[j-1] {
				if swap := dist[(i-2)*cols+(j-2)] + cost; swap < best {
					best = swap
				}
			}

			dist[i*cols+j] = best
		}
	}

	// The bottom-right cell holds the distance between the full strings.
	return dist[rows*cols-1]
}
================================================
FILE: osa_test.go
================================================
package matchr
import "testing"
// osatests lists string pairs (s1, s2) together with the distance (dist)
// that TestOSA expects OSA to return for them.
var osatests = []struct {
s1 string
s2 string
dist int
}{
// insertion
{"car", "cars", 1},
// substitution
{"library", "librari", 1},
// deletion
{"library", "librar", 1},
// transposition
{"library", "librayr", 1},
// one empty, left
{"", "library", 7},
// one empty, right
{"library", "", 7},
// two empties
{"", "", 0},
// multi-byte (non-ASCII) runes
{"Schüßler", "Schübler", 1},
{"Schüßler", "Schußler", 1},
{"Schüßler", "Schüßler", 0},
{"Schßüler", "Schüßler", 1},
{"Schüßler", "Schüler", 1},
{"Schüßler", "Schüßlers", 1},
// difference between DL and OSA. This is OSA, so it should be 3.
{"ca", "abc", 3},
}
// TestOSA runs OSA (Optimal String Alignment) over every entry of osatests
// and reports each pair whose distance differs from the expected one.
func TestOSA(t *testing.T) {
	for i := range osatests {
		tc := osatests[i]
		if got := OSA(tc.s1, tc.s2); got != tc.dist {
			t.Errorf("OSA('%s', '%s') = %v, want %v", tc.s1, tc.s2, got, tc.dist)
		}
	}
}
================================================
FILE: phonex.go
================================================
package matchr
// preProcess normalises a name before Phonex encodes it. It keeps only the
// upper-case letters A-Z, strips trailing S characters, simplifies a few
// leading letter pairs, drops a leading H and finally maps the leading letter
// onto a canonical one. The input slice is not modified.
func preProcess(input []rune) []rune {
	// 0. Keep only the upper-case letters A-Z.
	letters := runestring(make([]rune, 0, len(input)))
	for _, r := range input {
		if r >= 'A' && r <= 'Z' {
			letters = append(letters, r)
		}
	}

	// 1. Remove every trailing 'S'.
	for end := len(letters) - 1; end >= 0 && letters[end] == 'S'; end-- {
		letters.Del(end)
	}

	// 2. Simplify leading letter pairs: KN -> N, PH -> F, WR -> R.
	switch letters.SafeSubstr(0, 2) {
	case "KN", "WR":
		letters = letters[1:]
	case "PH":
		// The H is left in place; Phonex gives it no code.
		letters[0] = 'F'
	}

	// 3a. Remove a leading H.
	if letters.SafeAt(0) == 'H' {
		letters = letters[1:]
	}

	// 3b. Map the leading letter:
	//   E,I,O,U,Y -> A
	//   P -> B
	//   V -> F
	//   K,Q -> C
	//   J -> G
	//   Z -> S
	var replacement rune
	switch letters.SafeAt(0) {
	case 'E', 'I', 'O', 'U', 'Y':
		replacement = 'A'
	case 'P':
		replacement = 'B'
	case 'V':
		replacement = 'F'
	case 'K', 'Q':
		replacement = 'C'
	case 'J':
		replacement = 'G'
	case 'Z':
		replacement = 'S'
	}
	if replacement != 0 {
		letters[0] = replacement
	}

	return letters
}
// Phonex computes the Phonex phonetic encoding of the input string. Phonex is
// a modification of the venerable Soundex algorithm. It accounts for a few
// more letter combinations to improve accuracy on some data sets.
//
// This implementation is based off of the original C implementation by the
// creator - A. J. Lait - as found in his research paper entitled "An
// Assessment of Name Matching Algorithms."
//
// The result always has four characters: the (preprocessed) leading letter
// followed by digits, padded on the right with '0'. An input without any
// usable letter yields "0000".
func Phonex(s1 string) string {
	// preprocess
	s1 = cleanInput(s1)
	input := runestring(preProcess([]rune(s1)))

	result := make([]rune, 0, len(input))

	last := rune(0)
	// code deliberately survives from one letter to the next: the D/T, L and
	// R cases leave it untouched when their condition does not hold.
	code := rune(0)
	for i := 0; i < len(input) &&
		input[i] != ' ' &&
		input[i] != ',' &&
		len(result) < 4; i++ {
		// Capture the position and letter before the switch: the M/N case
		// may advance i, and the leading letter must still be emitted as a
		// letter rather than being replaced by its digit.
		first := i == 0
		letter := input[i]

		switch letter {
		case 'B', 'P', 'F', 'V':
			code = '1'
		case 'C', 'S', 'K', 'G', 'J', 'Q', 'X', 'Z':
			code = '2'
		case 'D', 'T':
			if input.SafeAt(i+1) != 'C' {
				code = '3'
			}
		case 'L':
			if isVowel(input.SafeAt(i+1)) || i == len(input)-1 {
				code = '4'
			}
		case 'M', 'N':
			nextChar := input.SafeAt(i + 1)
			if nextChar == 'D' || nextChar == 'G' {
				// ignore next character
				i++
			}
			code = '5'
		case 'R':
			if isVowel(input.SafeAt(i+1)) || i == len(input)-1 {
				code = '6'
			}
		default:
			code = 0
		}

		// special case for 1st character: we use the actual character, but
		// remember its code so an immediate repeat is suppressed
		if first {
			result = append(result, letter)
			last = code
			continue
		}

		if last != code && code != 0 {
			result = append(result, code)
		}
		last = result[len(result)-1]
	}

	// pad to four characters
	for len(result) < 4 {
		result = append(result, '0')
	}

	return string(result)
}
================================================
FILE: phonex_test.go
================================================
package matchr
import "testing"
// phonextests lists input names (s1) together with the code that TestPhonex
// expects Phonex to return for them (phonex).
//
// test cases from http://rosettacode.org/wiki/phonex#F.23
var phonextests = []struct {
s1 string
phonex string
}{
{"123 testsss", "T230"},
{"24/7 test", "T230"},
{"A", "A000"},
{"Lee", "L000"},
{"Kuhne", "C500"},
{"Meyer-Lansky", "M452"},
{"Oepping", "A150"},
{"Daley", "D400"},
{"Dalitz", "D432"},
{"Duhlitz", "D432"},
{"Dull", "D400"},
{"De Ledes", "D430"},
{"Sandemann", "S500"},
{"Schüßler", "S460"},
{"Schmidt", "S530"},
{"Sinatra", "S536"},
{"Heinrich", "A562"},
{"Hammerschlag", "A524"},
{"Williams", "W450"},
{"Wilms", "W500"},
{"Wilson", "W250"},
{"Worms", "W500"},
{"Zedlitz", "S343"},
{"Zotteldecke", "S320"},
{"ZYX test", "S232"},
{"Scherman", "S500"},
{"Schurman", "S500"},
{"Sherman", "S500"},
{"Shermansss", "S500"},
{"Shireman", "S650"},
{"Shurman", "S500"},
{"Euler", "A460"},
{"Ellery", "A460"},
{"Hilbert", "A130"},
{"Heilbronn", "A165"},
{"Gauss", "G000"},
{"Ghosh", "G200"},
{"Knuth", "N300"},
{"Kant", "C530"},
{"Lloyd", "L430"},
{"Ladd", "L300"},
{"Lukasiewicz", "L200"},
{"Lissajous", "L200"},
{"Ashcraft", "A261"},
{"Philip", "F410"},
{"Fripp", "F610"},
{"Czarkowska", "C200"},
{"Hornblower", "A514"},
{"Looser", "L260"},
{"Wright", "R230"},
{"Phonic", "F520"},
{"Quickening", "C250"},
{"Kuickening", "C250"},
{"Joben", "G150"},
{"Zelda", "S300"},
{"S", "0000"},
{"H", "0000"},
{"", "0000"},
}
// TestPhonex runs Phonex over every entry of phonextests and reports each
// input whose code differs from the expected one.
func TestPhonex(t *testing.T) {
	for i := range phonextests {
		tc := phonextests[i]
		if got := Phonex(tc.s1); got != tc.phonex {
			t.Errorf("Phonex('%s') = %v, want %v", tc.s1, got, tc.phonex)
		}
	}
}
================================================
FILE: runestring.go
================================================
package matchr
// runestring is a slice of runes with bounds-tolerant helpers, so callers can
// index, slice and delete by code point without checking lengths first.
type runestring []rune

// SafeAt is a safe way to index a runestring. It returns the rune at pos, or
// a null rune if pos is outside of the bounds of the runestring.
func (r *runestring) SafeAt(pos int) rune {
	if pos < 0 || pos >= len(*r) {
		return 0
	}
	return (*r)[pos]
}

// SafeSubstr is a safe way to obtain a substring of a runestring. It returns
// the length runes starting at pos as a string, or a null string ("") if pos
// or length is negative or the requested range does not fit entirely inside
// the runestring.
func (r *runestring) SafeSubstr(pos int, length int) string {
	if pos < 0 || length < 0 || pos+length > len(*r) {
		return ""
	}
	return string((*r)[pos : pos+length])
}

// Del deletes the characters at the given positions, one after the other.
// Positions outside the bounds of the runestring are ignored. Each deletion
// shifts the following characters left by one, so when several positions are
// given, later ones refer to the already shortened runestring.
func (r *runestring) Del(pos ...int) {
	for _, i := range pos {
		// i must be strictly less than the length: slicing from i+1 is only
		// valid while i+1 <= len.
		if i >= 0 && i < len(*r) {
			*r = append((*r)[:i], (*r)[i+1:]...)
		}
	}
}

// Contains reports whether the substring of the given length starting at
// start equals any of the criteria. An out-of-range substring is treated as
// "" (see SafeSubstr).
func (r *runestring) Contains(start int, length int, criteria ...string) bool {
	substring := r.SafeSubstr(start, length)
	for _, c := range criteria {
		if substring == c {
			return true
		}
	}
	return false
}
================================================
FILE: smithwaterman.go
================================================
package matchr
// GAP_COST is the amount SmithWaterman subtracts from a neighbouring cell's
// score when it extends an alignment with a gap.
const GAP_COST = float64(0.5)
// getCost scores the pairing of r1[r1Index] with r2[r2Index]: 1.0 when the
// two runes are equal and -2.0 when they differ. Both indexes must be in
// range.
func getCost(r1 []rune, r1Index int, r2 []rune, r2Index int) float64 {
	if r1[r1Index] != r2[r2Index] {
		return -2.0
	}
	return 1.0
}
// SmithWaterman computes the Smith-Waterman local sequence alignment for the
// two input strings. This was originally designed to find similar regions in
// strings representing DNA or protein sequences.
func SmithWaterman(s1 string, s2 string) float64 {
var cost float64
// index by code point, not byte
r1 := []rune(s1)
r2 := []rune(s2)
r1Len := len(r1)
r2Len := len(r2)
if r1Len == 0 {
return float64(r2Len)
}
if r2Len == 0 {
return float64(r1Len)
}
d := make([][]float64, r1Len)
for i := range d {
d[i] = make([]float64, r2Len)
}
var maxSoFar float64
for i := 0; i < r1Len; i++ {
// substitution cost
cost = getCost(r1, i, r2, 0)
if i == 0 {
d[0][0] = max(0.0, max(-GAP_COST, cost))
} else {
d[i][0] = max(0.0, max(d[i-1][0]-GAP_COST, cost))
}
// save if it is the biggest thus far
if d[i][0] > maxSoFar {
maxSoFar = d[i][0]
}
}
for j := 0; j < r2Len; j++ {
// substitution cost
cost = getCost(r1, 0, r2, j)
if j == 0 {
d[0][0] = max(0, max(-GAP_COST, cost))
} else {
d[0][j] = max(0, max(d[0][j-1]-GAP_COST, cost))
}
// save if it is the biggest thus far
if d[0][j] > maxSoFar {
maxSoFar = d[0][j]
}
}
for i := 1; i < r1Len; i++ {
for j := 1; j < r2Len; j++ {
cost = getCost(r1, i, r2, j)
// find the lowest cost
d[i][j] = max(
max(0, d[i-1][j]-GAP_COST),
max(d[i][j-1]-GAP_COST, d[i-1][j-1]+cost))
// save if it is the biggest thus far
if d[i][j] > maxSoFar {
maxSoFar = d[i][j]
}
}
}
return maxSoFar
}
================================================
FILE: smithwaterman_test.go
================================================
package matchr
import "testing"
// swtests lists string pairs (s1, s2) together with the value (dist) that
// TestSmithWaterman expects SmithWaterman to return for them.
var swtests = []struct {
s1 string
s2 string
dist float64
}{
// insertion
{"car", "cars", 3.0},
// substitution
{"library", "librari", 6.0},
// deletion
{"library", "librar", 6.0},
// transposition
{"library", "librayr", 5.5},
// one empty, left
{"", "library", 7.0},
// one empty, right
{"library", "", 7.0},
// two empties
{"", "", 0.0},
// multi-byte (non-ASCII) runes and longer names
{"Schüßler", "Schübler", 6.0},
{"Ant Zucaro", "Anthony Zucaro", 8.0},
{"Schüßler", "Schüßler", 8.0},
{"Schßüler", "Schüßler", 6.0},
{"Schüßler", "Schüler", 6.5},
{"Schüßler", "Schüßlers", 8.0},
}
// TestSmithWaterman runs SmithWaterman over every entry of swtests and
// reports each pair whose score differs from the expected one.
func TestSmithWaterman(t *testing.T) {
	for i := range swtests {
		tc := swtests[i]
		if got := SmithWaterman(tc.s1, tc.s2); got != tc.dist {
			t.Errorf("SmithWaterman('%s', '%s') = %v, want %v", tc.s1, tc.s2, got, tc.dist)
		}
	}
}
================================================
FILE: soundex.go
================================================
package matchr
import "strings"
// Soundex computes the Soundex phonetic representation of the input string. It
// attempts to encode homophones with the same characters. More information can
// be found at http://en.wikipedia.org/wiki/Soundex.
//
// The result is "" for an empty input. Otherwise it is the upper-cased first
// character of s1 followed by exactly three digits, padded with '0' when the
// name yields fewer than three codes. Characters are handled by code point,
// so a multi-byte first character still yields one character plus three
// digits.
func Soundex(s1 string) string {
	if len(s1) == 0 {
		return ""
	}

	// we should work with all uppercase, indexed by code point
	input := []rune(strings.ToUpper(s1))

	// the encoded value: first character plus up to three digits. Its length
	// is counted in runes, never in bytes.
	enc := make([]rune, 0, 4)
	enc = append(enc, input[0])

	var code, prev rune
	for i, r := range input {
		switch r {
		case 'B', 'F', 'P', 'V':
			code = '1'
		case 'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z':
			code = '2'
		case 'D', 'T':
			code = '3'
		case 'L':
			code = '4'
		case 'M', 'N':
			code = '5'
		case 'R':
			code = '6'
		case 'H', 'W':
			// H and W are transparent: the previous code is kept, so equal
			// codes separated only by H or W are emitted once.
		default:
			// Vowels and every other character reset the code, so equal
			// codes on both sides of them are emitted twice.
			code = 0
		}

		// don't encode the first position, but we need its code value
		// to prevent repeats
		if code != 0 && code != prev && i > 0 {
			enc = append(enc, code)

			// we're done when we reach four encoded characters
			if len(enc) == 4 {
				break
			}
		}

		prev = code
	}

	// if we've fallen short of 4 "real" encoded characters,
	// it gets padded with zeros
	for len(enc) < 4 {
		enc = append(enc, '0')
	}

	return string(enc)
}
================================================
FILE: soundex_test.go
================================================
package matchr
import "testing"
// soundextests lists input names (s1) together with the code that
// TestSoundex expects Soundex to return for them (soundex).
//
// test cases from http://rosettacode.org/wiki/Soundex#F.23
var soundextests = []struct {
s1 string
soundex string
}{
{"Ashcraft", "A261"},
{"Ashhhcraft", "A261"},
{"Ashcroft", "A261"},
{"Burroughs", "B620"},
{"Burrows", "B620"},
{"Ekzampul", "E251"},
{"Example", "E251"},
{"Ellery", "E460"},
{"Euler", "E460"},
{"Ghosh", "G200"},
{"Gauss", "G200"},
{"Gutierrez", "G362"},
{"Heilbronn", "H416"},
{"Hilbert", "H416"},
{"Jackson", "J250"},
{"Kant", "K530"},
{"Knuth", "K530"},
{"Lee", "L000"},
{"Lukasiewicz", "L222"},
{"Lissajous", "L222"},
{"Ladd", "L300"},
{"Lloyd", "L300"},
{"Moses", "M220"},
{"O'Hara", "O600"},
{"Pfister", "P236"},
{"Rubin", "R150"},
{"Robert", "R163"},
{"Rupert", "R163"},
{"Soundex", "S532"},
{"Sownteks", "S532"},
{"Tymczak", "T522"},
{"VanDeusen", "V532"},
{"Washington", "W252"},
{"Wheaton", "W350"},
}
// TestSoundex runs Soundex over every entry of soundextests and reports each
// input whose code differs from the expected one.
func TestSoundex(t *testing.T) {
	for i := range soundextests {
		tc := soundextests[i]
		if got := Soundex(tc.s1); got != tc.soundex {
			t.Errorf("Soundex('%s') = %v, want %v", tc.s1, got, tc.soundex)
		}
	}
}
================================================
FILE: utf8.go
================================================
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package matchr
import (
"errors"
"unicode/utf8"
)
// String wraps a regular string with a small structure that provides more
// efficient indexing by code point index, as opposed to byte index.
// Scanning incrementally forwards or backwards is O(1) per index operation
// (although not as fast a range clause going forwards). Random access is
// O(N) in the length of the string, but the overhead is less than always
// scanning from the beginning.
// If the string is ASCII, random access is O(1).
// Unlike the built-in string type, String has internal mutable state and
// is not thread-safe.
type String struct {
// str is the wrapped string, exactly as passed to Init.
str string
// numRunes is the number of runes in str, computed once by Init.
numRunes int
// If width > 0, the rune at runePos starts at bytePos and has the specified width.
// width is 0 only for an all-ASCII string (see IsASCII).
width int
bytePos int
runePos int
nonASCII int // byte index of the first non-ASCII rune.
}
// NewString returns a new UTF-8 string with the provided contents.
func NewString(contents string) *String {
return new(String).Init(contents)
}
// Init initializes an existing String to hold the provided contents.
// It returns a pointer to the initialized String.
func (s *String) Init(contents string) *String {
s.str = contents
s.bytePos = 0
s.runePos = 0
// Look for the first byte that starts a non-ASCII rune.
for i := 0; i < len(contents); i++ {
if contents[i] >= utf8.RuneSelf {
// Not ASCII.
s.numRunes = utf8.RuneCountInString(contents)
// width describes the rune at runePos 0, i.e. the first rune.
_, s.width = utf8.DecodeRuneInString(contents)
// Every rune before byte i is one byte wide, so i is also the
// rune index of the first non-ASCII rune.
s.nonASCII = i
return s
}
}
// ASCII is simple. Also, the empty string is ASCII.
s.numRunes = len(contents)
s.width = 0
s.nonASCII = len(contents)
return s
}
// String returns the contents of the String. This method also means the
// String is directly printable by fmt.Print.
func (s *String) String() string {
return s.str
}
// RuneCount returns the number of runes (Unicode code points) in the String.
func (s *String) RuneCount() int {
return s.numRunes
}
// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
func (s *String) IsASCII() bool {
return s.width == 0
}
// Slice returns the string sliced at rune positions [i:j].
// It panics if the positions are out of range or if i > j.
func (s *String) Slice(i, j int) string {
// ASCII is easy. Let the runtime bounds check catch the indexing error if there is one.
if j < s.nonASCII {
return s.str[i:j]
}
if i < 0 || j > s.numRunes || i > j {
panic(errors.New("utf8.String: slice index out of range"))
}
if i == j {
return ""
}
// For non-ASCII, after At(i), bytePos is always the position of the indexed character.
var low, high int
switch {
case i < s.nonASCII:
// Within the ASCII prefix, rune index and byte index coincide.
low = i
case i == s.numRunes:
low = len(s.str)
default:
s.At(i)
low = s.bytePos
}
switch {
case j == s.numRunes:
high = len(s.str)
default:
s.At(j)
high = s.bytePos
}
return s.str[low:high]
}
// At returns the rune with index i in the String, as an int. The sequence of
// runes is the same as iterating over the contents with a "for range" clause.
// It panics if i is out of range.
func (s *String) At(i int) int {
// ASCII is easy. Let the runtime bounds check catch the indexing error if there is one.
if i < s.nonASCII {
return int(s.str[i])
}
// Now we do need to know the index is valid.
if i < 0 || i >= s.numRunes {
panic(errors.New("utf8.String: index out of range"))
}
var r rune
// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
// With these cases, all scans from beginning or end work in O(1) time per rune.
switch {
case i == s.runePos-1: // backing up one rune
r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
s.runePos = i
s.bytePos -= s.width
return int(r)
case i == s.runePos+1: // moving ahead one rune
s.runePos = i
s.bytePos += s.width
// The cursor now sits on rune i; decode it in the next case.
fallthrough
case i == s.runePos:
r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
return int(r)
case i == 0: // start of string
r, s.width = utf8.DecodeRuneInString(s.str)
s.runePos = 0
s.bytePos = 0
return int(r)
case i == s.numRunes-1: // last rune in string
r, s.width = utf8.DecodeLastRuneInString(s.str)
s.runePos = i
s.bytePos = len(s.str) - s.width
return int(r)
}
// We need to do a linear scan. There are three places to start from:
// 1) The beginning
// 2) bytePos/runePos.
// 3) The end
// Choose the closest in rune count, scanning backwards if necessary.
forward := true
if i < s.runePos {
// Between beginning and pos. Which is closer?
// Since both i and runePos are guaranteed >= nonASCII, that's the
// lowest location we need to start from.
if i < (s.runePos-s.nonASCII)/2 {
// Scan forward from beginning
s.bytePos, s.runePos = s.nonASCII, s.nonASCII
} else {
// Scan backwards from where we are
forward = false
}
} else {
// Between pos and end. Which is closer?
if i-s.runePos < (s.numRunes-s.runePos)/2 {
// Scan forward from pos
} else {
// Scan backwards from end
s.bytePos, s.runePos = len(s.str), s.numRunes
forward = false
}
}
if forward {
// TODO: Is it much faster to use a range loop for this scan?
for {
// Decode before testing, so r and width describe rune i on exit.
r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
if s.runePos == i {
break
}
s.runePos++
s.bytePos += s.width
}
} else {
for {
// Step the cursor back onto the previous rune, then test.
r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
s.runePos--
s.bytePos -= s.width
if s.runePos == i {
break
}
}
}
return int(r)
}
// We want the panic in At(i) to satisfy os.Error, because that's what
// runtime panics satisfy, but we can't import os. This is our solution.
// error is the type of the error returned if a user calls String.At(i) with i out of range.
// It satisfies os.Error and runtime.Error.
// type error string
/*
func (err error) String() string {
return string(err)
}
func (err error) RunTimeError() {
}
*/
================================================
FILE: util.go
================================================
package matchr
import (
"math"
"strings"
)
// min returns the smaller of the two integers a and b.
func min(a int, b int) (res int) {
	if a < b {
		return a
	}
	return b
}
// maxI returns the larger of the two integers a and b.
func maxI(a int, b int) (res int) {
	if a < b {
		return b
	}
	return a
}
// max returns the larger of the two float64s a and b. When the comparison
// a < b is false (including when the values are equal), a is returned.
func max(a float64, b float64) (res float64) {
	if a < b {
		return b
	}
	return a
}
// nan ("not a number") reports whether c lies outside the ASCII digits
// '0' through '9'.
func nan(c rune) bool {
	return c < '0' || c > '9'
}
// round rounds x to prec decimal places, with halves rounded away from zero.
// NaN and infinities are returned unchanged.
//
// http://play.golang.org/p/S654PxAe_N
//
// (via Rory McGuire at
// https://groups.google.com/forum/#!topic/golang-nuts/ITZV08gAugI)
func round(x float64, prec int) float64 {
	if math.IsInf(x, 0) || math.IsNaN(x) {
		return x
	}

	// Work on the magnitude and restore the sign at the end, so that
	// negative values round symmetrically to positive ones.
	sign := 1.0
	if x < 0 {
		sign = -1
		x *= -1
	}

	scale := math.Pow(10, float64(prec))
	scaled := x * scale

	rounded := math.Floor(scaled)
	if _, frac := math.Modf(scaled); frac >= 0.5 {
		rounded = math.Ceil(scaled)
	}

	return rounded / scale * sign
}
// contains reports whether the substring of value that starts at rune index
// start and spans length runes equals any of the criteria. The substring is
// obtained with substring, so an out-of-range request is compared as "".
func contains(value *String, start int, length int, criteria ...string) bool {
	candidate := substring(value, start, length)
	for i := range criteria {
		if criteria[i] == candidate {
			return true
		}
	}
	return false
}
// substring is a fault-tolerant version of Slice. It returns the length runes
// of value starting at rune index start, or nothing ("") if start is negative
// or the range runs past the end of value. This allows substring-ing without
// having to bound check every time.
func substring(value *String, start int, length int) string {
	if start < 0 || start+length > value.RuneCount() {
		return ""
	}
	return value.Slice(start, start+length)
}
// isVowel reports whether c is one of the upper-case letters A, E, I, O, U
// or Y.
func isVowel(c rune) bool {
	return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U' || c == 'Y'
}
// isVowelNoY reports whether c is one of the upper-case letters A, E, I, O
// or U. Unlike isVowel, it does not count Y as a vowel.
func isVowelNoY(c rune) bool {
	return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U'
}
func cleanInput(input string) string {
return strings.ToUpper(strings.TrimSpace(input))
}
gitextract_yrde_zzu/ ├── COPYING.txt ├── README.md ├── damerau_levenshtein.go ├── damerau_levenshtein_test.go ├── go.mod ├── hamming.go ├── hamming_test.go ├── jarowinkler.go ├── jarowinkler_test.go ├── levenshtein.go ├── levenshtein_test.go ├── longestcommonsubsequence.go ├── longestcommonsubsequence_test.go ├── metaphone.go ├── metaphone_test.go ├── nysiis.go ├── nysiis_test.go ├── osa.go ├── osa_test.go ├── phonex.go ├── phonex_test.go ├── runestring.go ├── smithwaterman.go ├── smithwaterman_test.go ├── soundex.go ├── soundex_test.go ├── utf8.go └── util.go
SYMBOL INDEX (81 symbols across 25 files)
FILE: damerau_levenshtein.go
function DamerauLevenshtein (line 15) | func DamerauLevenshtein(s1 string, s2 string) (distance int) {
FILE: damerau_levenshtein_test.go
function TestDamerauLevenshtein (line 36) | func TestDamerauLevenshtein(t *testing.T) {
FILE: hamming.go
function Hamming (line 9) | func Hamming(s1 string, s2 string) (distance int, err error) {
FILE: hamming_test.go
function TestHamming (line 22) | func TestHamming(t *testing.T) {
FILE: jarowinkler.go
function jaroWinklerBase (line 3) | func jaroWinklerBase(s1 string, s2 string,
function Jaro (line 126) | func Jaro(r1 string, r2 string) (distance float64) {
function JaroWinkler (line 133) | func JaroWinkler(r1 string, r2 string, longTolerance bool) (distance flo...
FILE: jarowinkler_test.go
function TestJaro (line 21) | func TestJaro(t *testing.T) {
function TestJaroWinkler (line 44) | func TestJaroWinkler(t *testing.T) {
FILE: levenshtein.go
function Levenshtein (line 8) | func Levenshtein(s1 string, s2 string) (distance int) {
FILE: levenshtein_test.go
function TestLevenshtein (line 31) | func TestLevenshtein(t *testing.T) {
FILE: longestcommonsubsequence.go
function LongestCommonSubsequence (line 7) | func LongestCommonSubsequence(s1, s2 string) int {
FILE: longestcommonsubsequence_test.go
function TestLongestCommonSubsequence (line 26) | func TestLongestCommonSubsequence(t *testing.T) {
FILE: metaphone.go
type metaphoneresult (line 8) | type metaphoneresult struct
method add (line 29) | func (r *metaphoneresult) add(c1 string, c2 string) {
method isComplete (line 41) | func (r *metaphoneresult) isComplete() bool {
method result (line 45) | func (r *metaphoneresult) result() (primary string, alternate string) {
function newMetaphoneresult (line 24) | func newMetaphoneresult(maxLength int, calcAlternate bool) (r *metaphone...
function isSlavoGermanic (line 58) | func isSlavoGermanic(value string) bool {
function isSilentStart (line 63) | func isSilentStart(input runestring) bool {
function handleVowel (line 77) | func handleVowel(result *metaphoneresult, index int) int {
function handleC (line 88) | func handleC(input runestring, result *metaphoneresult, index int) int {
function handleCC (line 143) | func handleCC(input runestring, result *metaphoneresult, index int) int {
function handleCH (line 160) | func handleCH(input runestring, result *metaphoneresult, index int) int {
function handleD (line 185) | func handleD(input runestring, result *metaphoneresult, index int) int {
function handleG (line 204) | func handleG(input runestring, result *metaphoneresult, index int, slavo...
function handleGH (line 252) | func handleGH(input runestring, result *metaphoneresult, index int) int {
function handleH (line 279) | func handleH(input runestring, result *metaphoneresult, index int) int {
function handleJ (line 290) | func handleJ(input runestring, result *metaphoneresult, index int, slavo...
function handleL (line 322) | func handleL(input runestring, result *metaphoneresult, index int) int {
function handleP (line 337) | func handleP(input runestring, result *metaphoneresult, index int) int {
function handleR (line 352) | func handleR(input runestring, result *metaphoneresult, index int, slavo...
function handleS (line 369) | func handleS(input runestring, result *metaphoneresult, index int, slavo...
function handleSC (line 417) | func handleSC(input runestring, result *metaphoneresult, index int) int {
function handleT (line 442) | func handleT(input runestring, result *metaphoneresult, index int) int {
function handleW (line 469) | func handleW(input runestring, result *metaphoneresult, index int) int {
function handleX (line 497) | func handleX(input runestring, result *metaphoneresult, index int) int {
function handleZ (line 517) | func handleZ(input runestring, result *metaphoneresult, index int, slavo...
function conditionC0 (line 540) | func conditionC0(input runestring, index int) bool {
function conditionCH0 (line 557) | func conditionCH0(input runestring, index int) bool {
function conditionCH1 (line 570) | func conditionCH1(input runestring, index int) bool {
function conditionL0 (line 580) | func conditionL0(input runestring, index int) bool {
function conditionM0 (line 593) | func conditionM0(input runestring, index int) bool {
function DoubleMetaphone (line 610) | func DoubleMetaphone(s1 string) (string, string) {
FILE: metaphone_test.go
function TestDoubleMetaphone (line 11) | func TestDoubleMetaphone(t *testing.T) {
FILE: nysiis.go
function NYSIIS (line 5) | func NYSIIS(s1 string) string {
FILE: nysiis_test.go
function TestNYIIS (line 48) | func TestNYIIS(t *testing.T) {
FILE: osa.go
function OSA (line 9) | func OSA(s1 string, s2 string) (distance int) {
FILE: osa_test.go
function TestOSA (line 36) | func TestOSA(t *testing.T) {
FILE: phonex.go
function preProcess (line 3) | func preProcess(input []rune) []rune {
function Phonex (line 67) | func Phonex(s1 string) string {
FILE: phonex_test.go
function TestPhonex (line 71) | func TestPhonex(t *testing.T) {
FILE: runestring.go
type runestring (line 3) | type runestring
method SafeAt (line 7) | func (r *runestring) SafeAt(pos int) rune {
method SafeSubstr (line 17) | func (r *runestring) SafeSubstr(pos int, length int) string {
method Del (line 27) | func (r *runestring) Del(pos ...int) {
method Contains (line 36) | func (r *runestring) Contains(start int, length int, criteria ...strin...
FILE: smithwaterman.go
constant GAP_COST (line 3) | GAP_COST = float64(0.5)
function getCost (line 5) | func getCost(r1 []rune, r1Index int, r2 []rune, r2Index int) float64 {
function SmithWaterman (line 16) | func SmithWaterman(s1 string, s2 string) float64 {
FILE: smithwaterman_test.go
function TestSmithWaterman (line 34) | func TestSmithWaterman(t *testing.T) {
FILE: soundex.go
function Soundex (line 8) | func Soundex(s1 string) string {
FILE: soundex_test.go
function TestSoundex (line 47) | func TestSoundex(t *testing.T) {
FILE: utf8.go
type String (line 21) | type String struct
method Init (line 38) | func (s *String) Init(contents string) *String {
method String (line 60) | func (s *String) String() string {
method RuneCount (line 65) | func (s *String) RuneCount() int {
method IsASCII (line 70) | func (s *String) IsASCII() bool {
method Slice (line 75) | func (s *String) Slice(i, j int) string {
method At (line 109) | func (s *String) At(i int) int {
function NewString (line 32) | func NewString(contents string) *String {
FILE: util.go
function min (line 9) | func min(a int, b int) (res int) {
function maxI (line 20) | func maxI(a int, b int) (res int) {
function max (line 31) | func max(a float64, b float64) (res float64) {
function nan (line 42) | func nan(c rune) bool {
function round (line 52) | func round(x float64, prec int) float64 {
function contains (line 78) | func contains(value *String, start int, length int, criteria ...string) ...
function substring (line 91) | func substring(value *String, start int, length int) string {
function isVowel (line 99) | func isVowel(c rune) bool {
function isVowelNoY (line 108) | func isVowelNoY(c rune) bool {
function cleanInput (line 117) | func cleanInput(input string) string {
Condensed preview — 28 files, each shown with its path, character count, and a content snippet. Download the .json file, or copy it, to get the full structured content (67K chars).
[
{
"path": "COPYING.txt",
"chars": 854,
"preview": "Matchr: an approximate string matching library for the Go programming language\n\nCopyright (C) 2013-2014 Ant Zucaro\n\nThis"
},
{
"path": "README.md",
"chars": 737,
"preview": "# matchr\n\n[](https://pkg.go.dev/github.com/antz"
},
{
"path": "damerau_levenshtein.go",
"chars": 2819,
"preview": "package matchr\n\n// DamerauLevenshtein computes the Damerau-Levenshtein distance between two\n// strings. The returned val"
},
{
"path": "damerau_levenshtein_test.go",
"chars": 933,
"preview": "package matchr\n\nimport \"testing\"\n\nvar damlevtests = []struct {\n\ts1 string\n\ts2 string\n\tdist int\n}{\n\t// insertion\n\t{\"c"
},
{
"path": "go.mod",
"chars": 44,
"preview": "module github.com/antzucaro/matchr\n\ngo 1.13\n"
},
{
"path": "hamming.go",
"chars": 639,
"preview": "package matchr\n\nimport \"errors\"\n\n// Hamming computes the Hamming distance between two equal-length strings.\n// This is t"
},
{
"path": "hamming_test.go",
"chars": 678,
"preview": "package matchr\n\nimport \"testing\"\n\nvar hamtests = []struct {\n\ts1 string\n\ts2 string\n\tdist int\n\terr bool\n}{\n\t{\"\", \"\", "
},
{
"path": "jarowinkler.go",
"chars": 3040,
"preview": "package matchr\n\nfunc jaroWinklerBase(s1 string, s2 string,\n\tlongTolerance bool, winklerize bool) (distance float64) {\n\n\t"
},
{
"path": "jarowinkler_test.go",
"chars": 1129,
"preview": "package matchr\n\nimport \"testing\"\n\nvar jarotests = []struct {\n\ts1 string\n\ts2 string\n\tdist float64\n}{\n\t{\"\", \"cars\", 0."
},
{
"path": "levenshtein.go",
"chars": 1014,
"preview": "package matchr\n\n// Levenshtein computes the Levenshtein distance between two\n// strings. The returned value - distance -"
},
{
"path": "levenshtein_test.go",
"chars": 745,
"preview": "package matchr\n\nimport \"testing\"\n\nvar levtests = []struct {\n\ts1 string\n\ts2 string\n\tdist int\n}{\n\t// insertion\n\t{\"car\""
},
{
"path": "longestcommonsubsequence.go",
"chars": 708,
"preview": "package matchr\n\n// LongestCommonSubsequence computes the longest substring\n// between two strings. The returned value is"
},
{
"path": "longestcommonsubsequence_test.go",
"chars": 656,
"preview": "package matchr\n\nimport \"testing\"\n\nvar lcstests = []struct {\n\ts1 string\n\ts2 string\n\tlength int\n}{\n\t// match begin"
},
{
"path": "metaphone.go",
"chars": 17900,
"preview": "package matchr\n\nimport (\n\t\"bytes\"\n\t\"strings\"\n)\n\ntype metaphoneresult struct {\n\t// the maximum number of code values to c"
},
{
"path": "metaphone_test.go",
"chars": 850,
"preview": "package matchr\n\nimport (\n\t\"bufio\"\n\t\"compress/gzip\"\n\t\"os\"\n\t\"strings\"\n\t\"testing\"\n)\n\nfunc TestDoubleMetaphone(t *testing.T)"
},
{
"path": "nysiis.go",
"chars": 3190,
"preview": "package matchr\n\n// NYSIIS computes the NYSIIS phonetic encoding of the input string. It is a\n// modification of the trad"
},
{
"path": "nysiis_test.go",
"chars": 1121,
"preview": "package matchr\n\nimport \"testing\"\n\nvar nysiistests = []struct {\n\ts1 string\n\tnysiis string\n}{\n\t{\"knight\", \"NAGT\"},\n\t{\""
},
{
"path": "osa.go",
"chars": 1265,
"preview": "package matchr\n\n// OSA computes the Optimal String Alignment distance between two\n// strings. The returned value - dista"
},
{
"path": "osa_test.go",
"chars": 894,
"preview": "package matchr\n\nimport \"testing\"\n\nvar osatests = []struct {\n\ts1 string\n\ts2 string\n\tdist int\n}{\n\t// insertion\n\t{\"car\""
},
{
"path": "phonex.go",
"chars": 2788,
"preview": "package matchr\n\nfunc preProcess(input []rune) []rune {\n\toutput := runestring(make([]rune, 0, len(input)))\n\n\t// 0. Remove"
},
{
"path": "phonex_test.go",
"chars": 1625,
"preview": "package matchr\n\nimport \"testing\"\n\n// test cases from http://rosettacode.org/wiki/phonex#F.23\nvar phonextests = []struct "
},
{
"path": "runestring.go",
"chars": 1167,
"preview": "package matchr\n\ntype runestring []rune\n\n// A safe way to index a runestring. It will return a null rune if you try\n// to"
},
{
"path": "smithwaterman.go",
"chars": 1729,
"preview": "package matchr\n\nconst GAP_COST = float64(0.5)\n\nfunc getCost(r1 []rune, r1Index int, r2 []rune, r2Index int) float64 {\n\ti"
},
{
"path": "smithwaterman_test.go",
"chars": 858,
"preview": "package matchr\n\nimport \"testing\"\n\nvar swtests = []struct {\n\ts1 string\n\ts2 string\n\tdist float64\n}{\n\t// insertion\n\t{\"c"
},
{
"path": "soundex.go",
"chars": 1438,
"preview": "package matchr\n\nimport \"strings\"\n\n// Soundex computes the Soundex phonetic representation of the input string. It\n// att"
},
{
"path": "soundex_test.go",
"chars": 1122,
"preview": "package matchr\n\nimport \"testing\"\n\n// test cases from http://rosettacode.org/wiki/Soundex#F.23\nvar soundextests = []struc"
},
{
"path": "utf8.go",
"chars": 5849,
"preview": "// Copyright 2009 The Go Authors. All rights reserved.\n// Use of this source code is governed by a BSD-style\n// license "
},
{
"path": "util.go",
"chars": 2073,
"preview": "package matchr\n\nimport (\n\t\"math\"\n\t\"strings\"\n)\n\n// min of two integers\nfunc min(a int, b int) (res int) {\n\tif a < b {\n\t\tr"
}
]
About this extraction
This page contains the full source code of the antzucaro/matchr GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 28 files (56.5 KB), approximately 20.5k tokens, and a symbol index with 81 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.