[
  {
    "path": "COPYING.txt",
    "content": "Matchr: an approximate string matching library for the Go programming language\n\nCopyright (C) 2013-2014 Ant Zucaro\n\nThis program is free software; you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation; either version 2 of the License, or\n(at your option) any later version.\n\nThis program is distributed in the hope that it will be useful,\nbut WITHOUT ANY WARRANTY; without even the implied warranty of\nMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\nGNU General Public License for more details.\n\nYou should have received a copy of the GNU General Public License\nalong with this program; if not, write to the Free Software\nFoundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA\n\nYou can contact Ant Zucaro at azucaro at gmail dot com.\n"
  },
  {
    "path": "README.md",
    "content": "# matchr\n\n[![Go Reference](https://pkg.go.dev/badge/github.com/antzucaro/matchr.svg)](https://pkg.go.dev/github.com/antzucaro/matchr)\n\nAn approximate string matching library for the [Go programming language](http://www.golang.org).\n\n## Rationale\n\nData used in record linkage can often be of dubious quality. Typographical \nerrors or changing data elements (to name a few things) make establishing similarity between two sets of data \ndifficult. Rather than use exact string comparison in such situations, it is\nvital to have a means to identify how similar two strings are. Similarity functions can cater\nto certain data sets in order to make better matching decisions. The matchr library provides\nseveral of these similarity functions.\n"
  },
  {
    "path": "damerau_levenshtein.go",
    "content": "package matchr\n\n// DamerauLevenshtein computes the Damerau-Levenshtein distance between two\n// strings. The returned value - distance - is the number of insertions,\n// deletions, substitutions, and transpositions it takes to transform one\n// string (s1) into another (s2). Each step in the transformation \"costs\"\n// one distance point. It is similar to the Optimal String Alignment\n// algorithm, but is more complex because it allows multiple edits on\n// substrings.\n//\n// This implementation is based off of the one found on Wikipedia at\n// http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions\n// as well as KevinStern's Java implementation found at\n// https://github.com/KevinStern/software-and-algorithms.\nfunc DamerauLevenshtein(s1 string, s2 string) (distance int) {\n\t// index by code point, not byte\n\tr1 := []rune(s1)\n\tr2 := []rune(s2)\n\n\t// the maximum possible distance\n\tinf := len(r1) + len(r2)\n\n\t// if one string is blank, we need insertions\n\t// for all characters in the other one\n\tif len(r1) == 0 {\n\t\treturn len(r2)\n\t}\n\n\tif len(r2) == 0 {\n\t\treturn len(r1)\n\t}\n\n\t// construct the edit-tracking matrix\n\tmatrix := make([][]int, len(r1))\n\tfor i := range matrix {\n\t\tmatrix[i] = make([]int, len(r2))\n\t}\n\n\t// seen characters\n\tseenRunes := make(map[rune]int)\n\n\tif r1[0] != r2[0] {\n\t\tmatrix[0][0] = 1\n\t}\n\n\tseenRunes[r1[0]] = 0\n\tfor i := 1; i < len(r1); i++ {\n\t\tdeleteDist := matrix[i-1][0] + 1\n\t\tinsertDist := (i+1)*1 + 1\n\t\tvar matchDist int\n\t\tif r1[i] == r2[0] {\n\t\t\tmatchDist = i\n\t\t} else {\n\t\t\tmatchDist = i + 1\n\t\t}\n\t\tmatrix[i][0] = min(min(deleteDist, insertDist), matchDist)\n\t}\n\n\tfor j := 1; j < len(r2); j++ {\n\t\tdeleteDist := (j + 1) * 2\n\t\tinsertDist := matrix[0][j-1] + 1\n\t\tvar matchDist int\n\t\tif r1[0] == r2[j] {\n\t\t\tmatchDist = j\n\t\t} else {\n\t\t\tmatchDist = j + 1\n\t\t}\n\n\t\tmatrix[0][j] 
= min(min(deleteDist, insertDist), matchDist)\n\t}\n\n\tfor i := 1; i < len(r1); i++ {\n\t\tvar maxSrcMatchIndex int\n\t\tif r1[i] == r2[0] {\n\t\t\tmaxSrcMatchIndex = 0\n\t\t} else {\n\t\t\tmaxSrcMatchIndex = -1\n\t\t}\n\n\t\tfor j := 1; j < len(r2); j++ {\n\t\t\tswapIndex, ok := seenRunes[r2[j]]\n\t\t\tjSwap := maxSrcMatchIndex\n\t\t\tdeleteDist := matrix[i-1][j] + 1\n\t\t\tinsertDist := matrix[i][j-1] + 1\n\t\t\tmatchDist := matrix[i-1][j-1]\n\t\t\tif r1[i] != r2[j] {\n\t\t\t\tmatchDist += 1\n\t\t\t} else {\n\t\t\t\tmaxSrcMatchIndex = j\n\t\t\t}\n\n\t\t\t// for transpositions\n\t\t\tvar swapDist int\n\t\t\tif ok && jSwap != -1 {\n\t\t\t\tiSwap := swapIndex\n\t\t\t\tvar preSwapCost int\n\t\t\t\tif iSwap == 0 && jSwap == 0 {\n\t\t\t\t\tpreSwapCost = 0\n\t\t\t\t} else {\n\t\t\t\t\tpreSwapCost = matrix[maxI(0, iSwap-1)][maxI(0, jSwap-1)]\n\t\t\t\t}\n\t\t\t\tswapDist = i + j + preSwapCost - iSwap - jSwap - 1\n\t\t\t} else {\n\t\t\t\tswapDist = inf\n\t\t\t}\n\t\t\tmatrix[i][j] = min(min(min(deleteDist, insertDist), matchDist), swapDist)\n\t\t}\n\t\tseenRunes[r1[i]] = i\n\t}\n\n\treturn matrix[len(r1)-1][len(r2)-1]\n}\n"
  },
  {
    "path": "damerau_levenshtein_test.go",
    "content": "package matchr\n\nimport \"testing\"\n\nvar damlevtests = []struct {\n\ts1   string\n\ts2   string\n\tdist int\n}{\n\t// insertion\n\t{\"car\", \"cars\", 1},\n\t// substitution\n\t{\"library\", \"librari\", 1},\n\t// deletion\n\t{\"library\", \"librar\", 1},\n\t// transposition\n\t{\"library\", \"librayr\", 1},\n\t// one empty, left\n\t{\"\", \"library\", 7},\n\t// one empty, right\n\t{\"library\", \"\", 7},\n\t// two empties\n\t{\"\", \"\", 0},\n\t// unicode stuff!\n\t{\"Schüßler\", \"Schübler\", 1},\n\t{\"Schüßler\", \"Schußler\", 1},\n\t{\"Schüßler\", \"Schüßler\", 0},\n\t{\"Schßüler\", \"Schüßler\", 1},\n\t{\"Schüßler\", \"Schüler\", 1},\n\t{\"Schüßler\", \"Schüßlers\", 1},\n\t// difference between DL and OSA. This is DL, so it should be 2.\n\t{\"ca\", \"abc\", 2},\n}\n\n// Damerau-Levenshtein\nfunc TestDamerauLevenshtein(t *testing.T) {\n\tfor _, tt := range damlevtests {\n\t\tdist := DamerauLevenshtein(tt.s1, tt.s2)\n\t\tif dist != tt.dist {\n\t\t\tt.Errorf(\"DamerauLevenshtein('%s', '%s') = %v, want %v\", tt.s1, tt.s2, dist, tt.dist)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "go.mod",
    "content": "module github.com/antzucaro/matchr\n\ngo 1.13\n"
  },
  {
    "path": "hamming.go",
    "content": "package matchr\n\nimport \"errors\"\n\n// Hamming computes the Hamming distance between two equal-length strings.\n// This is the number of times the two strings differ between characters at\n// the same index. This implementation is based off of the algorithm\n// description found at http://en.wikipedia.org/wiki/Hamming_distance.\nfunc Hamming(s1 string, s2 string) (distance int, err error) {\n\t// index by code point, not byte\n\tr1 := []rune(s1)\n\tr2 := []rune(s2)\n\n\tif len(r1) != len(r2) {\n\t\terr = errors.New(\"Hamming distance of different sized strings.\")\n\t\treturn\n\t}\n\n\tfor i, v := range r1 {\n\t\tif r2[i] != v {\n\t\t\tdistance += 1\n\t\t}\n\t}\n\treturn\n}\n"
  },
  {
    "path": "hamming_test.go",
    "content": "package matchr\n\nimport \"testing\"\n\nvar hamtests = []struct {\n\ts1   string\n\ts2   string\n\tdist int\n\terr  bool\n}{\n\t{\"\", \"\", 0, false},\n\t{\"cat\", \"cat\", 0, false},\n\t{\"car\", \"cat\", 1, false},\n\t{\"tar\", \"car\", 1, false},\n\t{\"xyz\", \"zyx\", 2, false},\n\t{\"wxyz\", \"zyx\", 0, true},\n\t{\"Schüßler\", \"Schübler\", 1, false},\n\t{\"Schüßler\", \"Schußler\", 1, false},\n}\n\n// Hamming Distance\nfunc TestHamming(t *testing.T) {\n\tfor _, tt := range hamtests {\n\t\tdist, err := Hamming(tt.s1, tt.s2)\n\t\tif dist != tt.dist {\n\t\t\tt.Errorf(\"Hamming('%s', '%s') = %v, want %v\", tt.s1, tt.s2, dist, tt.dist)\n\t\t}\n\n\t\tif tt.err && err == nil {\n\t\t\tt.Errorf(\"Hamming('%s', '%s') should throw an error\", tt.s1, tt.s2)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "jarowinkler.go",
    "content": "package matchr\n\nfunc jaroWinklerBase(s1 string, s2 string,\n\tlongTolerance bool, winklerize bool) (distance float64) {\n\n\t// index by code point, not byte\n\tr1 := []rune(s1)\n\tr2 := []rune(s2)\n\n\tr1Length := len(r1)\n\tr2Length := len(r2)\n\n\tif r1Length == 0 || r2Length == 0 {\n\t\treturn\n\t}\n\n\tminLength := 0\n\tif r1Length > r2Length {\n\t\tminLength = r1Length\n\t} else {\n\t\tminLength = r2Length\n\t}\n\n\tsearchRange := minLength\n\tsearchRange = (searchRange / 2) - 1\n\tif searchRange < 0 {\n\t\tsearchRange = 0\n\t}\n\tvar lowLim, hiLim, transCount, commonChars int\n\tvar i, j, k int\n\n\tr1Flag := make([]bool, r1Length+1)\n\tr2Flag := make([]bool, r2Length+1)\n\n\t// find the common chars within the acceptable range\n\tcommonChars = 0\n\tfor i, _ = range r1 {\n\t\tif i >= searchRange {\n\t\t\tlowLim = i - searchRange\n\t\t} else {\n\t\t\tlowLim = 0\n\t\t}\n\n\t\tif (i + searchRange) <= (r2Length - 1) {\n\t\t\thiLim = i + searchRange\n\t\t} else {\n\t\t\thiLim = r2Length - 1\n\t\t}\n\n\t\tfor j := lowLim; j <= hiLim; j++ {\n\t\t\tif !r2Flag[j] && r2[j] == r1[i] {\n\t\t\t\tr2Flag[j] = true\n\t\t\t\tr1Flag[i] = true\n\t\t\t\tcommonChars++\n\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\t}\n\n\t// if we have nothing in common at this point, nothing else can be done\n\tif commonChars == 0 {\n\t\treturn\n\t}\n\n\t// otherwise we count the transpositions\n\tk = 0\n\ttransCount = 0\n\tfor i, _ := range r1 {\n\t\tif r1Flag[i] {\n\t\t\tfor j = k; j < r2Length; j++ {\n\t\t\t\tif r2Flag[j] {\n\t\t\t\t\tk = j + 1\n\t\t\t\t\tbreak\n\t\t\t\t}\n\t\t\t}\n\t\t\tif r1[i] != r2[j] {\n\t\t\t\ttransCount++\n\t\t\t}\n\t\t}\n\t}\n\ttransCount /= 2\n\n\t// adjust for similarities in nonmatched characters\n\tdistance = float64(commonChars)/float64(r1Length) +\n\t\tfloat64(commonChars)/float64(r2Length) +\n\t\t(float64(commonChars-transCount))/float64(commonChars)\n\tdistance /= 3.0\n\n\t// give more weight to already-similar strings\n\tif winklerize && distance > 0.7 
{\n\n\t\t// the first 4 characters in common\n\t\tif minLength >= 4 {\n\t\t\tj = 4\n\t\t} else {\n\t\t\tj = minLength\n\t\t}\n\n\t\tfor i = 0; i < j && len(r1) > i && len(r2) > i && r1[i] == r2[i] && nan(r1[i]); i++ {\n\t\t}\n\n\t\tif i > 0 {\n\t\t\tdistance += float64(i) * 0.1 * (1.0 - distance)\n\t\t}\n\n\t\tif longTolerance && (minLength > 4) && (commonChars > i+1) &&\n\t\t\t(2*commonChars >= minLength+i) {\n\t\t\tif nan(r1[0]) {\n\t\t\t\tdistance += (1.0 - distance) * (float64(commonChars-i-1) /\n\t\t\t\t\t(float64(r1Length) + float64(r2Length) - float64(i*2) + 2))\n\t\t\t}\n\t\t}\n\t}\n\n\treturn\n}\n\n// Jaro computes the Jaro edit distance between two strings. It represents\n// this with a float64 between 0 and 1 inclusive, with 0 indicating the two\n// strings are not at all similar and 1 indicating the two strings are exact\n// matches.\n//\n// See http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance for a\n// full description.\nfunc Jaro(r1 string, r2 string) (distance float64) {\n\treturn jaroWinklerBase(r1, r2, false, false)\n}\n\n// JaroWinkler computes the Jaro-Winkler edit distance between two strings.\n// This is a modification of the Jaro algorithm that gives additional weight\n// to prefix matches.\nfunc JaroWinkler(r1 string, r2 string, longTolerance bool) (distance float64) {\n\treturn jaroWinklerBase(r1, r2, longTolerance, true)\n}\n"
  },
  {
    "path": "jarowinkler_test.go",
    "content": "package matchr\n\nimport \"testing\"\n\nvar jarotests = []struct {\n\ts1   string\n\ts2   string\n\tdist float64\n}{\n\t{\"\", \"cars\", 0.0},\n\t{\"cars\", \"\", 0.0},\n\t{\"car\", \"cars\", 0.9166666666666666},\n\t{\"dixon\", \"dicksonx\", 0.7666666666666666},\n\t{\"martha\", \"marhta\", 0.9444444444444445},\n\t{\"dwayne\", \"duane\", 0.8222222222222223},\n\t{\"martüa\", \"marüta\", 0.9444444444444445},\n\t{\"dr\", \"driveway\", 0.75},\n}\n\n// Regular Jaro distance\nfunc TestJaro(t *testing.T) {\n\tfor _, tt := range jarotests {\n\t\tdist := Jaro(tt.s1, tt.s2)\n\t\tif dist != tt.dist {\n\t\t\tt.Errorf(\"Jaro('%s', '%s') = %v, want %v\", tt.s1, tt.s2, dist, tt.dist)\n\t\t}\n\t}\n}\n\nvar jarowtests = []struct {\n\ts1   string\n\ts2   string\n\tdist float64\n}{\n\t{\"\", \"cars\", 0.0},\n\t{\"cars\", \"\", 0.0},\n\t{\"dixon\", \"dicksonx\", 0.8133333333333332},\n\t{\"martha\", \"marhta\", 0.9611111111111111},\n\t{\"dwayne\", \"duane\", 0.8400000000000001},\n\t{\"dr\", \"driveway\", 0.8},\n}\n\n// Jaro-Winkler distance\nfunc TestJaroWinkler(t *testing.T) {\n\tfor _, tt := range jarowtests {\n\t\tdist := JaroWinkler(tt.s1, tt.s2, false)\n\t\tif dist != tt.dist {\n\t\t\tt.Errorf(\"JaroWinkler('%s', '%s') = %v, want %v\", tt.s1, tt.s2, dist, tt.dist)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "levenshtein.go",
    "content": "package matchr\n\n// Levenshtein computes the Levenshtein distance between two\n// strings. The returned value - distance - is the number of insertions,\n// deletions, and substitutions it takes to transform one\n// string (s1) into another (s2). Each step in the transformation \"costs\"\n// one distance point.\nfunc Levenshtein(s1 string, s2 string) (distance int) {\n\t// index by code point, not byte\n\tr1 := []rune(s1)\n\tr2 := []rune(s2)\n\n\trows := len(r1) + 1\n\tcols := len(r2) + 1\n\n\tvar d1 int\n\tvar d2 int\n\tvar d3 int\n\tvar i int\n\tvar j int\n\tdist := make([]int, rows*cols)\n\n\tfor i = 0; i < rows; i++ {\n\t\tdist[i*cols] = i\n\t}\n\n\tfor j = 0; j < cols; j++ {\n\t\tdist[j] = j\n\t}\n\n\tfor j = 1; j < cols; j++ {\n\t\tfor i = 1; i < rows; i++ {\n\t\t\tif r1[i-1] == r2[j-1] {\n\t\t\t\tdist[(i*cols)+j] = dist[((i-1)*cols)+(j-1)]\n\t\t\t} else {\n\t\t\t\td1 = dist[((i-1)*cols)+j] + 1\n\t\t\t\td2 = dist[(i*cols)+(j-1)] + 1\n\t\t\t\td3 = dist[((i-1)*cols)+(j-1)] + 1\n\n\t\t\t\tdist[(i*cols)+j] = min(d1, min(d2, d3))\n\t\t\t}\n\t\t}\n\t}\n\n\tdistance = dist[(cols*rows)-1]\n\n\treturn\n}\n"
  },
  {
    "path": "levenshtein_test.go",
    "content": "package matchr\n\nimport \"testing\"\n\nvar levtests = []struct {\n\ts1   string\n\ts2   string\n\tdist int\n}{\n\t// insertion\n\t{\"car\", \"cars\", 1},\n\t// substitution\n\t{\"library\", \"librari\", 1},\n\t// deletion\n\t{\"library\", \"librar\", 1},\n\t// one empty, left\n\t{\"\", \"library\", 7},\n\t// one empty, right\n\t{\"library\", \"\", 7},\n\t// two empties\n\t{\"\", \"\", 0},\n\t// unicode stuff!\n\t{\"Schüßler\", \"Schübler\", 1},\n\t{\"Schüßler\", \"Schußler\", 1},\n\t{\"Schüßler\", \"Schüßler\", 0},\n\t{\"Schüßler\", \"Schüler\", 1},\n\t{\"Schüßler\", \"Schüßlers\", 1},\n}\n\n// Regular Levenshtein\nfunc TestLevenshtein(t *testing.T) {\n\tfor _, tt := range levtests {\n\t\tdist := Levenshtein(tt.s1, tt.s2)\n\t\tif dist != tt.dist {\n\t\t\tt.Errorf(\"Levenshtein('%s', '%s') = %v, want %v\", tt.s1, tt.s2, dist, tt.dist)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "longestcommonsubsequence.go",
    "content": "package matchr\n\n// LongestCommonSubsequence computes the length of the longest common\n// subsequence of two strings. A common subsequence consists of letters\n// that appear in both strings in the same relative order, though not\n// necessarily next to each other (unlike a substring).\nfunc LongestCommonSubsequence(s1, s2 string) int {\n\tr1 := []rune(s1)\n\tr2 := []rune(s2)\n\ttable := make([][]int, len(s1)+1)\n\n\t// Construct 2D table\n\tfor i := range table {\n\t\ttable[i] = make([]int, len(s2)+1)\n\t}\n\n\tvar i int\n\tvar j int\n\n\tfor i = len(r1) - 1; i >= 0; i-- {\n\t\tfor j = len(r2) - 1; j >= 0; j-- {\n\t\t\tif r1[i] == r2[j] {\n\t\t\t\ttable[i][j] = 1 + table[i+1][j+1]\n\t\t\t} else {\n\t\t\t\ttable[i][j] = maxI(table[i+1][j], table[i][j+1])\n\t\t\t}\n\t\t}\n\t}\n\treturn table[0][0]\n}\n"
  },
  {
    "path": "longestcommonsubsequence_test.go",
    "content": "package matchr\n\nimport \"testing\"\n\nvar lcstests = []struct {\n\ts1     string\n\ts2     string\n\tlength int\n}{\n\t// match beginning\n\t{\"cans\", \"can\", 3},\n\t// match end\n\t{\"ebay\", \"bay\", 3},\n\t// gap in the middle\n\t{\"coins\", \"cons\", 4},\n\t// one empty, left\n\t{\"\", \"hello\", 0},\n\t// one empty, right\n\t{\"goodbye\", \"\", 0},\n\t// two empties\n\t{\"\", \"\", 0},\n\t// unicode stuff!\n\t{\"Schüßler\", \"Schüßler\", 8},\n}\n\nfunc TestLongestCommonSubsequence(t *testing.T) {\n\tfor _, tt := range lcstests {\n\t\tlength := LongestCommonSubsequence(tt.s1, tt.s2)\n\t\tif length != tt.length {\n\t\t\tt.Errorf(\"LongestCommonSubsequence('%s', '%s') = %v, want %v\", tt.s1, tt.s2, length, tt.length)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "metaphone.go",
    "content": "package matchr\n\nimport (\n\t\"bytes\"\n\t\"strings\"\n)\n\ntype metaphoneresult struct {\n\t// the maximum number of code values to calculate\n\tmaxLength int\n\n\t// whether to calculate an alternate\n\tcalcAlternate bool\n\n\t// no direct modifications - only through add()\n\tprimary   bytes.Buffer\n\talternate bytes.Buffer\n\n\t// length of the private buffers\n\tPrimaryLength   int\n\tAlternateLength int\n}\n\nfunc newMetaphoneresult(maxLength int, calcAlternate bool) (r *metaphoneresult) {\n\tr = &metaphoneresult{maxLength: maxLength, calcAlternate: calcAlternate}\n\treturn\n}\n\nfunc (r *metaphoneresult) add(c1 string, c2 string) {\n\tif c1 != \"\" {\n\t\tr.primary.WriteString(c1)\n\t\tr.PrimaryLength += len(c1)\n\t}\n\n\tif c2 != \"\" && r.calcAlternate {\n\t\tr.alternate.WriteString(c2)\n\t\tr.AlternateLength += len(c2)\n\t}\n}\n\nfunc (r *metaphoneresult) isComplete() bool {\n\treturn r.PrimaryLength >= r.maxLength && r.AlternateLength >= r.maxLength\n}\n\nfunc (r *metaphoneresult) result() (primary string, alternate string) {\n\tprimary = r.primary.String()\n\tif len(primary) > r.maxLength {\n\t\tprimary = primary[0:r.maxLength]\n\t}\n\talternate = r.alternate.String()\n\tif len(alternate) > r.maxLength {\n\t\talternate = alternate[0:r.maxLength]\n\t}\n\treturn\n}\n\n// utility functions for checking things within a string\nfunc isSlavoGermanic(value string) bool {\n\treturn strings.Contains(value, \"W\") || strings.Contains(value, \"K\") ||\n\t\tstrings.Contains(value, \"CZ\") || strings.Contains(value, \"WITZ\")\n}\n\nfunc isSilentStart(input runestring) bool {\n\tSILENT_START := [...]string{\"GN\", \"KN\", \"PN\", \"WR\", \"PS\"}\n\n\tprefix := input.SafeSubstr(0, 2)\n\n\tfor _, criteria := range SILENT_START {\n\t\tif prefix == criteria {\n\t\t\treturn true\n\t\t}\n\t}\n\n\treturn false\n}\n\nfunc handleVowel(result *metaphoneresult, index int) int {\n\tif index == 0 {\n\t\tresult.add(\"A\", \"A\")\n\t}\n\n\treturn index + 
1\n}\n\n/******************************************************************************\n * Entry handlers for letters.\n *****************************************************************************/\nfunc handleC(input runestring, result *metaphoneresult, index int) int {\n\tif conditionC0(input, index) {\n\t\tresult.add(\"K\", \"K\")\n\t\tindex += 2\n\t} else if index == 0 && input.Contains(index, 6, \"CAESAR\") {\n\t\tresult.add(\"S\", \"S\")\n\t\tindex += 2\n\t} else if input.Contains(index, 2, \"CH\") {\n\t\tindex = handleCH(input, result, index)\n\t} else if input.Contains(index, 2, \"CZ\") &&\n\t\t!input.Contains(index-2, 4, \"WICZ\") {\n\t\tresult.add(\"S\", \"X\")\n\t\tindex += 2\n\t} else if input.Contains(index+1, 3, \"CIA\") {\n\t\tresult.add(\"X\", \"X\")\n\t\tindex += 3\n\t} else if input.Contains(index, 2, \"CC\") &&\n\t\t!(index == 1 && input.SafeAt(0) == 'M') {\n\t\treturn handleCC(input, result, index)\n\t} else if input.Contains(index, 2, \"CK\") ||\n\t\tinput.Contains(index, 2, \"CG\") ||\n\t\tinput.Contains(index, 2, \"CQ\") {\n\t\tresult.add(\"K\", \"K\")\n\t\tindex += 2\n\t} else if input.Contains(index, 2, \"CI\") ||\n\t\tinput.Contains(index, 2, \"CE\") ||\n\t\tinput.Contains(index, 2, \"CY\") {\n\t\tif input.Contains(index, 3, \"CIO\") ||\n\t\t\tinput.Contains(index, 3, \"CIE\") ||\n\t\t\tinput.Contains(index, 3, \"CIA\") {\n\t\t\tresult.add(\"S\", \"X\")\n\t\t} else {\n\t\t\tresult.add(\"S\", \"S\")\n\t\t}\n\t\tindex += 2\n\t} else {\n\t\tresult.add(\"K\", \"K\")\n\t\tif input.Contains(index+1, 2, \" C\") ||\n\t\t\tinput.Contains(index+1, 2, \" Q\") ||\n\t\t\tinput.Contains(index+1, 2, \" G\") {\n\t\t\tindex += 3\n\t\t} else if (input.Contains(index+1, 1, \"C\") ||\n\t\t\tinput.Contains(index+1, 1, \"K\") ||\n\t\t\tinput.Contains(index+1, 1, \"Q\")) &&\n\t\t\t!(input.Contains(index+1, 2, \"CE\") ||\n\t\t\t\tinput.Contains(index+1, 2, \"CI\")) {\n\t\t\tindex += 2\n\t\t} else {\n\t\t\tindex++\n\t\t}\n\t}\n\n\treturn index\n}\n\nfunc 
handleCC(input runestring, result *metaphoneresult, index int) int {\n\tif input.Contains(index+2, 1, \"I\", \"E\", \"H\") &&\n\t\t!input.Contains(index+2, 2, \"HU\") {\n\t\tif (index == 1 && input.SafeAt(index-1) == 'A') ||\n\t\t\t(input.Contains(index-1, 5, \"UCCEE\", \"UCCES\")) {\n\t\t\tresult.add(\"KS\", \"KS\")\n\t\t} else {\n\t\t\tresult.add(\"X\", \"X\")\n\t\t}\n\t\tindex += 3\n\t} else {\n\t\tresult.add(\"K\", \"K\")\n\t\tindex += 2\n\t}\n\treturn index\n}\n\nfunc handleCH(input runestring, result *metaphoneresult, index int) int {\n\tif index > 0 && input.Contains(index, 4, \"CHAE\") {\n\t\tresult.add(\"K\", \"X\")\n\t\treturn index + 2\n\t} else if conditionCH0(input, index) {\n\t\tresult.add(\"K\", \"K\")\n\t\treturn index + 2\n\t\t// TODO: combine this condition with the one above?\n\t} else if conditionCH1(input, index) {\n\t\tresult.add(\"K\", \"K\")\n\t\treturn index + 2\n\t} else {\n\t\tif index > 0 {\n\t\t\tif input.Contains(0, 2, \"MC\") {\n\t\t\t\tresult.add(\"K\", \"K\")\n\t\t\t} else {\n\t\t\t\tresult.add(\"X\", \"K\")\n\t\t\t}\n\t\t} else {\n\t\t\tresult.add(\"X\", \"X\")\n\t\t}\n\t\treturn index + 2\n\t}\n}\n\nfunc handleD(input runestring, result *metaphoneresult, index int) int {\n\tif input.Contains(index, 2, \"DG\") {\n\t\tif input.Contains(index+2, 1, \"I\", \"E\", \"Y\") {\n\t\t\tresult.add(\"J\", \"J\")\n\t\t\tindex += 3\n\t\t} else {\n\t\t\tresult.add(\"TK\", \"TK\")\n\t\t\tindex += 2\n\t\t}\n\t} else if input.Contains(index, 2, \"DT\", \"DD\") {\n\t\tresult.add(\"T\", \"T\")\n\t\tindex += 2\n\t} else {\n\t\tresult.add(\"T\", \"T\")\n\t\tindex++\n\t}\n\treturn index\n}\n\nfunc handleG(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {\n\tif input.SafeAt(index+1) == 'H' {\n\t\tindex = handleGH(input, result, index)\n\t} else if input.SafeAt(index+1) == 'N' {\n\t\tif index == 1 && isVowel(input.SafeAt(0)) && !slavoGermanic {\n\t\t\tresult.add(\"KN\", \"N\")\n\t\t} else if !input.Contains(index+2, 2, \"EY\") 
&& input.SafeAt(index+1) != 'Y' && !slavoGermanic {\n\t\t\tresult.add(\"N\", \"KN\")\n\t\t} else {\n\t\t\tresult.add(\"KN\", \"KN\")\n\t\t}\n\t\tindex += 2\n\t} else if input.Contains(index+1, 2, \"LI\") && !slavoGermanic {\n\t\tresult.add(\"KL\", \"L\")\n\t\tindex += 2\n\t} else if index == 0 && (input.SafeAt(index+1) == 'Y' ||\n\t\tinput.Contains(index+1, 2, \"ES\", \"EP\", \"EB\", \"EL\", \"EY\", \"IB\", \"IL\", \"IN\", \"IE\", \"EI\", \"ER\")) {\n\t\tresult.add(\"K\", \"J\")\n\t\tindex += 2\n\t} else if (input.Contains(index+1, 2, \"ER\") ||\n\t\tinput.SafeAt(index+1) == 'Y') &&\n\t\t!input.Contains(0, 6, \"DANGER\", \"RANGER\", \"MANGER\") &&\n\t\t!input.Contains(index-1, 1, \"E\", \"I\") &&\n\t\t!input.Contains(index-1, 3, \"RGY\", \"OGY\") {\n\t\tresult.add(\"K\", \"J\")\n\t\tindex += 2\n\t} else if input.Contains(index+1, 1, \"E\", \"I\", \"Y\") ||\n\t\tinput.Contains(index-1, 4, \"AGGI\", \"OGGI\") {\n\t\tif input.Contains(0, 4, \"VAN \", \"VON \") ||\n\t\t\tinput.Contains(0, 3, \"SCH\") ||\n\t\t\tinput.Contains(index+1, 2, \"ET\") {\n\t\t\tresult.add(\"K\", \"K\")\n\t\t} else if input.Contains(index+1, 3, \"IER\") {\n\t\t\tresult.add(\"J\", \"J\")\n\t\t} else {\n\t\t\tresult.add(\"J\", \"K\")\n\t\t}\n\t\tindex += 2\n\t} else if input.SafeAt(index+1) == 'G' {\n\t\tresult.add(\"K\", \"K\")\n\t\tindex += 2\n\t} else {\n\t\tresult.add(\"K\", \"K\")\n\t\tindex++\n\t}\n\treturn index\n}\n\nfunc handleGH(input runestring, result *metaphoneresult, index int) int {\n\tif index > 0 && !isVowel(input.SafeAt(index-1)) {\n\t\tresult.add(\"K\", \"K\")\n\t\tindex += 2\n\t} else if index == 0 {\n\t\tif input.SafeAt(index+2) == 'I' {\n\t\t\tresult.add(\"J\", \"J\")\n\t\t} else {\n\t\t\tresult.add(\"K\", \"K\")\n\t\t}\n\t\tindex += 2\n\t} else if (index > 1 && input.Contains(index-2, 1, \"B\", \"H\", \"D\")) ||\n\t\t(index > 2 && input.Contains(index-3, 1, \"B\", \"H\", \"D\")) ||\n\t\t(index > 3 && input.Contains(index-4, 1, \"B\", \"H\")) {\n\t\tindex += 2\n\t} else 
{\n\t\tif index > 2 && input.SafeAt(index-1) == 'U' &&\n\t\t\tinput.Contains(index-3, 1, \"C\", \"G\", \"L\", \"R\", \"T\") {\n\t\t\tresult.add(\"F\", \"F\")\n\t\t} else if index > 0 && input.SafeAt(index-1) != 'I' {\n\t\t\tresult.add(\"K\", \"K\")\n\t\t}\n\t\tindex += 2\n\t}\n\treturn index\n}\n\nfunc handleH(input runestring, result *metaphoneresult, index int) int {\n\tif (index == 0 || isVowel(input.SafeAt(index-1))) &&\n\t\tisVowel(input.SafeAt(index+1)) {\n\t\tresult.add(\"H\", \"H\")\n\t\tindex += 2\n\t} else {\n\t\tindex++\n\t}\n\treturn index\n}\n\nfunc handleJ(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {\n\tif input.Contains(index, 4, \"JOSE\") || input.Contains(0, 4, \"SAN \") {\n\t\tif (index == 0 && (input.SafeAt(index+4) == ' ') ||\n\t\t\tlen(input) == 4) || input.Contains(0, 4, \"SAN \") {\n\t\t\tresult.add(\"H\", \"H\")\n\t\t} else {\n\t\t\tresult.add(\"J\", \"H\")\n\t\t}\n\t\tindex++\n\t} else {\n\t\tif index == 0 && !input.Contains(index, 4, \"JOSE\") {\n\t\t\tresult.add(\"J\", \"A\")\n\t\t} else if isVowel(input.SafeAt(index-1)) && !slavoGermanic &&\n\t\t\t(input.SafeAt(index+1) == 'A' || input.SafeAt(index+1) == 'O') {\n\t\t\tresult.add(\"J\", \"H\")\n\t\t} else if index == (len(input) - 1) {\n\t\t\tresult.add(\"J\", \" \")\n\t\t} else if !input.Contains(index+1, 1,\n\t\t\t\"L\", \"T\", \"K\", \"S\", \"N\", \"M\", \"B\", \"Z\") &&\n\t\t\t!input.Contains(index-1, 1, \"S\", \"K\", \"L\") {\n\t\t\tresult.add(\"J\", \"J\")\n\t\t}\n\n\t\tif input.SafeAt(index+1) == 'J' {\n\t\t\tindex += 2\n\t\t} else {\n\t\t\tindex++\n\t\t}\n\t}\n\treturn index\n}\n\nfunc handleL(input runestring, result *metaphoneresult, index int) int {\n\tif input.SafeAt(index+1) == 'L' {\n\t\tif conditionL0(input, index) {\n\t\t\tresult.add(\"L\", \"\")\n\t\t} else {\n\t\t\tresult.add(\"L\", \"L\")\n\t\t}\n\t\tindex += 2\n\t} else {\n\t\tresult.add(\"L\", \"L\")\n\t\tindex++\n\t}\n\treturn index\n}\n\nfunc handleP(input runestring, result 
*metaphoneresult, index int) int {\n\tif input.SafeAt(index+1) == 'H' {\n\t\tresult.add(\"F\", \"F\")\n\t\tindex += 2\n\t} else {\n\t\tresult.add(\"P\", \"P\")\n\t\tif input.Contains(index+1, 1, \"P\", \"B\") {\n\t\t\tindex += 2\n\t\t} else {\n\t\t\tindex++\n\t\t}\n\t}\n\treturn index\n}\n\nfunc handleR(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {\n\tif index == (len(input)-1) && !slavoGermanic &&\n\t\tinput.Contains(index-2, 2, \"IE\") &&\n\t\t!input.Contains(index-4, 2, \"ME\", \"MA\") {\n\t\tresult.add(\"\", \"R\")\n\t} else {\n\t\tresult.add(\"R\", \"R\")\n\t}\n\n\tif input.SafeAt(index+1) == 'R' {\n\t\tindex += 2\n\t} else {\n\t\tindex++\n\t}\n\treturn index\n}\n\nfunc handleS(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {\n\tif input.Contains(index-1, 3, \"ISL\", \"YSL\") {\n\t\tindex++\n\t} else if index == 0 && input.Contains(index, 5, \"SUGAR\") {\n\t\tresult.add(\"X\", \"S\")\n\t\tindex++\n\t} else if input.Contains(index, 2, \"SH\") {\n\t\tif input.Contains(index+1, 4, \"HEIM\", \"HOEK\", \"HOLM\", \"HOLZ\") {\n\t\t\tresult.add(\"S\", \"S\")\n\t\t} else {\n\t\t\tresult.add(\"X\", \"X\")\n\t\t}\n\t\tindex += 2\n\t} else if input.Contains(index, 3, \"SIO\", \"SIA\") ||\n\t\tinput.Contains(index, 4, \"SIAN\") {\n\t\tif slavoGermanic {\n\t\t\tresult.add(\"S\", \"S\")\n\t\t} else {\n\t\t\tresult.add(\"S\", \"X\")\n\t\t}\n\t\tindex += 3\n\t} else if (index == 0 && input.Contains(index+1, 1, \"M\", \"N\", \"L\", \"W\")) ||\n\t\tinput.Contains(index+1, 1, \"Z\") {\n\t\tresult.add(\"S\", \"X\")\n\t\tif input.Contains(index+1, 1, \"Z\") {\n\t\t\tindex += 2\n\t\t} else {\n\t\t\tindex++\n\t\t}\n\t} else if input.Contains(index, 2, \"SC\") {\n\t\tindex = handleSC(input, result, index)\n\t} else {\n\t\tif index == len(input)-1 &&\n\t\t\tinput.Contains(index-2, 2, \"AI\", \"OI\") {\n\t\t\tresult.add(\"\", \"S\")\n\t\t} else {\n\t\t\tresult.add(\"S\", \"S\")\n\t\t}\n\n\t\tif input.Contains(index+1, 1, 
\"S\", \"Z\") {\n\t\t\tindex += 2\n\t\t} else {\n\t\t\tindex++\n\t\t}\n\t}\n\treturn index\n}\n\nfunc handleSC(input runestring, result *metaphoneresult, index int) int {\n\tif input.SafeAt(index+2) == 'H' {\n\t\tif input.Contains(index+3, 2, \"OO\", \"ER\", \"EN\", \"UY\", \"ED\", \"EM\") {\n\t\t\tif input.Contains(index+3, 2, \"ER\", \"EN\") {\n\t\t\t\tresult.add(\"X\", \"SK\")\n\t\t\t} else {\n\t\t\t\tresult.add(\"SK\", \"SK\")\n\t\t\t}\n\t\t} else {\n\t\t\tif index == 0 && !isVowel(input.SafeAt(3)) && input.SafeAt(3) != 'W' {\n\t\t\t\tresult.add(\"X\", \"S\")\n\t\t\t} else {\n\t\t\t\tresult.add(\"X\", \"X\")\n\t\t\t}\n\t\t}\n\t} else if input.Contains(index+2, 1, \"I\", \"E\", \"Y\") {\n\t\tresult.add(\"S\", \"S\")\n\t} else {\n\t\tresult.add(\"SK\", \"SK\")\n\t}\n\tindex += 3\n\n\treturn index\n}\n\nfunc handleT(input runestring, result *metaphoneresult, index int) int {\n\tif input.Contains(index, 4, \"TION\") {\n\t\tresult.add(\"X\", \"X\")\n\t\tindex += 3\n\t} else if input.Contains(index, 3, \"TIA\", \"TCH\") {\n\t\tresult.add(\"X\", \"X\")\n\t\tindex += 3\n\t} else if input.Contains(index, 2, \"TH\") || input.Contains(index, 3, \"TTH\") {\n\t\tif input.Contains(index+2, 2, \"OM\", \"AM\") ||\n\t\t\tinput.Contains(0, 4, \"VAN \", \"VON \") ||\n\t\t\tinput.Contains(0, 3, \"SCH\") {\n\t\t\tresult.add(\"T\", \"T\")\n\t\t} else {\n\t\t\tresult.add(\"0\", \"T\")\n\t\t}\n\t\tindex += 2\n\t} else {\n\t\tresult.add(\"T\", \"T\")\n\t\tif input.Contains(index+1, 1, \"T\", \"D\") {\n\t\t\tindex += 2\n\t\t} else {\n\t\t\tindex++\n\t\t}\n\t}\n\treturn index\n}\n\nfunc handleW(input runestring, result *metaphoneresult, index int) int {\n\tif input.Contains(index, 2, \"WR\") {\n\t\tresult.add(\"R\", \"R\")\n\t\tindex += 2\n\t} else {\n\t\tif index == 0 && (isVowel(input.SafeAt(index+1)) ||\n\t\t\tinput.Contains(index, 2, \"WH\")) {\n\t\t\tif isVowel(input.SafeAt(index + 1)) {\n\t\t\t\tresult.add(\"A\", \"F\")\n\t\t\t} else {\n\t\t\t\tresult.add(\"A\", 
\"A\")\n\t\t\t}\n\t\t\tindex++\n\t\t} else if (index == len(input)-1 && isVowel(input.SafeAt(index-1))) ||\n\t\t\tinput.Contains(index-1, 5, \"EWSKI\", \"EWSKY\", \"OWSKI\", \"OWSKY\") ||\n\t\t\tinput.Contains(0, 3, \"SCH\") {\n\t\t\tresult.add(\"\", \"F\")\n\t\t\tindex++\n\t\t} else if input.Contains(index, 4, \"WICZ\", \"WITZ\") {\n\t\t\tresult.add(\"TS\", \"FX\")\n\t\t\tindex += 4\n\t\t} else {\n\t\t\tindex++\n\t\t}\n\t}\n\treturn index\n}\n\nfunc handleX(input runestring, result *metaphoneresult, index int) int {\n\tif index == 0 {\n\t\tresult.add(\"S\", \"S\")\n\t\tindex++\n\t} else {\n\t\tif !((index == len(input)-1) &&\n\t\t\t(input.Contains(index-3, 3, \"IAU\", \"EAU\") ||\n\t\t\t\tinput.Contains(index-2, 2, \"AU\", \"OU\"))) {\n\t\t\tresult.add(\"KS\", \"KS\")\n\t\t}\n\n\t\tif input.Contains(index+1, 1, \"C\", \"X\") {\n\t\t\tindex += 2\n\t\t} else {\n\t\t\tindex++\n\t\t}\n\t}\n\treturn index\n}\n\nfunc handleZ(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {\n\tif input.SafeAt(index+1) == 'H' {\n\t\tresult.add(\"J\", \"J\")\n\t} else {\n\t\tif input.Contains(index+1, 2, \"ZO\", \"ZI\", \"ZA\") ||\n\t\t\t(slavoGermanic && (index > 0 && input.SafeAt(index-1) != 'T')) {\n\t\t\tresult.add(\"S\", \"TS\")\n\t\t} else {\n\t\t\tresult.add(\"S\", \"S\")\n\t\t}\n\t}\n\n\tif input.SafeAt(index+1) == 'Z' {\n\t\tindex += 2\n\t} else {\n\t\tindex++\n\t}\n\treturn index\n}\n\n/******************************************************************************\n * Complex conditional handlers for letters\n *****************************************************************************/\nfunc conditionC0(input runestring, index int) bool {\n\tif input.Contains(index, 4, \"CHIA\") {\n\t\treturn true\n\t} else if index <= 1 {\n\t\treturn false\n\t} else if isVowel(input.SafeAt(index - 2)) {\n\t\treturn false\n\t} else if !input.Contains(index-1, 3, \"ACH\") {\n\t\treturn false\n\t} else {\n\t\tc := input.SafeAt(index + 2)\n\t\treturn (c != 'I' && c 
!= 'E') ||\n\t\t\t(input.Contains(index-2, 6, \"BACHER\") ||\n\t\t\t\tinput.Contains(index-2, 6, \"MACHER\"))\n\t}\n}\n\nfunc conditionCH0(input runestring, index int) bool {\n\tif index != 0 {\n\t\treturn false\n\t} else if !input.Contains(index+1, 5, \"HARAC\", \"HARIS\") &&\n\t\t!input.Contains(index+1, 3, \"HOR\", \"HYM\", \"HIA\", \"HEM\") {\n\t\treturn false\n\t} else if input.Contains(0, 5, \"CHORE\") {\n\t\treturn false\n\t} else {\n\t\treturn true\n\t}\n}\n\nfunc conditionCH1(input runestring, index int) bool {\n\t// good god this is ugly\n\treturn (input.Contains(0, 4, \"VAN \", \"VON \") || input.Contains(0, 3, \"SCH\")) ||\n\t\tinput.Contains(index-2, 6, \"ORCHES\", \"ARCHIT\", \"ORCHID\") ||\n\t\tinput.Contains(index+2, 1, \"T\", \"S\") ||\n\t\t((input.Contains(index-1, 1, \"A\", \"O\", \"U\", \"E\") || index == 0) &&\n\t\t\t(input.Contains(index+2, 1, \"L\", \"R\", \"N\", \"M\", \"B\", \"H\", \"F\", \"V\", \"W\", \" \") ||\n\t\t\t\tindex+1 == len(input)-1))\n}\n\nfunc conditionL0(input runestring, index int) bool {\n\tif index == (len(input)-3) &&\n\t\tinput.Contains(index-1, 4, \"ILLO\", \"ILLA\", \"ALLE\") {\n\t\treturn true\n\t} else if (input.Contains(len(input)-2, 2, \"AS\", \"OS\") ||\n\t\tinput.Contains(len(input)-1, 1, \"A\", \"O\")) &&\n\t\t(input.Contains(index-1, 4, \"ALLE\")) {\n\t\treturn true\n\t} else {\n\t\treturn false\n\t}\n}\n\nfunc conditionM0(input runestring, index int) bool {\n\tif input.SafeAt(index+1) == 'M' {\n\t\treturn true\n\t}\n\n\treturn input.Contains(index-1, 3, \"UMB\") &&\n\t\t((index+1) == (len(input)-1) ||\n\t\t\tinput.Contains(index+2, 2, \"ER\"))\n}\n\n// DoubleMetaphone computes the Double-Metaphone value of the input string.\n// This value is a phonetic representation of how the string sounds, with\n// affordances for many different language dialects. 
It was originally\n// developed by Lawrence Phillips in the 1990s.\n//\n// More information about this algorithm can be found on Wikipedia at\n// http://en.wikipedia.org/wiki/Metaphone.\nfunc DoubleMetaphone(s1 string) (string, string) {\n\t// trim, upper space\n\ts1 = cleanInput(s1)\n\n\t// structure to traverse the string by code point, not byte\n\tinput := runestring(s1)\n\n\tslavoGermanic := isSlavoGermanic(s1)\n\n\t// where we are in the string\n\tindex := 0\n\n\tif isSilentStart(input) {\n\t\tindex += 1\n\t}\n\n\tresult := newMetaphoneresult(4, true)\n\n\tfor !result.isComplete() && index <= len(input)-1 {\n\t\tc := rune(input.SafeAt(index))\n\t\tswitch c {\n\t\tcase 'A', 'E', 'I', 'O', 'U', 'Y':\n\t\t\tindex = handleVowel(result, index)\n\t\tcase 'B':\n\t\t\tresult.add(\"P\", \"P\")\n\t\t\tif input.SafeAt(index+1) == 'B' {\n\t\t\t\tindex += 2\n\t\t\t} else {\n\t\t\t\tindex++\n\t\t\t}\n\t\tcase 'Ç':\n\t\t\tresult.add(\"S\", \"S\")\n\t\t\tindex++\n\t\tcase 'C':\n\t\t\tindex = handleC(input, result, index)\n\t\tcase 'D':\n\t\t\tindex = handleD(input, result, index)\n\t\tcase 'F':\n\t\t\tresult.add(\"F\", \"F\")\n\t\t\tif input.SafeAt(index+1) == 'F' {\n\t\t\t\tindex += 2\n\t\t\t} else {\n\t\t\t\tindex++\n\t\t\t}\n\t\tcase 'G':\n\t\t\tindex = handleG(input, result, index, slavoGermanic)\n\t\tcase 'H':\n\t\t\tindex = handleH(input, result, index)\n\t\tcase 'J':\n\t\t\tindex = handleJ(input, result, index, slavoGermanic)\n\t\tcase 'K':\n\t\t\tresult.add(\"K\", \"K\")\n\t\t\tif input.SafeAt(index+1) == 'K' {\n\t\t\t\tindex += 2\n\t\t\t} else {\n\t\t\t\tindex++\n\t\t\t}\n\t\tcase 'L':\n\t\t\tindex = handleL(input, result, index)\n\t\tcase 'M':\n\t\t\tresult.add(\"M\", \"M\")\n\t\t\tif conditionM0(input, index) {\n\t\t\t\tindex += 2\n\t\t\t} else {\n\t\t\t\tindex++\n\t\t\t}\n\t\tcase 'N':\n\t\t\tresult.add(\"N\", \"N\")\n\t\t\tif input.SafeAt(index+1) == 'N' {\n\t\t\t\tindex += 2\n\t\t\t} else {\n\t\t\t\tindex++\n\t\t\t}\n\t\tcase 'Ñ':\n\t\t\tresult.add(\"N\", 
\"N\")\n\t\t\tindex++\n\t\tcase 'P':\n\t\t\tindex = handleP(input, result, index)\n\t\tcase 'Q':\n\t\t\tresult.add(\"K\", \"K\")\n\t\t\tif input.SafeAt(index+1) == 'Q' {\n\t\t\t\tindex += 2\n\t\t\t} else {\n\t\t\t\tindex++\n\t\t\t}\n\t\tcase 'R':\n\t\t\tindex = handleR(input, result, index, slavoGermanic)\n\t\tcase 'S':\n\t\t\tindex = handleS(input, result, index, slavoGermanic)\n\t\tcase 'T':\n\t\t\tindex = handleT(input, result, index)\n\t\tcase 'V':\n\t\t\tresult.add(\"F\", \"F\")\n\t\t\tif input.SafeAt(index+1) == 'V' {\n\t\t\t\tindex += 2\n\t\t\t} else {\n\t\t\t\tindex++\n\t\t\t}\n\t\tcase 'W':\n\t\t\tindex = handleW(input, result, index)\n\t\tcase 'X':\n\t\t\tindex = handleX(input, result, index)\n\t\tcase 'Z':\n\t\t\tindex = handleZ(input, result, index, slavoGermanic)\n\t\tdefault:\n\t\t\tindex++\n\t\t}\n\n\t}\n\n\treturn result.result()\n}\n"
  },
  {
    "path": "metaphone_test.go",
    "content": "package matchr\n\nimport (\n\t\"bufio\"\n\t\"compress/gzip\"\n\t\"os\"\n\t\"strings\"\n\t\"testing\"\n)\n\nfunc TestDoubleMetaphone(t *testing.T) {\n\t// load gzipped corpus\n\tf, err := os.Open(\"double_metaphone_corpus.txt.gz\")\n\tif err != nil {\n\t\tpanic(\"Error opening file double_metaphone_corpus.txt.gz! Exiting.\")\n\t}\n\tdefer f.Close()\n\n\tg, err := gzip.NewReader(f)\n\tif err != nil {\n\t\tpanic(\"Error with supposedly gzipped file double_metaphone_corpus.txt.gz! Exiting.\")\n\t}\n\n\tr := bufio.NewReader(g)\n\n\tline, err := r.ReadString('\\n')\n\tfor err == nil {\n\t\tline = strings.TrimRight(line, \"\\n\")\n\t\tv := strings.Split(line, \"|\")\n\n\t\tmetaphone, alternate := DoubleMetaphone(v[0])\n\t\tif metaphone != v[1] || alternate != v[2] {\n\t\t\tt.Errorf(\"DoubleMetaphone('%s') = (%v, %v), want (%v, %v)\", v[0], metaphone, alternate, v[1], v[2])\n\t\t\tt.FailNow()\n\t\t}\n\n\t\tline, err = r.ReadString('\\n')\n\t}\n}\n"
  },
  {
    "path": "nysiis.go",
    "content": "package matchr\n\n// NYSIIS computes the NYSIIS phonetic encoding of the input string. It is a\n// modification of the traditional Soundex algorithm.\nfunc NYSIIS(s1 string) string {\n\tcleans1 := runestring(cleanInput(s1))\n\tinput := runestring(make([]rune, 0, len(s1)))\n\n\t// The output can't be larger than the string itself\n\toutput := runestring(make([]rune, 0, len(s1)))\n\n\t// 0. Remove all non-ASCII characters\n\tfor _, v := range cleans1 {\n\t\tif v >= 65 && v <= 90 {\n\t\t\tinput = append(input, v)\n\t\t}\n\t}\n\n\tif len(input) == 0 {\n\t\treturn \"\"\n\t}\n\n\t// 1. Transcoding first characters\n\tswitch input[0] {\n\tcase 'M':\n\t\tif input.SafeSubstr(0, 3) == \"MAC\" {\n\t\t\t// MAC -> MCC\n\t\t\tinput[1] = 'C'\n\t\t}\n\tcase 'K':\n\t\tif input.SafeSubstr(0, 2) == \"KN\" {\n\t\t\t// KN -> NN\n\t\t\tinput[0] = 'N'\n\t\t} else {\n\t\t\t// K -> C\n\t\t\tinput[0] = 'C'\n\t\t}\n\tcase 'P':\n\t\tnext := input.SafeAt(1)\n\t\tif next == 'H' {\n\t\t\t// PH -> FF\n\t\t\tinput[0] = 'F'\n\t\t\tinput[1] = 'F'\n\t\t} else if next == 'F' {\n\t\t\t// PF -> FF\n\t\t\tinput[0] = 'F'\n\t\t}\n\tcase 'S':\n\t\tif input.SafeSubstr(0, 3) == \"SCH\" {\n\t\t\tinput[1] = 'S'\n\t\t\tinput[2] = 'S'\n\t\t}\n\t}\n\n\t// 2. Transcoding last characters\n\tswitch input.SafeSubstr(len(input)-2, 2) {\n\tcase \"EE\", \"IE\":\n\t\t// EE, IE -> Y\n\t\tinput.Del(len(input) - 2)\n\t\tinput[len(input)-1] = 'Y'\n\tcase \"DT\", \"RT\", \"RD\", \"NT\", \"ND\":\n\t\t// DT, RT, RD, NT, ND -> D\n\t\tinput.Del(len(input) - 2)\n\t\tinput[len(input)-1] = 'D'\n\t}\n\n\t// 3. 
First character of key = first character of name\n\toutput = append(output, input[0])\n\tlast := input[0]\n\n\tfor i := 1; i < len(input); i++ {\n\t\tc := input[i]\n\t\tswitch c {\n\t\tcase 'A', 'I', 'O', 'U':\n\t\t\t// A, E, I, O, U -> A (E is separate)\n\t\t\tinput[i] = 'A'\n\t\tcase 'E':\n\t\t\t// EV -> AF, else A\n\t\t\tif input.SafeAt(i+1) == 'V' {\n\t\t\t\tinput[i+1] = 'F'\n\t\t\t}\n\t\t\tinput[i] = 'A'\n\t\tcase 'Q':\n\t\t\t// Q -> G\n\t\t\tinput[i] = 'G'\n\t\tcase 'Z':\n\t\t\t// Z -> S\n\t\t\tinput[i] = 'S'\n\t\tcase 'M':\n\t\t\t// M -> N\n\t\t\tinput[i] = 'N'\n\t\tcase 'K':\n\t\t\t// KN -> N, else K -> C\n\t\t\tif input.SafeAt(i+1) == 'N' {\n\t\t\t\tinput.Del(i)\n\t\t\t} else {\n\t\t\t\tinput[i] = 'C'\n\t\t\t}\n\t\tcase 'S':\n\t\t\t// SCH -> SSS\n\t\t\tif input.SafeSubstr(i, 3) == \"SCH\" {\n\t\t\t\tinput[i+1] = 'S'\n\t\t\t\tinput[i+2] = 'S'\n\t\t\t}\n\t\tcase 'P':\n\t\t\t// PH -> FF\n\t\t\tif input.SafeAt(i+1) == 'H' {\n\t\t\t\tinput[i] = 'F'\n\t\t\t\tinput[i+1] = 'F'\n\t\t\t}\n\t\tcase 'H':\n\t\t\t// H -> $(previous character) if previous character or\n\t\t\t// next character is a non-vowel\n\t\t\tprev := input.SafeAt(i - 1)\n\t\t\tnext := input.SafeAt(i + 1)\n\t\t\tif !isVowelNoY(prev) || !isVowelNoY(next) {\n\t\t\t\tinput[i] = prev\n\t\t\t}\n\t\tcase 'W':\n\t\t\tprev := input.SafeAt(i - 1)\n\t\t\tif isVowelNoY(prev) {\n\t\t\t\tinput[i] = prev\n\t\t\t}\n\t\t}\n\n\t\tif input[i] != last && input[i] != 0 {\n\t\t\toutput = append(output, input[i])\n\t\t}\n\t\tlast = input[i]\n\t}\n\n\t// have to be careful here because we've already added the first\n\t// key value\n\tif len(output) > 1 {\n\t\t// remove trailing s\n\t\tif output.SafeAt(len(output)-1) == 'S' {\n\t\t\toutput.Del(len(output) - 1)\n\t\t}\n\n\t\t// trailing AY -> Y\n\t\tif len(output) > 2 && output.SafeSubstr(len(output)-2, 2) == \"AY\" {\n\t\t\toutput.Del(len(output) - 2)\n\t\t}\n\n\t\t// trailing A -> remove it\n\t\tif output.SafeAt(len(output)-1) == 'A' {\n\t\t\toutput.Del(len(output) - 
1)\n\t\t}\n\t}\n\n\tif len(output) > 6 {\n\t\treturn string(output[0:6])\n\t} else {\n\t\treturn string(output)\n\t}\n}\n"
  },
  {
    "path": "nysiis_test.go",
    "content": "package matchr\n\nimport \"testing\"\n\nvar nysiistests = []struct {\n\ts1     string\n\tnysiis string\n}{\n\t{\"knight\", \"NAGT\"},\n\t{\"mitchell\", \"MATCAL\"},\n\t{\"o'daniel\", \"ODANAL\"},\n\t{\"brown sr\", \"BRANSR\"},\n\t{\"browne III\", \"BRAN\"},\n\t{\"browne IV\", \"BRANAV\"},\n\t{\"O'Banion\", \"OBANAN\"},\n\t{\"Mclaughlin\", \"MCLAGL\"},\n\t{\"McCormack\", \"MCARNA\"},\n\t{\"Chapman\", \"CAPNAN\"},\n\t{\"Silva\", \"SALV\"},\n\t{\"McDonald\", \"MCDANA\"},\n\t{\"Lawson\", \"LASAN\"},\n\t{\"Jacobs\", \"JACAB\"},\n\t{\"Greene\", \"GRAN\"},\n\t{\"O'Brien\", \"OBRAN\"},\n\t{\"Morrison\", \"MARASA\"},\n\t{\"Larson\", \"LARSAN\"},\n\t{\"Willis\", \"WAL\"},\n\t{\"Mackenzie\", \"MCANSY\"},\n\t{\"Carr\", \"CAR\"},\n\t{\"Lawrence\", \"LARANC\"},\n\t{\"Matthews\", \"MAT\"},\n\t{\"Richards\", \"RACARD\"},\n\t{\"Bishop\", \"BASAP\"},\n\t{\"Franklin\", \"FRANCL\"},\n\t{\"McDaniel\", \"MCDANA\"},\n\t{\"Harper\", \"HARPAR\"},\n\t{\"Lynch\", \"LYNC\"},\n\t{\"Watkins\", \"WATCAN\"},\n\t{\"Carlson\", \"CARLSA\"},\n\t{\"Wheeler\", \"WALAR\"},\n\t{\"Louis XVI\", \"LASXV\"},\n\t{\"2002\", \"\"},\n\t{\"1/2\", \"\"},\n\t{\"\", \"\"},\n}\n\n// NYSIIS\nfunc TestNYIIS(t *testing.T) {\n\tfor _, tt := range nysiistests {\n\t\tnysiis := NYSIIS(tt.s1)\n\t\tif nysiis != tt.nysiis {\n\t\t\tt.Errorf(\"NYSIIS('%s') = %v, want %v\", tt.s1, nysiis, tt.nysiis)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "osa.go",
    "content": "package matchr\n\n// OSA computes the Optimal String Alignment distance between two\n// strings. The returned value - distance - is the number of insertions,\n// deletions, substitutions, and transpositions it takes to transform one\n// string (s1) into another (s2). Each step in the transformation \"costs\"\n// one distance point. It is similar to Damerau-Levenshtein, but is simpler\n// because it does not allow multiple edits on any substring.\nfunc OSA(s1 string, s2 string) (distance int) {\n\t// index by code point, not byte\n\tr1 := []rune(s1)\n\tr2 := []rune(s2)\n\n\trows := len(r1) + 1\n\tcols := len(r2) + 1\n\n\tvar i, j, d1, d2, d3, d_now, cost int\n\n\tdist := make([]int, rows*cols)\n\n\tfor i = 0; i < rows; i++ {\n\t\tdist[i*cols] = i\n\t}\n\n\tfor j = 0; j < cols; j++ {\n\t\tdist[j] = j\n\t}\n\n\tfor i = 1; i < rows; i++ {\n\t\tfor j = 1; j < cols; j++ {\n\t\t\tif r1[i-1] == r2[j-1] {\n\t\t\t\tcost = 0\n\t\t\t} else {\n\t\t\t\tcost = 1\n\t\t\t}\n\n\t\t\td1 = dist[((i-1)*cols)+j] + 1\n\t\t\td2 = dist[(i*cols)+(j-1)] + 1\n\t\t\td3 = dist[((i-1)*cols)+(j-1)] + cost\n\n\t\t\td_now = min(d1, min(d2, d3))\n\n\t\t\tif i > 2 && j > 2 && r1[i-1] == r2[j-2] &&\n\t\t\t\tr1[i-2] == r2[j-1] {\n\t\t\t\td1 = dist[((i-2)*cols)+(j-2)] + cost\n\t\t\t\td_now = min(d_now, d1)\n\t\t\t}\n\n\t\t\tdist[(i*cols)+j] = d_now\n\t\t}\n\t}\n\n\tdistance = dist[(cols*rows)-1]\n\n\treturn\n}\n"
  },
  {
    "path": "osa_test.go",
    "content": "package matchr\n\nimport \"testing\"\n\nvar osatests = []struct {\n\ts1   string\n\ts2   string\n\tdist int\n}{\n\t// insertion\n\t{\"car\", \"cars\", 1},\n\t// substitution\n\t{\"library\", \"librari\", 1},\n\t// deletion\n\t{\"library\", \"librar\", 1},\n\t// transposition\n\t{\"library\", \"librayr\", 1},\n\t// one empty, left\n\t{\"\", \"library\", 7},\n\t// one empty, right\n\t{\"library\", \"\", 7},\n\t// two empties\n\t{\"\", \"\", 0},\n\t// unicode stuff!\n\t{\"Schüßler\", \"Schübler\", 1},\n\t{\"Schüßler\", \"Schußler\", 1},\n\t{\"Schüßler\", \"Schüßler\", 0},\n\t{\"Schßüler\", \"Schüßler\", 1},\n\t{\"Schüßler\", \"Schüler\", 1},\n\t{\"Schüßler\", \"Schüßlers\", 1},\n\t// difference between DL and OSA. This is OSA, so it should be 3.\n\t{\"ca\", \"abc\", 3},\n}\n\n// OSA (Optimal String Alignment)\nfunc TestOSA(t *testing.T) {\n\tfor _, tt := range osatests {\n\t\tdist := OSA(tt.s1, tt.s2)\n\t\tif dist != tt.dist {\n\t\t\tt.Errorf(\"OSA('%s', '%s') = %v, want %v\", tt.s1, tt.s2, dist, tt.dist)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "phonex.go",
    "content": "package matchr\n\nfunc preProcess(input []rune) []rune {\n\toutput := runestring(make([]rune, 0, len(input)))\n\n\t// 0. Remove all non-ASCII characters\n\tfor _, v := range input {\n\t\tif v >= 65 && v <= 90 {\n\t\t\toutput = append(output, v)\n\t\t}\n\t}\n\n\t// 1. Remove all trailing 'S' characters at the end of the name\n\tfor i := len(output) - 1; i >= 0 && output[i] == 'S'; i-- {\n\t\toutput.Del(i)\n\t}\n\n\t// 2. Convert leading letter pairs as follows\n\t//    KN -> N, PH -> F, WR -> R\n\tswitch output.SafeSubstr(0, 2) {\n\tcase \"KN\":\n\t\toutput = output[1:]\n\tcase \"PH\":\n\t\toutput[0] = 'F' // H will be ignored anyway\n\tcase \"WR\":\n\t\toutput = output[1:]\n\t}\n\n\t// 3a. Convert leading single letters as follows:\n\t//    H         -> Remove\n\tif output.SafeAt(0) == 'H' {\n\t\toutput = output[1:]\n\t}\n\n\t// 3a. Convert leading single letters as follows:\n\t//    E,I,O,U,Y -> A\n\t//    P         -> B\n\t//    V         -> F\n\t//    K,Q       -> C\n\t//    J         -> G\n\t//    Z         -> S\n\tswitch output.SafeAt(0) {\n\tcase 'E', 'I', 'O', 'U', 'Y':\n\t\toutput[0] = 'A'\n\tcase 'P':\n\t\toutput[0] = 'B'\n\tcase 'V':\n\t\toutput[0] = 'F'\n\tcase 'K', 'Q':\n\t\toutput[0] = 'C'\n\tcase 'J':\n\t\toutput[0] = 'G'\n\tcase 'Z':\n\t\toutput[0] = 'S'\n\t}\n\n\treturn output\n}\n\n// Phonex computes the Phonex phonetic encoding of the input string. Phonex is\n// a modification of the venerable Soundex algorithm. It accounts for a few\n// more letter combinations to improve accuracy on some data sets.\n//\n// This implementation is based off of the original C implementation by the\n// creator - A. J. 
Lait - as found in his research paper entitled \"An\n// Assessment of Name Matching Algorithms.\"\nfunc Phonex(s1 string) string {\n\n\t// preprocess\n\ts1 = cleanInput(s1)\n\n\tinput := runestring(preProcess([]rune(s1)))\n\n\tresult := make([]rune, 0, len(input))\n\n\tlast := rune(0)\n\tcode := rune(0)\n\tfor i := 0; i < len(input) &&\n\t\tinput[i] != ' ' &&\n\t\tinput[i] != ',' &&\n\t\tlen(result) < 4; i++ {\n\t\tswitch input[i] {\n\t\tcase 'B', 'P', 'F', 'V':\n\t\t\tcode = '1'\n\t\tcase 'C', 'S', 'K', 'G', 'J', 'Q', 'X', 'Z':\n\t\t\tcode = '2'\n\t\tcase 'D', 'T':\n\t\t\tif input.SafeAt(i+1) != 'C' {\n\t\t\t\tcode = '3'\n\t\t\t}\n\t\tcase 'L':\n\t\t\tif isVowel(input.SafeAt(i+1)) || i == len(input)-1 {\n\t\t\t\tcode = '4'\n\t\t\t}\n\t\tcase 'M', 'N':\n\t\t\tnextChar := input.SafeAt(i + 1)\n\t\t\tif nextChar == 'D' || nextChar == 'G' {\n\t\t\t\t// ignore next character\n\t\t\t\ti++\n\t\t\t}\n\t\t\tcode = '5'\n\t\tcase 'R':\n\t\t\tif isVowel(input.SafeAt(i+1)) || i == len(input)-1 {\n\t\t\t\tcode = '6'\n\t\t\t}\n\t\tdefault:\n\t\t\tcode = 0\n\t\t}\n\n\t\tif last != code && code != 0 && i != 0 {\n\t\t\tresult = append(result, code)\n\t\t}\n\n\t\t// special case for 1st character: we use the actual character\n\t\tif i == 0 {\n\t\t\tresult = append(result, input[i])\n\t\t\tlast = code\n\t\t} else {\n\t\t\tlast = result[len(result)-1]\n\t\t}\n\t}\n\n\tfor len(result) < 4 {\n\t\tresult = append(result, '0')\n\t}\n\n\treturn string(result)\n}\n"
  },
  {
    "path": "phonex_test.go",
    "content": "package matchr\n\nimport \"testing\"\n\n// test cases from http://rosettacode.org/wiki/phonex#F.23\nvar phonextests = []struct {\n\ts1     string\n\tphonex string\n}{\n\t{\"123 testsss\", \"T230\"},\n\t{\"24/7 test\", \"T230\"},\n\t{\"A\", \"A000\"},\n\t{\"Lee\", \"L000\"},\n\t{\"Kuhne\", \"C500\"},\n\t{\"Meyer-Lansky\", \"M452\"},\n\t{\"Oepping\", \"A150\"},\n\t{\"Daley\", \"D400\"},\n\t{\"Dalitz\", \"D432\"},\n\t{\"Duhlitz\", \"D432\"},\n\t{\"Dull\", \"D400\"},\n\t{\"De Ledes\", \"D430\"},\n\t{\"Sandemann\", \"S500\"},\n\t{\"Schüßler\", \"S460\"},\n\t{\"Schmidt\", \"S530\"},\n\t{\"Sinatra\", \"S536\"},\n\t{\"Heinrich\", \"A562\"},\n\t{\"Hammerschlag\", \"A524\"},\n\t{\"Williams\", \"W450\"},\n\t{\"Wilms\", \"W500\"},\n\t{\"Wilson\", \"W250\"},\n\t{\"Worms\", \"W500\"},\n\t{\"Zedlitz\", \"S343\"},\n\t{\"Zotteldecke\", \"S320\"},\n\t{\"ZYX test\", \"S232\"},\n\t{\"Scherman\", \"S500\"},\n\t{\"Schurman\", \"S500\"},\n\t{\"Sherman\", \"S500\"},\n\t{\"Shermansss\", \"S500\"},\n\t{\"Shireman\", \"S650\"},\n\t{\"Shurman\", \"S500\"},\n\t{\"Euler\", \"A460\"},\n\t{\"Ellery\", \"A460\"},\n\t{\"Hilbert\", \"A130\"},\n\t{\"Heilbronn\", \"A165\"},\n\t{\"Gauss\", \"G000\"},\n\t{\"Ghosh\", \"G200\"},\n\t{\"Knuth\", \"N300\"},\n\t{\"Kant\", \"C530\"},\n\t{\"Lloyd\", \"L430\"},\n\t{\"Ladd\", \"L300\"},\n\t{\"Lukasiewicz\", \"L200\"},\n\t{\"Lissajous\", \"L200\"},\n\t{\"Ashcraft\", \"A261\"},\n\t{\"Philip\", \"F410\"},\n\t{\"Fripp\", \"F610\"},\n\t{\"Czarkowska\", \"C200\"},\n\t{\"Hornblower\", \"A514\"},\n\t{\"Looser\", \"L260\"},\n\t{\"Wright\", \"R230\"},\n\t{\"Phonic\", \"F520\"},\n\t{\"Quickening\", \"C250\"},\n\t{\"Kuickening\", \"C250\"},\n\t{\"Joben\", \"G150\"},\n\t{\"Zelda\", \"S300\"},\n\t{\"S\", \"0000\"},\n\t{\"H\", \"0000\"},\n\t{\"\", \"0000\"},\n}\n\n// phonex\nfunc TestPhonex(t *testing.T) {\n\tfor _, tt := range phonextests {\n\t\tphonex := Phonex(tt.s1)\n\t\tif phonex != tt.phonex {\n\t\t\tt.Errorf(\"Phonex('%s') = %v, want %v\", tt.s1, phonex, 
tt.phonex)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "runestring.go",
    "content": "package matchr\n\ntype runestring []rune\n\n// A safe way to index a runestring. It will return a null rune if you try\n// to index outside of the bounds of the runestring.\nfunc (r *runestring) SafeAt(pos int) rune {\n\tif pos < 0 || pos >= len(*r) {\n\t\treturn 0\n\t} else {\n\t\treturn (*r)[pos]\n\t}\n}\n\n// A safe way to obtain a substring of a runestring. It will return a null\n// string (\"\") if you index somewhere outside its bounds.\nfunc (r *runestring) SafeSubstr(pos int, length int) string {\n\tif pos < 0 || pos > len(*r) || (pos+length) > len(*r) {\n\t\treturn \"\"\n\t} else {\n\t\treturn string((*r)[pos : pos+length])\n\t}\n}\n\n// Delete characters at positions pos. It will do nothing if you provide\n// an index outside the bounds of the runestring.\nfunc (r *runestring) Del(pos ...int) {\n\tfor _, i := range pos {\n\t\tif i >= 0 && i <= len(*r) {\n\t\t\t*r = append((*r)[:i], (*r)[i+1:]...)\n\t\t}\n\t}\n}\n\n// A helper to determine if any substrings exist within the given runestring.\nfunc (r *runestring) Contains(start int, length int, criteria ...string) bool {\n\tsubstring := r.SafeSubstr(start, length)\n\tfor _, c := range criteria {\n\t\tif substring == c {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n"
  },
  {
    "path": "smithwaterman.go",
    "content": "package matchr\n\nconst GAP_COST = float64(0.5)\n\nfunc getCost(r1 []rune, r1Index int, r2 []rune, r2Index int) float64 {\n\tif r1[r1Index] == r2[r2Index] {\n\t\treturn 1.0\n\t} else {\n\t\treturn -2.0\n\t}\n}\n\n// SmithWaterman computes the Smith-Waterman local sequence alignment for the\n// two input strings. This was originally designed to find similar regions in\n// strings representing DNA or protein sequences.\nfunc SmithWaterman(s1 string, s2 string) float64 {\n\tvar cost float64\n\n\t// index by code point, not byte\n\tr1 := []rune(s1)\n\tr2 := []rune(s2)\n\n\tr1Len := len(r1)\n\tr2Len := len(r2)\n\n\tif r1Len == 0 {\n\t\treturn float64(r2Len)\n\t}\n\n\tif r2Len == 0 {\n\t\treturn float64(r1Len)\n\t}\n\n\td := make([][]float64, r1Len)\n\tfor i := range d {\n\t\td[i] = make([]float64, r2Len)\n\t}\n\n\tvar maxSoFar float64\n\tfor i := 0; i < r1Len; i++ {\n\t\t// substitution cost\n\t\tcost = getCost(r1, i, r2, 0)\n\t\tif i == 0 {\n\t\t\td[0][0] = max(0.0, max(-GAP_COST, cost))\n\t\t} else {\n\t\t\td[i][0] = max(0.0, max(d[i-1][0]-GAP_COST, cost))\n\t\t}\n\n\t\t// save if it is the biggest thus far\n\t\tif d[i][0] > maxSoFar {\n\t\t\tmaxSoFar = d[i][0]\n\t\t}\n\t}\n\n\tfor j := 0; j < r2Len; j++ {\n\t\t// substitution cost\n\t\tcost = getCost(r1, 0, r2, j)\n\t\tif j == 0 {\n\t\t\td[0][0] = max(0, max(-GAP_COST, cost))\n\t\t} else {\n\t\t\td[0][j] = max(0, max(d[0][j-1]-GAP_COST, cost))\n\t\t}\n\n\t\t// save if it is the biggest thus far\n\t\tif d[0][j] > maxSoFar {\n\t\t\tmaxSoFar = d[0][j]\n\t\t}\n\t}\n\n\tfor i := 1; i < r1Len; i++ {\n\t\tfor j := 1; j < r2Len; j++ {\n\t\t\tcost = getCost(r1, i, r2, j)\n\n\t\t\t// find the lowest cost\n\t\t\td[i][j] = max(\n\t\t\t\tmax(0, d[i-1][j]-GAP_COST),\n\t\t\t\tmax(d[i][j-1]-GAP_COST, d[i-1][j-1]+cost))\n\n\t\t\t// save if it is the biggest thus far\n\t\t\tif d[i][j] > maxSoFar {\n\t\t\t\tmaxSoFar = d[i][j]\n\t\t\t}\n\t\t}\n\t}\n\n\treturn maxSoFar\n}\n"
  },
  {
    "path": "smithwaterman_test.go",
    "content": "package matchr\n\nimport \"testing\"\n\nvar swtests = []struct {\n\ts1   string\n\ts2   string\n\tdist float64\n}{\n\t// insertion\n\t{\"car\", \"cars\", 3.0},\n\t// substitution\n\t{\"library\", \"librari\", 6.0},\n\t// deletion\n\t{\"library\", \"librar\", 6.0},\n\t// transposition\n\t{\"library\", \"librayr\", 5.5},\n\t// one empty, left\n\t{\"\", \"library\", 7.0},\n\t// one empty, right\n\t{\"library\", \"\", 7.0},\n\t// two empties\n\t{\"\", \"\", 0.0},\n\t// unicode stuff!\n\t{\"Schüßler\", \"Schübler\", 6.0},\n\t{\"Ant Zucaro\", \"Anthony Zucaro\", 8.0},\n\t{\"Schüßler\", \"Schüßler\", 8.0},\n\t{\"Schßüler\", \"Schüßler\", 6.0},\n\t{\"Schüßler\", \"Schüler\", 6.5},\n\t{\"Schüßler\", \"Schüßlers\", 8.0},\n}\n\n// Smith-Waterman\nfunc TestSmithWaterman(t *testing.T) {\n\tfor _, tt := range swtests {\n\t\tdist := SmithWaterman(tt.s1, tt.s2)\n\t\tif dist != tt.dist {\n\t\t\tt.Errorf(\"SmithWaterman('%s', '%s') = %v, want %v\", tt.s1, tt.s2, dist, tt.dist)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "soundex.go",
    "content": "package matchr\n\nimport \"strings\"\n\n// Soundex computes the Soundex phonetic representation of the input string. It\n// attempts to encode homophones with the same characters. More information can\n// be found at http://en.wikipedia.org/wiki/Soundex.\nfunc Soundex(s1 string) string {\n\tif len(s1) == 0 {\n\t\treturn \"\"\n\t}\n\n\t// we should work with all uppercase\n\ts1 = strings.ToUpper(s1)\n\n\tinput := NewString(s1)\n\n\t// the encoded value\n\tenc := input.Slice(0, 1)\n\n\tc := \"\"\n\tprev := \"\"\n\thw := false\n\n\tfor i := 0; i < input.RuneCount(); i++ {\n\t\tswitch rune(input.At(i)) {\n\t\tcase 'B', 'F', 'P', 'V':\n\t\t\tc = \"1\"\n\t\tcase 'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z':\n\t\t\tc = \"2\"\n\t\tcase 'D', 'T':\n\t\t\tc = \"3\"\n\t\tcase 'L':\n\t\t\tc = \"4\"\n\t\tcase 'M', 'N':\n\t\t\tc = \"5\"\n\t\tcase 'R':\n\t\t\tc = \"6\"\n\t\tcase 'H', 'W':\n\t\t\thw = true\n\t\tdefault:\n\t\t\tc = \"\"\n\t\t}\n\n\t\t// don't encode the first position, but we need its code value\n\t\t// to prevent repeats\n\t\tif c != \"\" && c != prev && i > 0 {\n\t\t\t// if the next encoded digit is different, we can add it right away\n\t\t\t// if it is the same, though, it must not have been preceded\n\t\t\t// by an 'H' or a 'W'\n\t\t\tif enc[len(enc)-1:len(enc)] != c || !hw {\n\t\t\t\tenc = enc + c\n\t\t\t}\n\n\t\t\t// we're done when we reach four encoded characters\n\t\t\tif len(enc) == 4 {\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\n\t\tprev = c\n\t\thw = false\n\t}\n\n\t// if we've fallen short of 4 \"real\" encoded characters,\n\t// it gets padded with zeros\n\tfor len(enc) < 4 {\n\t\tenc = enc + \"0\"\n\t}\n\n\treturn enc\n}\n"
  },
  {
    "path": "soundex_test.go",
    "content": "package matchr\n\nimport \"testing\"\n\n// test cases from http://rosettacode.org/wiki/Soundex#F.23\nvar soundextests = []struct {\n\ts1      string\n\tsoundex string\n}{\n\t{\"Ashcraft\", \"A261\"},\n\t{\"Ashhhcraft\", \"A261\"},\n\t{\"Ashcroft\", \"A261\"},\n\t{\"Burroughs\", \"B620\"},\n\t{\"Burrows\", \"B620\"},\n\t{\"Ekzampul\", \"E251\"},\n\t{\"Example\", \"E251\"},\n\t{\"Ellery\", \"E460\"},\n\t{\"Euler\", \"E460\"},\n\t{\"Ghosh\", \"G200\"},\n\t{\"Gauss\", \"G200\"},\n\t{\"Gutierrez\", \"G362\"},\n\t{\"Heilbronn\", \"H416\"},\n\t{\"Hilbert\", \"H416\"},\n\t{\"Jackson\", \"J250\"},\n\t{\"Kant\", \"K530\"},\n\t{\"Knuth\", \"K530\"},\n\t{\"Lee\", \"L000\"},\n\t{\"Lukasiewicz\", \"L222\"},\n\t{\"Lissajous\", \"L222\"},\n\t{\"Ladd\", \"L300\"},\n\t{\"Lloyd\", \"L300\"},\n\t{\"Moses\", \"M220\"},\n\t{\"O'Hara\", \"O600\"},\n\t{\"Pfister\", \"P236\"},\n\t{\"Rubin\", \"R150\"},\n\t{\"Robert\", \"R163\"},\n\t{\"Rupert\", \"R163\"},\n\t{\"Soundex\", \"S532\"},\n\t{\"Sownteks\", \"S532\"},\n\t{\"Tymczak\", \"T522\"},\n\t{\"VanDeusen\", \"V532\"},\n\t{\"Washington\", \"W252\"},\n\t{\"Wheaton\", \"W350\"},\n}\n\n// Soundex\nfunc TestSoundex(t *testing.T) {\n\tfor _, tt := range soundextests {\n\t\tsoundex := Soundex(tt.s1)\n\t\tif soundex != tt.soundex {\n\t\t\tt.Errorf(\"Soundex('%s') = %v, want %v\", tt.s1, soundex, tt.soundex)\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "utf8.go",
    "content": "// Copyright 2009 The Go Authors. All rights reserved.\n// Use of this source code is governed by a BSD-style\n// license that can be found in the LICENSE file.\n\npackage matchr\n\nimport (\n\t\"errors\"\n\t\"unicode/utf8\"\n)\n\n// String wraps a regular string with a small structure that provides more\n// efficient indexing by code point index, as opposed to byte index.\n// Scanning incrementally forwards or backwards is O(1) per index operation\n// (although not as fast a range clause going forwards).  Random access is\n// O(N) in the length of the string, but the overhead is less than always\n// scanning from the beginning.\n// If the string is ASCII, random access is O(1).\n// Unlike the built-in string type, String has internal mutable state and\n// is not thread-safe.\ntype String struct {\n\tstr      string\n\tnumRunes int\n\t// If width > 0, the rune at runePos starts at bytePos and has the specified width.\n\twidth    int\n\tbytePos  int\n\trunePos  int\n\tnonASCII int // byte index of the first non-ASCII rune.\n}\n\n// NewString returns a new UTF-8 string with the provided contents.\nfunc NewString(contents string) *String {\n\treturn new(String).Init(contents)\n}\n\n// Init initializes an existing String to hold the provided contents.\n// It returns a pointer to the initialized String.\nfunc (s *String) Init(contents string) *String {\n\ts.str = contents\n\ts.bytePos = 0\n\ts.runePos = 0\n\tfor i := 0; i < len(contents); i++ {\n\t\tif contents[i] >= utf8.RuneSelf {\n\t\t\t// Not ASCII.\n\t\t\ts.numRunes = utf8.RuneCountInString(contents)\n\t\t\t_, s.width = utf8.DecodeRuneInString(contents)\n\t\t\ts.nonASCII = i\n\t\t\treturn s\n\t\t}\n\t}\n\t// ASCII is simple.  Also, the empty string is ASCII.\n\ts.numRunes = len(contents)\n\ts.width = 0\n\ts.nonASCII = len(contents)\n\treturn s\n}\n\n// String returns the contents of the String.  
This method also means the\n// String is directly printable by fmt.Print.\nfunc (s *String) String() string {\n\treturn s.str\n}\n\n// RuneCount returns the number of runes (Unicode code points) in the String.\nfunc (s *String) RuneCount() int {\n\treturn s.numRunes\n}\n\n// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.\nfunc (s *String) IsASCII() bool {\n\treturn s.width == 0\n}\n\n// Slice returns the string sliced at rune positions [i:j].\nfunc (s *String) Slice(i, j int) string {\n\t// ASCII is easy.  Let the compiler catch the indexing error if there is one.\n\tif j < s.nonASCII {\n\t\treturn s.str[i:j]\n\t}\n\tif i < 0 || j > s.numRunes || i > j {\n\t\tpanic(errors.New(\"utf8.String: slice index out of range\"))\n\t}\n\tif i == j {\n\t\treturn \"\"\n\t}\n\t// For non-ASCII, after At(i), bytePos is always the position of the indexed character.\n\tvar low, high int\n\tswitch {\n\tcase i < s.nonASCII:\n\t\tlow = i\n\tcase i == s.numRunes:\n\t\tlow = len(s.str)\n\tdefault:\n\t\ts.At(i)\n\t\tlow = s.bytePos\n\t}\n\tswitch {\n\tcase j == s.numRunes:\n\t\thigh = len(s.str)\n\tdefault:\n\t\ts.At(j)\n\t\thigh = s.bytePos\n\t}\n\treturn s.str[low:high]\n}\n\n// At returns the rune with index i in the String.  The sequence of runes is the same\n// as iterating over the contents with a \"for range\" clause.\nfunc (s *String) At(i int) int {\n\t// ASCII is easy.  
Let the compiler catch the indexing error if there is one.\n\tif i < s.nonASCII {\n\t\treturn int(s.str[i])\n\t}\n\n\t// Now we do need to know the index is valid.\n\tif i < 0 || i >= s.numRunes {\n\t\tpanic(errors.New(\"utf8.String: index out of range\"))\n\t}\n\n\tvar r rune\n\n\t// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.\n\t// With these cases, all scans from beginning or end work in O(1) time per rune.\n\tswitch {\n\n\tcase i == s.runePos-1: // backing up one rune\n\t\tr, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])\n\t\ts.runePos = i\n\t\ts.bytePos -= s.width\n\t\treturn int(r)\n\tcase i == s.runePos+1: // moving ahead one rune\n\t\ts.runePos = i\n\t\ts.bytePos += s.width\n\t\tfallthrough\n\tcase i == s.runePos:\n\t\tr, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])\n\t\treturn int(r)\n\tcase i == 0: // start of string\n\t\tr, s.width = utf8.DecodeRuneInString(s.str)\n\t\ts.runePos = 0\n\t\ts.bytePos = 0\n\t\treturn int(r)\n\n\tcase i == s.numRunes-1: // last rune in string\n\t\tr, s.width = utf8.DecodeLastRuneInString(s.str)\n\t\ts.runePos = i\n\t\ts.bytePos = len(s.str) - s.width\n\t\treturn int(r)\n\t}\n\n\t// We need to do a linear scan.  There are three places to start from:\n\t// 1) The beginning\n\t// 2) bytePos/runePos.\n\t// 3) The end\n\t// Choose the closest in rune count, scanning backwards if necessary.\n\tforward := true\n\tif i < s.runePos {\n\t\t// Between beginning and pos.  Which is closer?\n\t\t// Since both i and runePos are guaranteed >= nonASCII, that's the\n\t\t// lowest location we need to start from.\n\t\tif i < (s.runePos-s.nonASCII)/2 {\n\t\t\t// Scan forward from beginning\n\t\t\ts.bytePos, s.runePos = s.nonASCII, s.nonASCII\n\t\t} else {\n\t\t\t// Scan backwards from where we are\n\t\t\tforward = false\n\t\t}\n\t} else {\n\t\t// Between pos and end.  
Which is closer?\n\t\tif i-s.runePos < (s.numRunes-s.runePos)/2 {\n\t\t\t// Scan forward from pos\n\t\t} else {\n\t\t\t// Scan backwards from end\n\t\t\ts.bytePos, s.runePos = len(s.str), s.numRunes\n\t\t\tforward = false\n\t\t}\n\t}\n\tif forward {\n\t\t// TODO: Is it much faster to use a range loop for this scan?\n\t\tfor {\n\t\t\tr, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])\n\t\t\tif s.runePos == i {\n\t\t\t\tbreak\n\t\t\t}\n\t\t\ts.runePos++\n\t\t\ts.bytePos += s.width\n\t\t}\n\t} else {\n\t\tfor {\n\t\t\tr, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])\n\t\t\ts.runePos--\n\t\t\ts.bytePos -= s.width\n\t\t\tif s.runePos == i {\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\t}\n\treturn int(r)\n}\n\n// We want the panic in At(i) to satisfy os.Error, because that's what\n// runtime panics satisfy, but we can't import os.  This is our solution.\n\n// error is the type of the error returned if a user calls String.At(i) with i out of range.\n// It satisfies os.Error and runtime.Error.\n// type error string\n\n/*\nfunc (err error) String() string {\n\treturn string(err)\n}\n\nfunc (err error) RunTimeError() {\n}\n*/\n"
  },
  {
    "path": "util.go",
    "content": "package matchr\n\nimport (\n\t\"math\"\n\t\"strings\"\n)\n\n// min of two integers\nfunc min(a int, b int) (res int) {\n\tif a < b {\n\t\tres = a\n\t} else {\n\t\tres = b\n\t}\n\n\treturn\n}\n\n// max of two integers\nfunc maxI(a int, b int) (res int) {\n\tif a < b {\n\t\tres = b\n\t} else {\n\t\tres = a\n\t}\n\n\treturn\n}\n\n// max of two float64s\nfunc max(a float64, b float64) (res float64) {\n\tif a < b {\n\t\tres = b\n\t} else {\n\t\tres = a\n\t}\n\n\treturn\n}\n\n// is this string index outside of the ASCII numeric code points?\nfunc nan(c rune) bool {\n\treturn ((c > 57) || (c < 48))\n}\n\n// Round a float64 to the given precision\n//\n// http://play.golang.org/p/S654PxAe_N\n//\n// (via Rory McGuire at\n// https://groups.google.com/forum/#!topic/golang-nuts/ITZV08gAugI)\nfunc round(x float64, prec int) float64 {\n\tif math.IsNaN(x) || math.IsInf(x, 0) {\n\t\treturn x\n\t}\n\n\tsign := 1.0\n\tif x < 0 {\n\t\tsign = -1\n\t\tx *= -1\n\t}\n\n\tvar rounder float64\n\tpow := math.Pow(10, float64(prec))\n\tintermed := x * pow\n\t_, frac := math.Modf(intermed)\n\n\tif frac >= 0.5 {\n\t\trounder = math.Ceil(intermed)\n\t} else {\n\t\trounder = math.Floor(intermed)\n\t}\n\n\treturn rounder / pow * sign\n}\n\n// A helper to determine if any substrings exist within the given string\nfunc contains(value *String, start int, length int, criteria ...string) bool {\n\tsubstring := substring(value, start, length)\n\tfor _, c := range criteria {\n\t\tif substring == c {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\n// A fault-tolerant version of Slice. It will return nothing (\"\") if the index\n// is out of bounds. 
This allows taking substrings without bounds-checking\n// every time.\nfunc substring(value *String, start int, length int) string {\n\tif start >= 0 && start+length <= value.RuneCount() {\n\t\treturn value.Slice(start, start+length)\n\t} else {\n\t\treturn \"\"\n\t}\n}\n\nfunc isVowel(c rune) bool {\n\tswitch c {\n\tcase 'A', 'E', 'I', 'O', 'U', 'Y':\n\t\treturn true\n\tdefault:\n\t\treturn false\n\t}\n}\n\nfunc isVowelNoY(c rune) bool {\n\tswitch c {\n\tcase 'A', 'E', 'I', 'O', 'U':\n\t\treturn true\n\tdefault:\n\t\treturn false\n\t}\n}\n\nfunc cleanInput(input string) string {\n\treturn strings.ToUpper(strings.TrimSpace(input))\n}\n"
  }
]