[
  {
    "path": ".travis.yml",
    "content": "language: go\ngo:\n    - 1.11\n    - 1.12\n    - 1.13\n    - 1.14.x\n    - master\nscript:\n    - cd tests && make\n"
  },
  {
    "path": "LICENSE",
    "content": "Copyright (C) 2016 Felipe da Cunha Gonçalves\nAll Rights Reserved.\n\nMIT LICENSE\n\nPermission is hereby granted, free of charge, to any person obtaining a copy of\nthis software and associated documentation files (the \"Software\"), to deal in\nthe Software without restriction, including without limitation the rights to\nuse, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of\nthe Software, and to permit persons to whom the Software is furnished to do so,\nsubject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS\nFOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR\nCOPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER\nIN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN\nCONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "[![Build Status](https://travis-ci.org/xrash/smetrics.svg?branch=master)](http://travis-ci.org/xrash/smetrics)\n\n# smetrics\n\n`smetrics` is \"string metrics\".\n\nPackage smetrics provides a bunch of algorithms for calculating the distance between strings.\n\nThere are implementations for calculating the popular Levenshtein distance (aka Edit Distance or Wagner-Fischer), as well as the Jaro distance, the Jaro-Winkler distance, and more.\n\n# How to import\n\n```go\nimport \"github.com/xrash/smetrics\"\n```\n\n# Documentation\n\nGo to [https://pkg.go.dev/github.com/xrash/smetrics](https://pkg.go.dev/github.com/xrash/smetrics) for complete documentation.\n\n# Example\n\n```go\npackage main\n\nimport (\n\t\"github.com/xrash/smetrics\"\n)\n\nfunc main() {\n\tsmetrics.WagnerFischer(\"POTATO\", \"POTATTO\", 1, 1, 2)\n\tsmetrics.WagnerFischer(\"MOUSE\", \"HOUSE\", 2, 2, 4)\n\n\tsmetrics.Ukkonen(\"POTATO\", \"POTATTO\", 1, 1, 2)\n\tsmetrics.Ukkonen(\"MOUSE\", \"HOUSE\", 2, 2, 4)\n\n\tsmetrics.Jaro(\"AL\", \"AL\")\n\tsmetrics.Jaro(\"MARTHA\", \"MARHTA\")\n\n\tsmetrics.JaroWinkler(\"AL\", \"AL\", 0.7, 4)\n\tsmetrics.JaroWinkler(\"MARTHA\", \"MARHTA\", 0.7, 4)\n\n\tsmetrics.Soundex(\"Euler\")\n\tsmetrics.Soundex(\"Ellery\")\n\n\tsmetrics.Hamming(\"aaa\", \"aaa\")\n\tsmetrics.Hamming(\"aaa\", \"aab\")\n}\n```\n"
  },
  {
    "path": "doc.go",
    "content": "/*\nPackage smetrics provides a bunch of algorithms for calculating\nthe distance between strings.\n\nThere are implementations for calculating the popular Levenshtein\ndistance (aka Edit Distance or Wagner-Fischer), as well as the Jaro\ndistance, the Jaro-Winkler distance, and more.\n\nFor the Levenshtein distance, you can use the functions WagnerFischer()\nand Ukkonen(). Read the documentation on these functions.\n\nFor the Jaro and Jaro-Winkler algorithms, check the functions\nJaro() and JaroWinkler(). Read the documentation on these functions.\n\nFor the Soundex algorithm, check the function Soundex().\n\nFor the Hamming distance algorithm, check the function Hamming().\n*/\npackage smetrics\n"
  },
  {
    "path": "go.mod",
    "content": "module github.com/xrash/smetrics\n\ngo 1.15\n"
  },
  {
    "path": "hamming.go",
    "content": "package smetrics\n\nimport (\n\t\"fmt\"\n)\n\n// The Hamming distance is the minimum number of substitutions required to change string A into string B. Both strings must have the same size. If the strings have different sizes, the function returns an error.\nfunc Hamming(a, b string) (int, error) {\n\tal := len(a)\n\tbl := len(b)\n\n\tif al != bl {\n\t\treturn -1, fmt.Errorf(\"strings are not equal (len(a)=%d, len(b)=%d)\", al, bl)\n\t}\n\n\tvar difference = 0\n\n\tfor i := range a {\n\t\tif a[i] != b[i] {\n\t\t\tdifference = difference + 1\n\t\t}\n\t}\n\n\treturn difference, nil\n}\n"
  },
  {
    "path": "jaro-winkler.go",
    "content": "package smetrics\n\nimport (\n\t\"math\"\n)\n\n// The Jaro-Winkler distance. The result is 1 for equal strings, and 0 for completely different strings. It is commonly used in record linkage, so it tries to be accurate for common typos when writing real names such as person names and street names.\n// Jaro-Winkler is a modification of the Jaro algorithm. It works by first running Jaro, then boosting the score of exact matches at the beginning of the strings. Because of that, it introduces two more parameters: the boostThreshold and the prefixSize. These are commonly set to 0.7 and 4, respectively.\nfunc JaroWinkler(a, b string, boostThreshold float64, prefixSize int) float64 {\n\tj := Jaro(a, b)\n\n\tif j <= boostThreshold {\n\t\treturn j\n\t}\n\n\tprefixSize = int(math.Min(float64(len(a)), math.Min(float64(prefixSize), float64(len(b)))))\n\n\tvar prefixMatch float64\n\tfor i := 0; i < prefixSize; i++ {\n\t\tif a[i] == b[i] {\n\t\t\tprefixMatch++\n\t\t} else {\n\t\t\tbreak\n\t\t}\n\t}\n\n\treturn j + 0.1*prefixMatch*(1.0-j)\n}\n"
  },
  {
    "path": "jaro.go",
    "content": "package smetrics\n\nimport (\n\t\"math\"\n)\n\n// The Jaro distance. The result is 1 for equal strings, and 0 for completely different strings.\nfunc Jaro(a, b string) float64 {\n\t// If both strings are zero-length, they are completely equal,\n\t// therefore return 1.\n\tif len(a) == 0 && len(b) == 0 {\n\t\treturn 1\n\t}\n\n\t// If one string is zero-length, strings are completely different,\n\t// therefore return 0.\n\tif len(a) == 0 || len(b) == 0 {\n\t\treturn 0\n\t}\n\n\t// Define the necessary variables for the algorithm.\n\tla := float64(len(a))\n\tlb := float64(len(b))\n\tmatchRange := int(math.Max(0, math.Floor(math.Max(la, lb)/2.0)-1))\n\tmatchesA := make([]bool, len(a))\n\tmatchesB := make([]bool, len(b))\n\tvar matches float64 = 0\n\n\t// Step 1: Matches\n\t// Loop through each character of the first string,\n\t// looking for a matching character in the second string.\n\tfor i := 0; i < len(a); i++ {\n\t\tstart := int(math.Max(0, float64(i-matchRange)))\n\t\tend := int(math.Min(lb-1, float64(i+matchRange)))\n\n\t\tfor j := start; j <= end; j++ {\n\t\t\tif matchesB[j] {\n\t\t\t\tcontinue\n\t\t\t}\n\n\t\t\tif a[i] == b[j] {\n\t\t\t\tmatchesA[i] = true\n\t\t\t\tmatchesB[j] = true\n\t\t\t\tmatches++\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\t}\n\n\t// If there are no matches, strings are completely different,\n\t// therefore return 0.\n\tif matches == 0 {\n\t\treturn 0\n\t}\n\n\t// Step 2: Transpositions\n\t// Loop through the matches' arrays, looking for\n\t// unaligned matches. Count the number of unaligned matches.\n\tunaligned := 0\n\tj := 0\n\tfor i := 0; i < len(a); i++ {\n\t\tif !matchesA[i] {\n\t\t\tcontinue\n\t\t}\n\n\t\tfor !matchesB[j] {\n\t\t\tj++\n\t\t}\n\n\t\tif a[i] != b[j] {\n\t\t\tunaligned++\n\t\t}\n\n\t\tj++\n\t}\n\n\t// The number of unaligned matches divided by two, is the number of _transpositions_.\n\ttranspositions := math.Floor(float64(unaligned) / 2)\n\n\t// Jaro distance is the average between these three numbers:\n\t// 1. 
matches / length of string A\n\t// 2. matches / length of string B\n\t// 3. (matches - transpositions) / matches\n\t// So, all that divided by three is the final result.\n\treturn ((matches / la) + (matches / lb) + ((matches - transpositions) / matches)) / 3.0\n}\n"
  },
  {
    "path": "soundex.go",
    "content": "package smetrics\n\nimport (\n\t\"strings\"\n)\n\n// The Soundex encoding. It is a phonetic algorithm that considers how the words sound in English. Soundex maps a string to a 4-byte code consisting of the first letter of the original string and three numbers. Strings that sound similar should map to the same code.\nfunc Soundex(s string) string {\n\tb := strings.Builder{}\n\tb.Grow(4)\n\n\tp := s[0]\n\tif p <= 'z' && p >= 'a' {\n\t\tp -= 32 // convert to uppercase\n\t}\n\tb.WriteByte(p)\n\n\tn := 0\n\tfor i := 1; i < len(s); i++ {\n\t\tc := s[i]\n\n\t\tif c <= 'z' && c >= 'a' {\n\t\t\tc -= 32 // convert to uppercase\n\t\t} else if c < 'A' || c > 'Z' {\n\t\t\tcontinue\n\t\t}\n\n\t\tif c == p {\n\t\t\tcontinue\n\t\t}\n\n\t\tp = c\n\n\t\tswitch c {\n\t\tcase 'B', 'P', 'F', 'V':\n\t\t\tc = '1'\n\t\tcase 'C', 'S', 'K', 'G', 'J', 'Q', 'X', 'Z':\n\t\t\tc = '2'\n\t\tcase 'D', 'T':\n\t\t\tc = '3'\n\t\tcase 'L':\n\t\t\tc = '4'\n\t\tcase 'M', 'N':\n\t\t\tc = '5'\n\t\tcase 'R':\n\t\t\tc = '6'\n\t\tdefault:\n\t\t\tcontinue\n\t\t}\n\n\t\tb.WriteByte(c)\n\t\tn++\n\t\tif n == 3 {\n\t\t\tbreak\n\t\t}\n\t}\n\n\tfor i := n; i < 3; i++ {\n\t\tb.WriteByte('0')\n\t}\n\n\treturn b.String()\n}\n"
  },
  {
    "path": "tests/Makefile",
    "content": ".PHONY : test\ntest :\n\tgo test -v\n\n.PHONY : gdb\ngdb :\n\tgo test -c -s -N -l\n\tgdb ./tests.test\n"
  },
  {
    "path": "tests/hamming_test.go",
    "content": "package tests\n\nimport (\n\t\"fmt\"\n\t\"github.com/xrash/smetrics\"\n\t\"testing\"\n)\n\nfunc TestHamming(t *testing.T) {\n\tcases := []hammingcase{\n\t\t{\"a\", \"a\", 0},\n\t\t{\"a\", \"b\", 1},\n\t\t{\"AAAA\", \"AABB\", 2},\n\t\t{\"BAAA\", \"AAAA\", 1},\n\t\t{\"BAAA\", \"CCCC\", 4},\n\t\t{\"karolin\", \"kathrin\", 3},\n\t\t{\"karolin\", \"kerstin\", 3},\n\t\t{\"1011101\", \"1001001\", 2},\n\t\t{\"2173896\", \"2233796\", 3},\n\t}\n\n\tfor _, c := range cases {\n\t\tr, err := smetrics.Hamming(c.a, c.b)\n\t\tif err != nil {\n\t\t\tt.Fatalf(\"got error from hamming err=%s\", err)\n\t\t}\n\t\tif r != c.diff {\n\t\t\tfmt.Println(r, \"instead of\", c.diff)\n\t\t\tt.Fail()\n\t\t}\n\t}\n}\n\nfunc TestHammingError(t *testing.T) {\n\tres, err := smetrics.Hamming(\"a\", \"bbb\")\n\tif err == nil {\n\t\tt.Fatalf(\"expected error from 'a' and 'bbb' on hamming\")\n\t}\n\tif res != -1 {\n\t\tt.Fatalf(\"erroring response wasn't -1, but %d\", res)\n\t}\n}\n"
  },
  {
    "path": "tests/jaro-winkler_test.go",
    "content": "package tests\n\nimport (\n\t\"fmt\"\n\t\"github.com/xrash/smetrics\"\n\t\"testing\"\n)\n\nfunc TestJaroWinkler(t *testing.T) {\n\tfor _, c := range __jaro_winkler_cases {\n\t\tr := smetrics.JaroWinkler(c.a, c.b, 0.7, 4)\n\t\tresult := fmt.Sprintf(\"%.3f\", r)\n\t\texpected := fmt.Sprintf(\"%.3f\", c.r)\n\t\tif result != expected {\n\t\t\tfmt.Println(c.a, c.b, result, \"instead of\", expected)\n\t\t\tt.Fail()\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "tests/jaro_test.go",
    "content": "package tests\n\nimport (\n\t\"fmt\"\n\t\"github.com/xrash/smetrics\"\n\t\"testing\"\n)\n\nfunc TestJaro(t *testing.T) {\n\tfor _, c := range __jaro_cases {\n\t\tr := smetrics.Jaro(c.a, c.b)\n\t\tresult := fmt.Sprintf(\"%.3f\", r)\n\t\texpected := fmt.Sprintf(\"%.3f\", c.r)\n\t\tif result != expected {\n\t\t\tfmt.Println(c.a, c.b, result, \"instead of\", expected)\n\t\t\tt.Fail()\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "tests/soundex_test.go",
    "content": "package tests\n\nimport (\n\t\"fmt\"\n\t\"github.com/xrash/smetrics\"\n\t\"testing\"\n)\n\nfunc TestSoundex(t *testing.T) {\n\tcases := []soundexcase{\n\t\t{\"Euler\", \"E460\"},\n\t\t{\"Ellery\", \"E460\"},\n\t\t{\"Gauss\", \"G200\"},\n\t\t{\"Ghosh\", \"G200\"},\n\t\t{\"Hilbert\", \"H416\"},\n\t\t{\"Heilbrohn\", \"H416\"},\n\t\t{\"Knuth\", \"K530\"},\n\t\t{\"Kant\", \"K530\"},\n\t\t{\"Lloyd\", \"L300\"},\n\t\t{\"Ladd\", \"L300\"},\n\t\t{\"Lukasiewicz\", \"L222\"},\n\t\t{\"Lissjous\", \"L222\"},\n\t\t{\"Ravi\", \"R100\"},\n\t\t{\"Ravee\", \"R100\"},\n\t}\n\n\tfor _, c := range cases {\n\t\tif r := smetrics.Soundex(c.s); r != c.t {\n\t\t\tfmt.Println(r, \"instead of\", c.t, \"for\", c.s)\n\t\t\tt.Fail()\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "tests/testcases.go",
    "content": "package tests\n\ntype levenshteincase struct {\n\ts     string\n\tt     string\n\ticost int\n\tdcost int\n\tscost int\n\tr     int\n}\n\ntype soundexcase struct {\n\ts string\n\tt string\n}\n\ntype hammingcase struct {\n\ta    string\n\tb    string\n\tdiff int\n}\n\ntype jarocase struct {\n\ta string\n\tb string\n\tr float64\n}\n\nvar __jaro_cases = []*jarocase{\n\t{a: \"SHACKLEFORD\", b: \"SHACKELFORD\", r: 0.970},\n\t{a: \"DUNNINGHAM\", b: \"CUNNIGHAM\", r: 0.896},\n\t{a: \"NICHLESON\", b: \"NICHULSON\", r: 0.926},\n\t{a: \"JONES\", b: \"JOHNSON\", r: 0.790},\n\t{a: \"MASSEY\", b: \"MASSIE\", r: 0.889},\n\t{a: \"ABROMS\", b: \"ABRAMS\", r: 0.889},\n\t{a: \"HARDIN\", b: \"MARTINEZ\", r: 0.722},\n\t{a: \"ITMAN\", b: \"SMITH\", r: 0.467},\n\t{a: \"JERALDINE\", b: \"GERALDINE\", r: 0.926},\n\t{a: \"MARHTA\", b: \"MARTHA\", r: 0.944},\n\t{a: \"MICHELLE\", b: \"MICHAEL\", r: 0.869},\n\t{a: \"JULIES\", b: \"JULIUS\", r: 0.889},\n\t{a: \"TANYA\", b: \"TONYA\", r: 0.867},\n\t{a: \"DWAYNE\", b: \"DUANE\", r: 0.822},\n\t{a: \"SEAN\", b: \"SUSAN\", r: 0.783},\n\t{a: \"JON\", b: \"JOHN\", r: 0.917},\n\t//\t{a: \"JON\", b: \"JAN\", r: 0.000},\n\t{a: \"BROOKHAVEN\", b: \"BRROKHAVEN\", r: 0.933},\n\t{a: \"BROOK HALLOW\", b: \"BROOK HLLW\", r: 0.944},\n\t{a: \"DECATUR\", b: \"DECATIR\", r: 0.905},\n\t{a: \"FITZRUREITER\", b: \"FITZENREITER\", r: 0.856},\n\t{a: \"HIGBEE\", b: \"HIGHEE\", r: 0.889},\n\t{a: \"HIGBEE\", b: \"HIGVEE\", r: 0.889},\n\t{a: \"LACURA\", b: \"LOCURA\", r: 0.889},\n\t{a: \"IOWA\", b: \"IONA\", r: 0.833},\n\t//\t{a: \"1ST\", b: \"IST\", r: 0.000},\n\n\t// Equal strings.\n\t{a: \"\", b: \"\", r: 1.000},\n\t{a: \"A\", b: \"A\", r: 1.000},\n\t{a: \"AA\", b: \"AA\", r: 1.000},\n\t{a: \"AAA\", b: \"AAA\", r: 1.000},\n\t{a: \"AAAA\", b: \"AAAA\", r: 1.000},\n\t{a: \"AAAAA\", b: \"AAAAA\", r: 1.000},\n\t{a: \"AAAAAA\", b: \"AAAAAA\", r: 1.000},\n\t{\n\t\ta: \"Legend of the Galactic Heroes\",\n\t\tb: \"Legend of the Galactic Heroes\",\n\t\tr: 
1.000,\n\t},\n\t{\n\t\ta: \"Home is the place where, when you have to go there, they have to take you in.\",\n\t\tb: \"Home is the place where, when you have to go there, they have to take you in.\",\n\t\tr: 1.000,\n\t},\n\t{\n\t\ta: \"Pedro de Alcântara João Carlos Leopoldo Salvador Bibiano Francisco Xavier de Paula Leocádio Miguel Gabriel Rafael Gonzaga de Habsburgo-Lorena e Bragança\",\n\t\tb: \"Pedro de Alcântara João Carlos Leopoldo Salvador Bibiano Francisco Xavier de Paula Leocádio Miguel Gabriel Rafael Gonzaga de Habsburgo-Lorena e Bragança\",\n\t\tr: 1.000,\n\t},\n\t{\n\t\ta: \"Et tu, Brute\",\n\t\tb: \"Et tu, Brute\",\n\t\tr: 1.000,\n\t},\n\n\t// Completely different strings.\n\t{a: \"\", b: \"A\", r: 0.000},\n\t{a: \"\", b: \"AA\", r: 0.000},\n\t{a: \"\", b: \"AAA\", r: 0.000},\n\t{a: \"\", b: \"AAAA\", r: 0.000},\n\t{a: \"\", b: \"AAAAA\", r: 0.000},\n\t{a: \"A\", b: \"\", r: 0.000},\n\t{a: \"AA\", b: \"\", r: 0.000},\n\t{a: \"AAA\", b: \"\", r: 0.000},\n\t{a: \"AAAA\", b: \"\", r: 0.000},\n\t{a: \"AAAAA\", b: \"\", r: 0.000},\n\t{a: \"A\", b: \"B\", r: 0.000},\n\t{a: \"AA\", b: \"BB\", r: 0.000},\n\t{a: \"AAA\", b: \"BBB\", r: 0.000},\n\t{a: \"AAAA\", b: \"BBBB\", r: 0.000},\n\t{a: \"AAAAa\", b: \"BBBBB\", r: 0.000},\n}\n\nvar __jaro_winkler_cases = []*jarocase{\n\t{a: \"SHACKLEFORD\", b: \"SHACKELFORD\", r: 0.982},\n\t{a: \"DUNNINGHAM\", b: \"CUNNIGHAM\", r: 0.896},\n\t{a: \"NICHLESON\", b: \"NICHULSON\", r: 0.956},\n\t{a: \"JONES\", b: \"JOHNSON\", r: 0.832},\n\t{a: \"MASSEY\", b: \"MASSIE\", r: 0.933},\n\t{a: \"ABROMS\", b: \"ABRAMS\", r: 0.922},\n\t{a: \"HARDIN\", b: \"MARTINEZ\", r: 0.722},\n\t{a: \"ITMAN\", b: \"SMITH\", r: 0.467},\n\t{a: \"JERALDINE\", b: \"GERALDINE\", r: 0.926},\n\t{a: \"MARHTA\", b: \"MARTHA\", r: 0.961},\n\t{a: \"MICHELLE\", b: \"MICHAEL\", r: 0.921},\n\t{a: \"JULIES\", b: \"JULIUS\", r: 0.933},\n\t{a: \"TANYA\", b: \"TONYA\", r: 0.880},\n\t{a: \"DWAYNE\", b: \"DUANE\", r: 0.840},\n\t{a: \"SEAN\", b: \"SUSAN\", r: 
0.805},\n\t{a: \"JON\", b: \"JOHN\", r: 0.933},\n\t//\t{a: \"JON\", b: \"JAN\", r: 0.000},\n\t{a: \"BROOKHAVEN\", b: \"BRROKHAVEN\", r: 0.947},\n\t{a: \"BROOK HALLOW\", b: \"BROOK HLLW\", r: 0.967},\n\t{a: \"DECATUR\", b: \"DECATIR\", r: 0.943},\n\t{a: \"FITZRUREITER\", b: \"FITZENREITER\", r: 0.913},\n\t{a: \"HIGBEE\", b: \"HIGHEE\", r: 0.922},\n\t{a: \"HIGBEE\", b: \"HIGVEE\", r: 0.922},\n\t{a: \"LACURA\", b: \"LOCURA\", r: 0.900},\n\t{a: \"IOWA\", b: \"IONA\", r: 0.867},\n\t//\t{a: \"1ST\", b: \"IST\", r: 0.000},\n\t{a: \"w\", b: \"w\", r: 1.000},\n\n\t// Equal strings.\n\t{a: \"\", b: \"\", r: 1.000},\n\t{a: \"A\", b: \"A\", r: 1.000},\n\t{a: \"AA\", b: \"AA\", r: 1.000},\n\t{a: \"AAA\", b: \"AAA\", r: 1.000},\n\t{a: \"AAAA\", b: \"AAAA\", r: 1.000},\n\t{a: \"AAAAA\", b: \"AAAAA\", r: 1.000},\n\t{a: \"AAAAAA\", b: \"AAAAAA\", r: 1.000},\n\t{\n\t\ta: \"Legend of the Galactic Heroes\",\n\t\tb: \"Legend of the Galactic Heroes\",\n\t\tr: 1.000,\n\t},\n\t{\n\t\ta: \"Home is the place where, when you have to go there, they have to take you in.\",\n\t\tb: \"Home is the place where, when you have to go there, they have to take you in.\",\n\t\tr: 1.000,\n\t},\n\t{\n\t\ta: \"Pedro de Alcântara João Carlos Leopoldo Salvador Bibiano Francisco Xavier de Paula Leocádio Miguel Gabriel Rafael Gonzaga de Habsburgo-Lorena e Bragança\",\n\t\tb: \"Pedro de Alcântara João Carlos Leopoldo Salvador Bibiano Francisco Xavier de Paula Leocádio Miguel Gabriel Rafael Gonzaga de Habsburgo-Lorena e Bragança\",\n\t\tr: 1.000,\n\t},\n\t{\n\t\ta: \"Et tu, Brute\",\n\t\tb: \"Et tu, Brute\",\n\t\tr: 1.000,\n\t},\n\n\t// Completely different strings.\n\t{a: \"\", b: \"A\", r: 0.000},\n\t{a: \"\", b: \"AA\", r: 0.000},\n\t{a: \"\", b: \"AAA\", r: 0.000},\n\t{a: \"\", b: \"AAAA\", r: 0.000},\n\t{a: \"\", b: \"AAAAA\", r: 0.000},\n\t{a: \"A\", b: \"\", r: 0.000},\n\t{a: \"AA\", b: \"\", r: 0.000},\n\t{a: \"AAA\", b: \"\", r: 0.000},\n\t{a: \"AAAA\", b: \"\", r: 0.000},\n\t{a: \"AAAAA\", b: \"\", r: 
0.000},\n\t{a: \"A\", b: \"B\", r: 0.000},\n\t{a: \"AA\", b: \"BB\", r: 0.000},\n\t{a: \"AAA\", b: \"BBB\", r: 0.000},\n\t{a: \"AAAA\", b: \"BBBB\", r: 0.000},\n\t{a: \"AAAAa\", b: \"BBBBB\", r: 0.000},\n}\n"
  },
  {
    "path": "tests/ukkonen_test.go",
    "content": "package tests\n\nimport (\n\t\"fmt\"\n\t\"github.com/xrash/smetrics\"\n\t\"testing\"\n)\n\nfunc TestUkkonen(t *testing.T) {\n\tcases := []levenshteincase{\n\t\t{\"RASH\", \"RASH\", 1, 1, 2, 0},\n\t\t{\"POTATO\", \"POTTATO\", 1, 1, 2, 1},\n\t\t{\"POTTATO\", \"POTATO\", 1, 1, 2, 1},\n\t\t{\"HOUSE\", \"MOUSE\", 1, 1, 2, 2},\n\t\t{\"MOUSE\", \"HOUSE\", 2, 2, 4, 4},\n\t\t{\"abc\", \"xy\", 2, 3, 5, 13},\n\t\t{\"xy\", \"abc\", 2, 3, 5, 12},\n\t}\n\n\tfor _, c := range cases {\n\t\tif r := smetrics.Ukkonen(c.s, c.t, c.icost, c.dcost, c.scost); r != c.r {\n\t\t\tfmt.Println(r, \"instead of\", c.r)\n\t\t\tt.Fail()\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "tests/wagner-fischer_test.go",
    "content": "package tests\n\nimport (\n\t\"fmt\"\n\t\"github.com/xrash/smetrics\"\n\t\"testing\"\n)\n\nfunc TestWagnerFischer(t *testing.T) {\n\tcases := []levenshteincase{\n\t\t{\"RASH\", \"RASH\", 1, 1, 2, 0},\n\t\t{\"POTATO\", \"POTTATO\", 1, 1, 2, 1},\n\t\t{\"POTTATO\", \"POTATO\", 1, 1, 2, 1},\n\t\t{\"HOUSE\", \"MOUSE\", 1, 1, 2, 2},\n\t\t{\"MOUSE\", \"HOUSE\", 2, 2, 4, 4},\n\t\t{\"abc\", \"xy\", 2, 3, 5, 13},\n\t\t{\"xy\", \"abc\", 2, 3, 5, 12},\n\t}\n\n\tfor _, c := range cases {\n\t\tif r := smetrics.WagnerFischer(c.s, c.t, c.icost, c.dcost, c.scost); r != c.r {\n\t\t\tfmt.Println(r, \"instead of\", c.r)\n\t\t\tt.Fail()\n\t\t}\n\t}\n}\n"
  },
  {
    "path": "ukkonen.go",
    "content": "package smetrics\n\nimport (\n\t\"math\"\n)\n\n// The Ukkonen algorithm for calculating the Levenshtein distance. The algorithm is described in http://www.cs.helsinki.fi/u/ukkonen/InfCont85.PDF, or in docs/InfCont85.PDF. It runs on O(t . min(m, n)) where t is the actual distance between strings a and b. It needs O(min(t, m, n)) space. This function might be preferred over WagnerFischer() for *very* similar strings. But test it out yourself.\n// The first two parameters are the two strings to be compared. The last three parameters are the insertion cost, the deletion cost and the substitution cost. These are normally defined as 1, 1 and 2 respectively.\nfunc Ukkonen(a, b string, icost, dcost, scost int) int {\n\tvar lowerCost int\n\n\tif icost < dcost && icost < scost {\n\t\tlowerCost = icost\n\t} else if dcost < scost {\n\t\tlowerCost = dcost\n\t} else {\n\t\tlowerCost = scost\n\t}\n\n\tinfinite := math.MaxInt32 / 2\n\n\tvar r []int\n\tvar k, kprime, p, t int\n\tvar ins, del, sub int\n\n\tif len(a) > len(b) {\n\t\tt = (len(a) - len(b) + 1) * lowerCost\n\t} else {\n\t\tt = (len(b) - len(a) + 1) * lowerCost\n\t}\n\n\tfor {\n\t\tif (t / lowerCost) < (len(b) - len(a)) {\n\t\t\tcontinue\n\t\t}\n\n\t\t// This is the right damn thing since the original Ukkonen\n\t\t// paper minimizes the expression result only, but the uncommented version\n\t\t// doesn't need to deal with floats so it's faster.\n\t\t// p = int(math.Floor(0.5*((float64(t)/float64(lowerCost)) - float64(len(b) - len(a)))))\n\t\tp = ((t / lowerCost) - (len(b) - len(a))) / 2\n\n\t\tk = -p\n\t\tkprime = k\n\n\t\trowlength := (len(b) - len(a)) + (2 * p)\n\n\t\tr = make([]int, rowlength+2)\n\n\t\tfor i := 0; i < rowlength+2; i++ {\n\t\t\tr[i] = infinite\n\t\t}\n\n\t\tfor i := 0; i <= len(a); i++ {\n\t\t\tfor j := 0; j <= rowlength; j++ {\n\t\t\t\tif i == j+k && i == 0 {\n\t\t\t\t\tr[j] = 0\n\t\t\t\t} else {\n\t\t\t\t\tif j-1 < 0 {\n\t\t\t\t\t\tins = infinite\n\t\t\t\t\t} else {\n\t\t\t\t\t\tins = 
r[j-1] + icost\n\t\t\t\t\t}\n\n\t\t\t\t\tdel = r[j+1] + dcost\n\t\t\t\t\tsub = r[j] + scost\n\n\t\t\t\t\tif i-1 < 0 || i-1 >= len(a) || j+k-1 >= len(b) || j+k-1 < 0 {\n\t\t\t\t\t\tsub = infinite\n\t\t\t\t\t} else if a[i-1] == b[j+k-1] {\n\t\t\t\t\t\tsub = r[j]\n\t\t\t\t\t}\n\n\t\t\t\t\tif ins < del && ins < sub {\n\t\t\t\t\t\tr[j] = ins\n\t\t\t\t\t} else if del < sub {\n\t\t\t\t\t\tr[j] = del\n\t\t\t\t\t} else {\n\t\t\t\t\t\tr[j] = sub\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\tk++\n\t\t}\n\n\t\tif r[(len(b)-len(a))+(2*p)+kprime] <= t {\n\t\t\tbreak\n\t\t} else {\n\t\t\tt *= 2\n\t\t}\n\t}\n\n\treturn r[(len(b)-len(a))+(2*p)+kprime]\n}\n"
  },
  {
    "path": "wagner-fischer.go",
    "content": "package smetrics\n\n// The Wagner-Fischer algorithm for calculating the Levenshtein distance.\n// The first two parameters are the two strings to be compared. The last three parameters are the insertion cost, the deletion cost and the substitution cost. These are normally defined as 1, 1 and 2 respectively.\nfunc WagnerFischer(a, b string, icost, dcost, scost int) int {\n\n\t// Allocate both rows.\n\trow1 := make([]int, len(b)+1)\n\trow2 := make([]int, len(b)+1)\n\tvar tmp []int\n\n\t// Initialize the first row.\n\tfor i := 1; i <= len(b); i++ {\n\t\trow1[i] = i * icost\n\t}\n\n\t// For each row...\n\tfor i := 1; i <= len(a); i++ {\n\t\trow2[0] = i * dcost\n\n\t\t// For each column...\n\t\tfor j := 1; j <= len(b); j++ {\n\t\t\tif a[i-1] == b[j-1] {\n\t\t\t\trow2[j] = row1[j-1]\n\t\t\t} else {\n\t\t\t\tins := row2[j-1] + icost\n\t\t\t\tdel := row1[j] + dcost\n\t\t\t\tsub := row1[j-1] + scost\n\n\t\t\t\tif ins < del && ins < sub {\n\t\t\t\t\trow2[j] = ins\n\t\t\t\t} else if del < sub {\n\t\t\t\t\trow2[j] = del\n\t\t\t\t} else {\n\t\t\t\t\trow2[j] = sub\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\t// Swap the rows at the end of each row.\n\t\ttmp = row1\n\t\trow1 = row2\n\t\trow2 = tmp\n\t}\n\n\t// Because we swapped the rows, the final result is in row1 instead of row2.\n\treturn row1[len(row1)-1]\n}\n"
  }
]