Repository: rrice/java-string-similarity Branch: master Commit: 379f3c6ae55e Files: 29 Total size: 68.3 KB Directory structure: gitextract_v7jyh2ep/ ├── .github/ │ └── workflows/ │ └── gradle.yml ├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── README.md ├── build.gradle ├── gradle/ │ └── wrapper/ │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── pom.xml └── src/ ├── main/ │ └── java/ │ └── net/ │ └── ricecode/ │ └── similarity/ │ ├── AscendingSimilarityScoreComparator.java │ ├── DescendingSimilarityScoreComparator.java │ ├── DiceCoefficientStrategy.java │ ├── JaroStrategy.java │ ├── JaroWinklerStrategy.java │ ├── LevenshteinDistanceStrategy.java │ ├── SimilarityScore.java │ ├── SimilarityStrategy.java │ ├── StringSimilarityService.java │ └── StringSimilarityServiceImpl.java └── test/ └── java/ └── net/ └── ricecode/ └── similarity/ ├── AscendingComparatorTest.java ├── DescendingComparatorTest.java ├── DiceCoefficientStrategyTest.java ├── JaroStrategyTest.java ├── JaroWinklerStrategyTest.java ├── LevenshteinDistanceStrategyTest.java ├── SimilarityScoreTest.java └── StringSimilarityServiceImplTest.java ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/gradle.yml ================================================ # This workflow will build a Java project with Gradle # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-gradle name: Java CI with Gradle on: push: branches: [ master ] pull_request: branches: [ master ] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - name: Set up JDK 1.8 uses: actions/setup-java@v1 with: java-version: 1.8 - name: Grant execute permission for gradlew run: chmod +x gradlew - name: Build with Gradle run: ./gradlew test ================================================ FILE: 
.gitignore ================================================ # General files *~ *.lock *.DS_Store *.Trashes *.swp *.out *.bak # SVN .svn # Maven .m2 target/ # Eclipse .project .metadata bin/** tmp/** tmp/**/* .classpath .settings/ .loadpath local.properties *~.nib *.launch .externalToolBuilders/ # Intellij *.iml *.ipr *.iws .idea/ # Netbeans nbproject/private/ build/ nbbuild/ dist/ nbdist/ nbactions.xml nb-configuration.xml # Java *.class *.jar *.war *.ear *.db .gradle # Exceptions !/gradle/** ================================================ FILE: .travis.yml ================================================ language: java jdk: - openjdk8 - openjdk10 - openjdk11 ================================================ FILE: LICENSE.txt ================================================ Copyright (c) 2010 Ralph Allan Rice Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: README.md ================================================ [![License: MIT](https://img.shields.io/github/license/rrice/java-string-similarity)](https://opensource.org/licenses/MIT) [![Issues](https://img.shields.io/github/issues/rrice/java-string-similarity)](https://github.com/rrice/java-string-similarity/actions) ![Java CI](https://github.com/rrice/java-string-similarity/workflows/Java%20CI%20with%20Gradle/badge.svg) [java-string-similarity](https://github.com/rrice/java-string-similarity) is a Java library that calculates a normalized distance or similarity score between two strings. A score of 0.0 means that the two strings are absolutely dissimilar, and 1.0 means that they are absolutely similar (or equal). Anything in between indicates how similar the two strings are. Example ------- In this simple example, we want to calculate a similarity score between the words `McDonalds` and `MacMahons`. We are selecting the [Jaro-Winkler distance](http://www.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. ```java SimilarityStrategy strategy = new JaroWinklerStrategy(); String target = "McDonalds"; String source = "MacMahons"; StringSimilarityService service = new StringSimilarityServiceImpl(strategy); double score = service.score(source, target); // Score is 0.90 ``` Algorithms ---------- * [Jaro distance](http://www.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) * [Jaro-Winkler distance](http://www.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) * [Levenshtein distance](http://www.wikipedia.org/wiki/Levenshtein_distance) * [Sørensen–Dice coefficient](http://www.wikipedia.org/wiki/Sørensen–Dice_coefficient) Installation ------------ This project currently uses [Maven](http://maven.apache.org/) for management.
You can compile, test and install the component to your local repo by calling: ``` mvn install ``` Then, you can add this component to your project by adding a dependency: ```xml <dependency> <groupId>net.ricecode</groupId> <artifactId>string-similarity</artifactId> <version>1.0.0</version> </dependency> ``` TODO ---- * Ant/Ivy build scripts. * [Jaccard index](http://www.wikipedia.org/wiki/Jaccard_index) ================================================ FILE: build.gradle ================================================ apply plugin: 'java' repositories { mavenCentral() } dependencies { testCompile 'junit:junit:4.+', 'org.mockito:mockito-all:1.9.5' } /* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE.
* */ ================================================ FILE: gradle/wrapper/gradle-wrapper.properties ================================================ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-bin.zip zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists ================================================ FILE: gradlew ================================================ #!/usr/bin/env sh ############################################################################## ## ## Gradle start up script for UN*X ## ############################################################################## # Attempt to set APP_HOME # Resolve links: $0 may be a link PRG="$0" # Need this for relative symlinks. while [ -h "$PRG" ] ; do ls=`ls -ld "$PRG"` link=`expr "$ls" : '.*-> \(.*\)$'` if expr "$link" : '/.*' > /dev/null; then PRG="$link" else PRG=`dirname "$PRG"`"/$link" fi done SAVED="`pwd`" cd "`dirname \"$PRG\"`/" >/dev/null APP_HOME="`pwd -P`" cd "$SAVED" >/dev/null APP_NAME="Gradle" APP_BASE_NAME=`basename "$0"` # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. DEFAULT_JVM_OPTS="" # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD="maximum" warn () { echo "$*" } die () { echo echo "$*" echo exit 1 } # OS specific support (must be 'true' or 'false'). cygwin=false msys=false darwin=false nonstop=false case "`uname`" in CYGWIN* ) cygwin=true ;; Darwin* ) darwin=true ;; MINGW* ) msys=true ;; NONSTOP* ) nonstop=true ;; esac CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar # Determine the Java command to use to start the JVM. if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables JAVACMD="$JAVA_HOME/jre/sh/java" else JAVACMD="$JAVA_HOME/bin/java" fi if [ ! 
-x "$JAVACMD" ] ; then die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi else JAVACMD="java" which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi # Increase the maximum file descriptors if we can. if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then MAX_FD_LIMIT=`ulimit -H -n` if [ $? -eq 0 ] ; then if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then MAX_FD="$MAX_FD_LIMIT" fi ulimit -n $MAX_FD if [ $? -ne 0 ] ; then warn "Could not set maximum file descriptor limit: $MAX_FD" fi else warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" fi fi # For Darwin, add options to specify how the application appears in the dock if $darwin; then GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" fi # For Cygwin, switch paths to Windows format before running java if $cygwin ; then APP_HOME=`cygpath --path --mixed "$APP_HOME"` CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` JAVACMD=`cygpath --unix "$JAVACMD"` # We build the pattern for arguments to be converted via cygpath ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` SEP="" for dir in $ROOTDIRSRAW ; do ROOTDIRS="$ROOTDIRS$SEP$dir" SEP="|" done OURCYGPATTERN="(^($ROOTDIRS))" # Add a user-defined pattern to the cygpath arguments if [ "$GRADLE_CYGPATTERN" != "" ] ; then OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" fi # Now convert the arguments - kludge to limit ourselves to /bin/sh i=0 for arg in "$@" ; do CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition eval `echo args$i`=`cygpath --path --ignore 
--mixed "$arg"` else eval `echo args$i`="\"$arg\"" fi i=$((i+1)) done case $i in (0) set -- ;; (1) set -- "$args0" ;; (2) set -- "$args0" "$args1" ;; (3) set -- "$args0" "$args1" "$args2" ;; (4) set -- "$args0" "$args1" "$args2" "$args3" ;; (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; esac fi # Escape application args save () { for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done echo " " } APP_ARGS=$(save "$@") # Collect all arguments for the java command, following the shell quoting and substitution rules eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then cd "$(dirname "$0")" fi exec "$JAVACMD" "$@" ================================================ FILE: gradlew.bat ================================================ @if "%DEBUG%" == "" @echo off @rem ########################################################################## @rem @rem Gradle startup script for Windows @rem @rem ########################################################################## @rem Set local scope for the variables with windows NT shell if "%OS%"=="Windows_NT" setlocal set DIRNAME=%~dp0 if "%DIRNAME%" == "" set DIRNAME=. set APP_BASE_NAME=%~n0 set APP_HOME=%DIRNAME% @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
set DEFAULT_JVM_OPTS= @rem Find java.exe if defined JAVA_HOME goto findJavaFromJavaHome set JAVA_EXE=java.exe %JAVA_EXE% -version >NUL 2>&1 if "%ERRORLEVEL%" == "0" goto init echo. echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. echo. echo Please set the JAVA_HOME variable in your environment to match the echo location of your Java installation. goto fail :findJavaFromJavaHome set JAVA_HOME=%JAVA_HOME:"=% set JAVA_EXE=%JAVA_HOME%/bin/java.exe if exist "%JAVA_EXE%" goto init echo. echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% echo. echo Please set the JAVA_HOME variable in your environment to match the echo location of your Java installation. goto fail :init @rem Get command-line arguments, handling Windows variants if not "%OS%" == "Windows_NT" goto win9xME_args :win9xME_args @rem Slurp the command line arguments. set CMD_LINE_ARGS= set _SKIP=2 :win9xME_args_slurp if "x%~1" == "x" goto execute set CMD_LINE_ARGS=%* :execute @rem Setup the command line set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar @rem Execute Gradle "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% :end @rem End local scope for the variables with windows NT shell if "%ERRORLEVEL%"=="0" goto mainEnd :fail rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of rem the _cmd.exe /c_ return code! if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 exit /b 1 :mainEnd if "%OS%"=="Windows_NT" endlocal :omega ================================================ FILE: pom.xml ================================================ 4.0.0 net.ricecode string-similarity 1.0.1-SNAPSHOT jar string-similarity A Java library that implements several algorithms that calculate similarity between strings. 
http://www.ricecode.net UTF-8 4.13.1 1.10.19 MIT License http://www.opensource.org/licenses/mit-license.php repo https://github.com/rrice/java-string-similarity/issues GitHub Issues https://github.com/rrice/java-string-similarity scm:git:git://github.com/rrice/java-string-similarity.git scm:git:git@github.com:rrice/java-string-similarity.git HEAD ralph.rice@gmail.com Ralph Allan Rice https://github.com/rrice rrice doclint-java8-disable [1.8,) -Xdoclint:none release org.apache.maven.plugins maven-gpg-plugin 1.6 sign-artifacts verify sign org.mockito mockito-all ${mockito.version} test junit junit ${junit.version} test org.apache.maven.plugins maven-compiler-plugin 3.3 1.5 1.5 false false org.apache.maven.plugins maven-source-plugin 2.4 attach-source jar-no-fork test-jar-no-fork org.apache.maven.plugins maven-javadoc-plugin 2.9.1 public attach-javadocs jar ${javadoc.opts} org.apache.maven.plugins maven-release-plugin 2.5.3 @{project.version} org.sonatype.plugins nexus-staging-maven-plugin 1.6.3 true ossrh https://oss.sonatype.org/ true org.codehaus.mojo versions-maven-plugin 2.2 ================================================ FILE: src/main/java/net/ricecode/similarity/AscendingSimilarityScoreComparator.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * */ package net.ricecode.similarity; import java.util.Comparator; /** * A comparator that allows SimilarityScore to be sorted in * ascending order. * @author Ralph Allan Rice * */ public class AscendingSimilarityScoreComparator implements Comparator { /** * Compares two similarity scores. * @param x The first score to be compared. * @param y The second score to be compared. * @return a negative integer, zero, or a positive integer as the first score is less than, * equal to, or greater than the second score. */ public int compare(SimilarityScore x, SimilarityScore y) { double first = x.getScore(); double second = y.getScore(); if (first == second) { return 0; } if (first < second) { return -1; } return 1; } } ================================================ FILE: src/main/java/net/ricecode/similarity/DescendingSimilarityScoreComparator.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the 
Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * */ package net.ricecode.similarity; import java.util.Comparator; /** * A comparator that allows SimilarityScore to be sorted in * descending order. * @author Ralph Allan Rice * */ public class DescendingSimilarityScoreComparator implements Comparator { /** * Compares two similarity scores. * @param x The first score to be compared. * @param y The second score to be compared. * @return a negative integer, zero, or a positive integer as the first score is greater than, * equal to, or less than the second score. */public int compare(SimilarityScore x, SimilarityScore y) { double first = x.getScore(); double second = y.getScore(); if (first == second) { return 0; } if (first < second) { return 1; } return -1; } } ================================================ FILE: src/main/java/net/ricecode/similarity/DiceCoefficientStrategy.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * */ package net.ricecode.similarity; import java.util.ArrayList; import java.util.Set; import java.util.TreeSet; /** * A strategy that uses the Dice's Coefficient to calculate the similarity of two strings. * @author Ralph Allan Rice * @see About Dice Coefficient */ public class DiceCoefficientStrategy implements SimilarityStrategy { /** * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity * and 1.0 implies absolute similarity. * * @param first The first string to compare. * @param second The second string to compare. * @return A number between 0.0 and 1.0. */ public double score(String first, String second) { // Create two sets of character bigrams, one for each string. Set s1 = splitIntoBigrams(first); Set s2 = splitIntoBigrams(second); // Get the number of elements in each set. int n1 = s1.size(); int n2 = s2.size(); // Find the intersection, and get the number of elements in that set. 
s1.retainAll(s2); int nt = s1.size(); // The coefficient is: // // 2 ∙ | s1 ⋂ s2 | // D = ---------------------- // | s1 | + | s2 | // return (2.0 * (double)nt) / ((double)(n1 + n2)); } private Set splitIntoBigrams(String s) { ArrayList bigrams = new ArrayList(); if (s.length() < 2) { bigrams.add(s); } else { for (int i = 1; i < s.length(); i++) { StringBuilder sb = new StringBuilder(); sb.append(s.charAt(i-1)); sb.append(s.charAt(i)); bigrams.add(sb.toString()); } } return new TreeSet(bigrams); } } ================================================ FILE: src/main/java/net/ricecode/similarity/JaroStrategy.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * */ package net.ricecode.similarity; /** * A strategy that uses the Jaro Distance to calculate the similarity of two strings. 
* @author Ralph Allan Rice * @see About Jaro Distance */ public class JaroStrategy implements SimilarityStrategy { /** * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity * and 1.0 implies absolute similarity. * * @param first The first string to compare. * @param second The second string to compare. * @return A number between 0.0 and 1.0. */ public double score(String first, String second) { String shorter; String longer; // Determine which String is longer. if (first.length() > second.length()) { longer = first.toLowerCase(); shorter = second.toLowerCase(); } else { longer = second.toLowerCase(); shorter = first.toLowerCase(); } // Calculate the half length() distance of the shorter String. int halflength = (shorter.length() / 2) + 1; // Find the set of matching characters between the shorter and longer strings. Note that // the set of matching characters may be different depending on the order of the strings. String m1 = getSetOfMatchingCharacterWithin(shorter, longer, halflength); String m2 = getSetOfMatchingCharacterWithin(longer, shorter, halflength); // If one or both of the sets of common characters is empty, then // there is no similarity between the two strings. if (m1.length() == 0 || m2.length() == 0) return 0.0; // If the set of common characters is not the same size, then // there is no similarity between the two strings, either. if (m1.length() != m2.length()) return 0.0; // Calculate the number of transpositions between the two sets // of common characters. int transpositions = transpositions(m1, m2); // Calculate the distance. double dist = (m1.length() / ((double)shorter.length()) + m2.length() / ((double)longer.length()) + (m1.length() - transpositions) / ((double)m1.length())) / 3.0; return dist; } /** * Gets a set of matching characters between two strings. * * @param first The first string. * @param second The second string. * @param limit The maximum distance to consider. 
* @return A string contain the set of common characters. * @remarks Two characters from the first string and the second string are considered matching if the character's * respective positions are no farther than the limit value. */ private String getSetOfMatchingCharacterWithin(String first, String second, int limit) { StringBuilder common = new StringBuilder(); StringBuilder copy = new StringBuilder(second); for (int i = 0; i < first.length(); i++) { char ch = first.charAt(i); boolean found = false; // See if the character is within the limit positions away from the original position of that character. for (int j = Math.max(0, i - limit); !found && j < Math.min(i + limit, second.length()); j++) { if (copy.charAt(j) == ch) { found = true; common.append(ch); copy.setCharAt(j,'*'); } } } return common.toString(); } /** * Calculates the number of transpositions between two strings. * @param first The first string. * @param second The second string. * @return The number of transpositions between the two strings. 
*/ private int transpositions(String first, String second) { int transpositions = 0; for (int i = 0; i < first.length(); i++) { if (first.charAt(i) != second.charAt(i)) { transpositions++; } } transpositions /= 2; return transpositions; } } ================================================ FILE: src/main/java/net/ricecode/similarity/JaroWinklerStrategy.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * */ package net.ricecode.similarity; /** * A strategy that uses the Jaro-Winkler Distance to calculate the similarity of two strings. * * @author Ralph Allan Rice * @see About Jaro-Winkler Distance */ public class JaroWinklerStrategy extends JaroStrategy implements SimilarityStrategy { final double DEFAULT_SCALING_FACTOR = 0.1; // This is the default scaling factor Winkler used. private double scalingFactor; /** * Constructs a new JaroWinklerStrategy instance. 
/**
 * Constructs a new JaroWinklerStrategy instance.
 *
 * @param scalingFactor The scaling factor between 0.00 and 0.25. If the scaling factor is
 *                      greater than 0.25, the scaling factor is set to 0.25. If it is
 *                      negative, it is set to 0.00.
 */
public JaroWinklerStrategy(double scalingFactor) {
    // score() adds scalingFactor * prefixLength * (1 - jaro) to the Jaro value, with
    // prefixLength capped at 4. Keeping the factor inside [0, 0.25] therefore keeps
    // the result between the Jaro value and 1.0; a negative factor would push it
    // below the Jaro value and possibly below 0.0.
    if (scalingFactor > 0.25) {
        scalingFactor = 0.25;
    } else if (scalingFactor < 0.0) {
        scalingFactor = 0.0;
    }
    this.scalingFactor = scalingFactor;
}
4: result; } } ================================================ FILE: src/main/java/net/ricecode/similarity/LevenshteinDistanceStrategy.java ================================================ package net.ricecode.similarity; /** * A strategy that uses the Levenshtein's Distance to calculate the edit distance of two strings. * Then it converts this to a "score" to fit the framework. * * @see About Levenshtein Distance */ public class LevenshteinDistanceStrategy implements SimilarityStrategy { /** * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity * and 1.0 implies absolute similarity. * * @param first The first string to compare. * @param second The second string to compare. * @return A number between 0.0 and 1.0. * @throws NullPointerException if one or both of the strings are null */ public double score(String first, String second) { int maxLength = Math.max(first.length(), second.length()); //Can't divide by 0 if (maxLength == 0) return 1.0d; return ((double) (maxLength - computeEditDistance(first, second))) / (double) maxLength; } protected int computeEditDistance(String first, String second) { first = first.toLowerCase(); second = second.toLowerCase(); int[] costs = new int[second.length() + 1]; for (int i = 0; i <= first.length(); i++) { int previousValue = i; for (int j = 0; j <= second.length(); j++) { if (i == 0) { costs[j] = j; } else if (j > 0) { int useValue = costs[j - 1]; if (first.charAt(i - 1) != second.charAt(j - 1)) { useValue = Math.min(Math.min(useValue, previousValue), costs[j]) + 1; } costs[j - 1] = previousValue; previousValue = useValue; } } if (i > 0) { costs[second.length()] = previousValue; } } return costs[second.length()]; } } ================================================ FILE: src/main/java/net/ricecode/similarity/SimilarityScore.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a 
/**
 * An immutable value object that pairs a string key with its similarity score.
 * @author Ralph Allan Rice
 *
 */
public class SimilarityScore {

    private final String key;
    private final double score;

    /**
     * Constructs a similarity score.
     * @param key The string key.
     * @param score The score value.
     */
    public SimilarityScore(String key, double score) {
        this.key = key;
        this.score = score;
    }

    /**
     * Gets the key for this score.
     * @return A string.
     */
    public String getKey() {
        return this.key;
    }

    /**
     * Gets the value of the score.
     * @return A double.
     */
    public double getScore() {
        return this.score;
    }

    /**
     * Returns the hash code for this object. Objects that are equal according to
     * {@link #equals(Object)} produce the same hash code; a null key contributes 0.
     * @return An integer representing the hash code.
     */
    @Override
    public int hashCode() {
        int hash = 11;
        hash = 23 * hash + (key == null ? 0 : key.hashCode());
        // NaN casts to 0 and both zeros cast to 0, which keeps this consistent with equals.
        hash = 23 * hash + (int) (score * 1000000);
        return hash;
    }

    /**
     * Determines if the supplied object equals this object.
     * @param o The object to compare against; may be null.
     * @return True if the keys and scores match between the two objects. Otherwise false.
     *         Two null keys match each other, and two NaN scores match each other.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if ((o == null) || (o.getClass() != this.getClass())) {
            return false;
        }

        SimilarityScore other = (SimilarityScore) o;
        boolean sameKey = (this.key == null) ? (other.key == null) : this.key.equals(other.key);
        // A plain == is never true for NaN, which would make equals non-reflexive.
        boolean sameScore = (this.score == other.score)
                || (Double.isNaN(this.score) && Double.isNaN(other.score));
        return sameKey && sameScore;
    }
}
*/ double score(String first, String second); } ================================================ FILE: src/main/java/net/ricecode/similarity/StringSimilarityService.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * */ package net.ricecode.similarity; import java.util.Comparator; import java.util.List; /** * A service that performs string similarity calculations. * @author Ralph Allan Rice * */ public interface StringSimilarityService { /** * Calculates all similarity scores for a given set of features. * @param features The list of features. * @param target The target string to compare against the features. * @return A list of similarity scores. */ List scoreAll(List features, String target); /** * Calculates the similarity score of a single feature. * @param feature The feature string to compare. * @param target The target string to compare against the feature. 
* @return The similarity score between the feature and target. */ double score(String feature, String target); /** * Finds the feature within a set of given features that best match the target string. * @param features A list of strings containing the features to compare. * @param target The target string to compare against the features. * @return A SimilarityScore that has the highest score value amongst the features. */ SimilarityScore findTop(List features, String target); /** * Finds the feature within a set of given features that best match the target string. * @param features A list of strings containing the features to compare. * @param target The target string to compare against the features. * @param comparator A comparator that is used sort the scores. * @return A SimilarityScore that has the top value amongst the features, according to the comparator. */ SimilarityScore findTop(List features, String target, Comparator comparator); } ================================================ FILE: src/main/java/net/ricecode/similarity/StringSimilarityServiceImpl.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * */ package net.ricecode.similarity; import java.util.ArrayList; import java.util.Comparator; import java.util.List; import java.util.Collections; /** * An implementation of StringSimilarityService. * @author Ralph Allan Rice * @see StringSimilarityService */ public class StringSimilarityServiceImpl implements StringSimilarityService { private SimilarityStrategy strategy; /** * Creates a similarity calculator instance. * @param strategy The similarity strategy to use when calculating similarity scores. */ public StringSimilarityServiceImpl(SimilarityStrategy strategy) { this.strategy = strategy; } /** * Calculates all similarity scores for a given set of features. * @param features The list of features. * @param target The target string to compare against the features. * @return A list of similarity scores. */ public List scoreAll(List features, String target) { ArrayList scores = new ArrayList(); for(String feature: features) { double score = strategy.score(feature, target); scores.add(new SimilarityScore(feature, score)); } return scores; } /** * Calculates the similarity score of a single feature. * @param feature The feature string to compare. * @param target The target string to compare against the feature. * @return The similarity score between the feature and target. */ public double score(String feature, String target) { return strategy.score(feature, target); } /** * Finds the feature within a set of given features that best match the target string. * @param features A list of strings containing the features to compare. * @param target The target string to compare against the features. * @return The similarity score with the highest value. 
*/ public SimilarityScore findTop(List features, String target) { return findTop(features, target, new DescendingSimilarityScoreComparator()); } /** * Finds the feature within a set of given features that best match the target string. * @param features A list of strings containing the features to compare. * @param target The target string to compare against the features. * @param comparator A comparator that is used sort the scores. * @return A SimilarityScore that has the top value amongst the features, according to the comparator. */ public SimilarityScore findTop(List features, String target, Comparator comparator) { if (features.size() == 0) { return null; } List scores= scoreAll(features, target); Collections.sort(scores, comparator); return scores.get(0); } } ================================================ FILE: src/test/java/net/ricecode/similarity/AscendingComparatorTest.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * */ package net.ricecode.similarity; import static org.junit.Assert.*; import org.junit.Test; public class AscendingComparatorTest { @Test public void testCompareScoreFirstGreater() { SimilarityScore first = new SimilarityScore("First", 0.87); SimilarityScore second = new SimilarityScore("Second", 0.54); AscendingSimilarityScoreComparator c = new AscendingSimilarityScoreComparator(); assertTrue(c.compare(first, second)>0); assertTrue(c.compare(second, first)<0); } @Test public void testCompareScoreSecondGreater() { SimilarityScore first = new SimilarityScore("First", 0.37); SimilarityScore second = new SimilarityScore("Second", 0.65); AscendingSimilarityScoreComparator c = new AscendingSimilarityScoreComparator(); assertTrue(c.compare(first, second)<0); assertTrue(c.compare(second, first)>0); } @Test public void testCompareScoreEquality() { SimilarityScore first = new SimilarityScore("First", 0.96); SimilarityScore second = new SimilarityScore("Second", 0.96); AscendingSimilarityScoreComparator c = new AscendingSimilarityScoreComparator(); assertEquals(c.compare(first, second), 0); assertEquals(c.compare(second, first), 0); } } ================================================ FILE: src/test/java/net/ricecode/similarity/DescendingComparatorTest.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom 
the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * */ package net.ricecode.similarity; import static org.junit.Assert.*; import org.junit.Test; public class DescendingComparatorTest { @Test public void testCompareScoreFirstGreater() { SimilarityScore first = new SimilarityScore("First", 0.87); SimilarityScore second = new SimilarityScore("Second", 0.54); DescendingSimilarityScoreComparator c = new DescendingSimilarityScoreComparator(); assertTrue(c.compare(first, second)<0); assertTrue(c.compare(second, first)>0); } @Test public void testCompareScoreSecondGreater() { SimilarityScore first = new SimilarityScore("First", 0.37); SimilarityScore second = new SimilarityScore("Second", 0.65); DescendingSimilarityScoreComparator c = new DescendingSimilarityScoreComparator(); assertTrue(c.compare(first, second)>0); assertTrue(c.compare(second, first)<0); } @Test public void testCompareScoreEquality() { SimilarityScore first = new SimilarityScore("First", 0.96); SimilarityScore second = new SimilarityScore("Second", 0.96); DescendingSimilarityScoreComparator c = new DescendingSimilarityScoreComparator(); assertEquals(c.compare(first, second), 0); assertEquals(c.compare(second, first), 0); } } ================================================ FILE: src/test/java/net/ricecode/similarity/DiceCoefficientStrategyTest.java 
================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
* */ package net.ricecode.similarity; import static org.junit.Assert.*; import org.junit.Test; public class DiceCoefficientStrategyTest { @Test public void testOneTranspostion() { SimilarityStrategy s = new DiceCoefficientStrategy(); String first = "Martha"; String second = "Marhta"; double expected = 0.400; double delta = 0.001; double actual = s.score(first, second); assertEquals(expected, actual, delta); } @Test public void testSoundAlike() { SimilarityStrategy s = new DiceCoefficientStrategy(); String first = "Dwayne"; String second = "Duane"; double expected = 0.2222; double delta = 0.001; double actual = s.score(first, second); assertEquals(expected, actual, delta); } @Test public void testMisspelledSoundAlike() { SimilarityStrategy s = new DiceCoefficientStrategy(); String first = "Dixon"; String second = "Dicksonx"; double expected = 0.363636; double delta = 0.001; double actual = s.score(first, second); assertEquals(expected, actual, delta); } @Test public void testAbsoluteSimilarity() { SimilarityStrategy s = new DiceCoefficientStrategy(); String first = "Mississippi"; String second = "Mississippi"; double expected = 1.000; double delta = 0.000; double actual = s.score(first, second); assertEquals(expected, actual, delta); } @Test public void testAbsoluteDissimilarity() { SimilarityStrategy s = new DiceCoefficientStrategy(); String first = "Mississippi"; String second = "Oklahoma"; double expected = 0.000; double delta = 0.000; double actual = s.score(first, second); assertEquals(expected, actual, delta); } } ================================================ FILE: src/test/java/net/ricecode/similarity/JaroStrategyTest.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, 
copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * */ package net.ricecode.similarity; import static org.junit.Assert.*; import org.junit.Test; public class JaroStrategyTest { @Test public void testOneTranspostion() { SimilarityStrategy s = new JaroStrategy(); String first = "Martha"; String second = "Marhta"; double expected = 0.944; double delta = 0.001; double actual = s.score(first, second); assertEquals(expected, actual, delta); } @Test public void testSoundAlike() { SimilarityStrategy s = new JaroStrategy(); String first = "Dwayne"; String second = "Duane"; double expected = 0.822; double delta = 0.001; double actual = s.score(first, second); assertEquals(expected, actual, delta); } @Test public void testMisspelledSoundAlike() { SimilarityStrategy s = new JaroStrategy(); String first = "Dixon"; String second = "Dicksonx"; double expected = 0.767; double delta = 0.001; double actual = s.score(first, second); assertEquals(expected, actual, delta); } @Test public void testAbsoluteSimilarity() { SimilarityStrategy s = new JaroStrategy(); String first = "Mississippi"; String second = "Mississippi"; double expected = 1.000; double delta = 0.000; double actual = s.score(first, second); assertEquals(expected, actual, 
delta); } @Test public void testAbsoluteDissimilarity() { SimilarityStrategy s = new JaroStrategy(); String first = "Mississippi"; String second = "Oklahoma"; double expected = 0.000; double delta = 0.000; double actual = s.score(first, second); assertEquals(expected, actual, delta); } } ================================================ FILE: src/test/java/net/ricecode/similarity/JaroWinklerStrategyTest.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
* */ package net.ricecode.similarity; import static org.junit.Assert.*; import org.junit.Test; public class JaroWinklerStrategyTest { @Test public void testOneTranspostion() { SimilarityStrategy s = new JaroWinklerStrategy(); String first = "Martha"; String second = "Marhta"; double expected = 0.961; double delta = 0.001; double actual = s.score(first, second); assertEquals(expected, actual, delta); } @Test public void testSoundAlike() { SimilarityStrategy s = new JaroWinklerStrategy(); String first = "Dwayne"; String second = "Duane"; double expected = 0.840; double delta = 0.001; double actual = s.score(first, second); assertEquals(expected, actual, delta); } @Test public void testMisspelledSoundAlike() { SimilarityStrategy s = new JaroWinklerStrategy(); String first = "Dixon"; String second = "Dicksonx"; double expected = 0.813; double delta = 0.001; double actual = s.score(first, second); assertEquals(expected, actual, delta); } @Test public void testAbsoluteSimilarity() { SimilarityStrategy s = new JaroStrategy(); String first = "Mississippi"; String second = "Mississippi"; double expected = 1.000; double delta = 0.000; double actual = s.score(first, second); assertEquals(expected, actual, delta); } @Test public void testAbsoluteDissimilarity() { SimilarityStrategy s = new JaroStrategy(); String first = "Mississippi"; String second = "Oklahoma"; double expected = 0.000; double delta = 0.000; double actual = s.score(first, second); assertEquals(expected, actual, delta); } } ================================================ FILE: src/test/java/net/ricecode/similarity/LevenshteinDistanceStrategyTest.java ================================================ package net.ricecode.similarity; import org.junit.Test; import static org.junit.Assert.*; public class LevenshteinDistanceStrategyTest { @Test (expected = NullPointerException.class) public void testNullThrows() { LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); lds.score(null, "kEvIn"); } @Test 
public void emptyStringTest() { LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); double response = lds.score("", ""); assertEquals(1.0, response, 0.0); } @Test public void testExactMatchDifferentCase() { LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); double response = lds.score("KeViN", "kevin"); assertEquals(1.0, response, 0.0); } @Test public void testExactMatchSameCase() { LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); double response = lds.score("java", "java"); assertEquals(1.0, response, 0.0); } @Test public void testNoSimilarity() { LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); double response = lds.score("abc", "def"); assertEquals(0.0, response, 0.0); } @Test public void score1() { LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); double response = lds.score("he", "head"); assertEquals(0.5d, response, 0.0001d); } @Test public void score2() { LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); double response = lds.score("hd", "head"); assertEquals(0.5d, response, 0.0001d); } @Test public void score3() { LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); double response = lds.score("d", "head"); assertEquals(0.25d, response, 0.0001d); } @Test public void score4() { LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); double response = lds.score("head", "he"); assertEquals(0.5d, response, 0.0001d); } @Test public void score5() { LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); double response = lds.score("kitten", "sitting"); assertEquals(0.5714d, response, 0.0001d); } @Test public void score6() { LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); double response = lds.score("Saturday", "Sunday"); assertEquals(0.625d, response, 0.0001d); } } ================================================ FILE: src/test/java/net/ricecode/similarity/SimilarityScoreTest.java 
================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
* */ package net.ricecode.similarity; import static org.junit.Assert.*; import org.junit.Test; public class SimilarityScoreTest { @Test public void testGetKey() { SimilarityScore s = new SimilarityScore("Test", 0.99); assertEquals("Test", s.getKey()); } @Test public void testGetScore() { SimilarityScore s = new SimilarityScore("Test", 0.99); assertEquals(0.99, s.getScore(), 0.000); } } ================================================ FILE: src/test/java/net/ricecode/similarity/StringSimilarityServiceImplTest.java ================================================ /* * Copyright (c) 2010 Ralph Allan Rice * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
* */ package net.ricecode.similarity; import static org.junit.Assert.*; import static org.mockito.Mockito.*; import java.util.ArrayList; import java.util.List; import org.junit.Test; public class StringSimilarityServiceImplTest { @Test public void testScoreAll() { SimilarityStrategy strategy = mock(SimilarityStrategy.class); String target = "McDonalds"; String c1 = "MacMahons"; String c2 = "McPherson"; String c3 = "McDonalds"; when(strategy.score(target, c1)).thenReturn(0.90); when(strategy.score(target, c2)).thenReturn(0.74); when(strategy.score(target, c3)).thenReturn(1.000); StringSimilarityService service = new StringSimilarityServiceImpl(strategy); List features = new ArrayList(); features.add(c1); features.add(c2); features.add(c3); List scores = service.scoreAll(features, target); verify(strategy).score(c1, target); verify(strategy).score(c2, target); verify(strategy).score(c3, target); assertEquals(3, scores.size()); } @Test public void testScore() { SimilarityStrategy strategy = mock(SimilarityStrategy.class); String target = "McDonalds"; String c1 = "MacMahons"; String c2 = "McPherson"; String c3 = "McDonalds"; when(strategy.score(c1, target)).thenReturn(0.90); when(strategy.score(c2, target)).thenReturn(0.74); when(strategy.score(c3, target)).thenReturn(1.000); StringSimilarityService service = new StringSimilarityServiceImpl(strategy); double score = service.score(c1, target); verify(strategy).score(c1, target); assertEquals(0.90, score, 0.000); } @Test public void testFindTop() { SimilarityStrategy strategy = mock(SimilarityStrategy.class); String target = "McDonalds"; String c1 = "MacMahons"; String c2 = "McPherson"; String c3 = "McDonalds"; SimilarityScore expected = new SimilarityScore(c3, 1.000); when(strategy.score(c1, target)).thenReturn(0.90); when(strategy.score(c2, target)).thenReturn(0.74); when(strategy.score(c3, target)).thenReturn(1.000); StringSimilarityService service = new StringSimilarityServiceImpl(strategy); List features = new 
ArrayList(); features.add(c1); features.add(c2); features.add(c3); SimilarityScore top= service.findTop(features,target); verify(strategy).score(c1, target); verify(strategy).score(c2, target); verify(strategy).score(c3, target); assertEquals(expected, top); } @Test public void testFindTop_Ascending() { SimilarityStrategy strategy = mock(SimilarityStrategy.class); String target = "McDonalds"; String c1 = "MacMahons"; String c2 = "McPherson"; String c3 = "McDonalds"; SimilarityScore expected = new SimilarityScore(c2, 0.74); when(strategy.score(c1, target)).thenReturn(0.90); when(strategy.score(c2, target)).thenReturn(0.74); when(strategy.score(c3, target)).thenReturn(1.000); StringSimilarityService service = new StringSimilarityServiceImpl(strategy); List features = new ArrayList(); features.add(c1); features.add(c2); features.add(c3); AscendingSimilarityScoreComparator comparator = new AscendingSimilarityScoreComparator(); SimilarityScore top= service.findTop(features,target, comparator); verify(strategy).score(c1, target); verify(strategy).score(c2, target); verify(strategy).score(c3, target); assertEquals(expected, top); } @Test public void testFindTop_Descending() { SimilarityStrategy strategy = mock(SimilarityStrategy.class); String target = "McDonalds"; String c1 = "MacMahons"; String c2 = "McPherson"; String c3 = "McDonalds"; SimilarityScore expected = new SimilarityScore(c3, 1.000); when(strategy.score(c1, target)).thenReturn(0.90); when(strategy.score(c2, target)).thenReturn(0.74); when(strategy.score(c3, target)).thenReturn(1.000); StringSimilarityService service = new StringSimilarityServiceImpl(strategy); List features = new ArrayList(); features.add(c1); features.add(c2); features.add(c3); DescendingSimilarityScoreComparator comparator = new DescendingSimilarityScoreComparator(); SimilarityScore top= service.findTop(features,target, comparator); verify(strategy).score(c1, target); verify(strategy).score(c2, target); verify(strategy).score(c3, target); 
assertEquals(expected, top); } }