Repository: morfologik/morfologik-stemming Branch: master Commit: 3400c20d43f4 Files: 157 Total size: 455.1 KB Directory structure: gitextract_1f6qqk15/ ├── .github/ │ └── workflows/ │ └── ci.yml ├── .gitignore ├── CHANGES.txt ├── CONTRIBUTING.txt ├── LICENSE.txt ├── README.txt ├── etc/ │ ├── eclipse/ │ │ └── settings/ │ │ ├── org.eclipse.jdt.core.prefs │ │ └── org.eclipse.m2e.core.prefs │ └── forbidden-apis/ │ └── signatures.txt ├── morfologik-fsa/ │ ├── pom.xml │ └── src/ │ └── main/ │ └── java/ │ └── morfologik/ │ └── fsa/ │ ├── ByteSequenceIterator.java │ ├── CFSA.java │ ├── CFSA2.java │ ├── FSA.java │ ├── FSA5.java │ ├── FSAFlags.java │ ├── FSAHeader.java │ ├── FSATraversal.java │ ├── MatchResult.java │ └── StateVisitor.java ├── morfologik-fsa-builders/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── morfologik/ │ │ └── fsa/ │ │ └── builders/ │ │ ├── CFSA2Serializer.java │ │ ├── ConstantArcSizeFSA.java │ │ ├── FSA5Serializer.java │ │ ├── FSABuilder.java │ │ ├── FSAInfo.java │ │ ├── FSASerializer.java │ │ └── FSAUtils.java │ └── test/ │ ├── java/ │ │ └── morfologik/ │ │ └── fsa/ │ │ └── builders/ │ │ ├── CFSA2SerializerTest.java │ │ ├── FSA5SerializerTest.java │ │ ├── FSA5Test.java │ │ ├── FSABuilderTest.java │ │ ├── FSATestUtils.java │ │ ├── FSATraversalTest.java │ │ ├── MinMax.java │ │ ├── SerializerTestBase.java │ │ └── TestBase.java │ └── resources/ │ └── morfologik/ │ └── fsa/ │ └── builders/ │ ├── abc-numbers.fsa │ ├── abc.fsa │ ├── abc.in │ ├── en_tst.dict │ ├── minimal.fsa │ ├── minimal.in │ ├── minimal2.fsa │ └── minimal2.in ├── morfologik-polish/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ ├── java/ │ │ │ └── morfologik/ │ │ │ └── stemming/ │ │ │ └── polish/ │ │ │ └── PolishStemmer.java │ │ └── resources/ │ │ └── morfologik/ │ │ └── stemming/ │ │ └── polish/ │ │ ├── polish.LICENSE.Polish.txt │ │ ├── polish.LICENSE.txt │ │ ├── polish.README.Polish.txt │ │ ├── polish.README.txt │ │ ├── polish.dict │ │ └── polish.info │ └── test/ │ └── java/ │ 
└── morfologik/ │ └── stemming/ │ └── polish/ │ ├── Gh27Test.java │ └── PolishMorfologikStemmerTest.java ├── morfologik-speller/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── morfologik/ │ │ └── speller/ │ │ ├── HMatrix.java │ │ └── Speller.java │ └── test/ │ ├── java/ │ │ └── morfologik/ │ │ └── speller/ │ │ ├── HMatrixTest.java │ │ └── SpellerTest.java │ └── resources/ │ └── morfologik/ │ └── speller/ │ ├── dict-with-freq.dict │ ├── dict-with-freq.info │ ├── dict-with-freq.txt │ ├── issue38.dict │ ├── issue38.info │ ├── issue38.input │ ├── issue94.dict │ ├── issue94.info │ ├── pissara-test.dict │ ├── pissara-test.info │ ├── pissara-test.txt │ ├── reps_dist2.dict │ ├── reps_dist2.info │ ├── reps_dist2.txt │ ├── single-char-word.dict │ ├── single-char-word.info │ ├── slownik.dict │ ├── slownik.info │ ├── test-infix.dict │ ├── test-infix.info │ ├── test-utf-spell.dict │ ├── test-utf-spell.info │ ├── test_freq_iso.dict │ └── test_freq_iso.info ├── morfologik-stemming/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ └── java/ │ │ └── morfologik/ │ │ └── stemming/ │ │ ├── ArrayViewList.java │ │ ├── BufferUtils.java │ │ ├── Dictionary.java │ │ ├── DictionaryAttribute.java │ │ ├── DictionaryIterator.java │ │ ├── DictionaryLookup.java │ │ ├── DictionaryMetadata.java │ │ ├── DictionaryMetadataBuilder.java │ │ ├── EncoderType.java │ │ ├── ISequenceEncoder.java │ │ ├── IStemmer.java │ │ ├── NoEncoder.java │ │ ├── TrimInfixAndSuffixEncoder.java │ │ ├── TrimPrefixAndSuffixEncoder.java │ │ ├── TrimSuffixEncoder.java │ │ ├── UnmappableInputException.java │ │ └── WordData.java │ └── test/ │ ├── java/ │ │ └── morfologik/ │ │ └── stemming/ │ │ ├── DictionaryLookupTest.java │ │ ├── DictionaryMetadataBuilderTest.java │ │ ├── DictionaryMetadataTest.java │ │ ├── DictionaryTest.java │ │ ├── EncodersTest.java │ │ └── SequenceEncodersTest.java │ └── resources/ │ └── morfologik/ │ └── stemming/ │ ├── escape-separator.info │ ├── test-diacritics-utf8.dict │ ├── 
test-diacritics-utf8.info │ ├── test-infix.dict │ ├── test-infix.info │ ├── test-prefix.dict │ ├── test-prefix.info │ ├── test-removed-props.dict │ ├── test-removed-props.info │ ├── test-separator-in-lookup.fsa │ ├── test-separator-in-lookup.in │ ├── test-separators.dict │ ├── test-separators.info │ ├── test-separators.txt │ ├── test-synth.dict │ ├── test-synth.info │ └── unicode-separator.info ├── morfologik-tools/ │ ├── pom.xml │ └── src/ │ ├── main/ │ │ ├── assembly/ │ │ │ └── package.xml │ │ ├── java/ │ │ │ └── morfologik/ │ │ │ └── tools/ │ │ │ ├── BinaryInput.java │ │ │ ├── CliTool.java │ │ │ ├── CustomParameterConverters.java │ │ │ ├── DictApply.java │ │ │ ├── DictCompile.java │ │ │ ├── DictDecompile.java │ │ │ ├── ExitStatus.java │ │ │ ├── ExitStatusException.java │ │ │ ├── FSABuild.java │ │ │ ├── FSACompile.java │ │ │ ├── FSADecompile.java │ │ │ ├── FSADump.java │ │ │ ├── FSAInfo.java │ │ │ ├── Launcher.java │ │ │ ├── SerializationFormat.java │ │ │ ├── ValidateFileExists.java │ │ │ └── ValidateParentDirExists.java │ │ └── package/ │ │ ├── README.txt │ │ └── examples/ │ │ ├── 01-fsa-build.input │ │ ├── 01-fsa-build.txt │ │ ├── 02-fsa-dump.txt │ │ ├── 03-fsa-info.txt │ │ ├── 04-dict-compile.info │ │ ├── 04-dict-compile.input │ │ ├── 04-dict-compile.txt │ │ └── 05-dict-decompile.txt │ └── test/ │ └── java/ │ └── morfologik/ │ └── tools/ │ ├── DictCompileBug.java │ ├── DictCompileTest.java │ └── FSACompileTest.java └── pom.xml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/ci.yml ================================================ name: CI on: push: branches: [master] pull_request: jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - name: Set up JDK 21 uses: actions/setup-java@v5 with: distribution: temurin java-version: 21 cache: maven - run: mvn --batch-mode verify 
================================================ FILE: .gitignore ================================================ *.versionsBackup tmp/ dist/ target/ *.patch .eclipse/ .project .classpath .settings *.name *.iml .idea/ ================================================ FILE: CHANGES.txt ================================================ Morfologik, Change Log ====================== For an up-to-date CHANGES file see https://github.com/morfologik/morfologik-stemming/blob/master/CHANGES ======================= morfologik-stemming 2.2.0 ======================= Bug Fixes * PR #121: fix bug in replacements: s>ss, ss>s (Jaume Ortolà). * PR #118: fix HMatrix not being reset between calls to Speller.findReplacementCandidates(), causing incorrect candidates to be returned on repeated calls (Jaume Ortolà). * GH-38: support ^ (start) and $ (end) anchors and _ (space) in replacement-pairs, following hunspell REP conventions. * GH-75: Fix incorrect and incomplete CharsetDecoder usage in Speller.findRepl(): missing charBuffer.clear() before decode and missing decoder.flush() after decode, which could produce wrong candidates for stateful encodings. Other Changes * apply spotless (google java format) formatting to sources. * switch to junit5/ jupiter and randomizedtesting-jupiter * Update Maven build plugins to current versions. * Require Java 21 for compiling the project. The output jar remains Java 11 compatible. ======================= morfologik-stemming 2.1.9 ======================= Other Changes * PR #114: improve run-on suggestions for camel case words (Jaume Ortolà) ======================= morfologik-stemming 2.1.8 ======================= Other Changes * GH-112: Add automatic module name to all JARs. * Upgrade selected build dependencies. ======================= morfologik-stemming 2.1.7 ======================= Bug Fixes * PR #103: fix distance value in the result of `Speller.findReplacementCandidates` (Daniel Naber). * GH-102: upgrade jcommander to newest version. 
(Dawid Weiss) Other Changes * PR #103: introduce `Speller.replaceRunOnWordCandidates()` which returns `CandidateData` (Daniel Naber). ======================= morfologik-stemming 2.1.6 ======================= Other Changes * PR #101: fix replaceRunOnWords() not working for words that are uppercase at sentence start (Daniel Naber). ======================= morfologik-stemming 2.1.5 ======================= Bug Fixes * PR #96: incorrect logic in runOnWords (Jaume Ortolà). * PR #97: micro performance optimization (Daniel Naber). Other Changes * GH-95: Speller: findReplacementCandidates returns full CandidateData. This commit also refactors the Speller to use a stateless returned array list rather than reuse an internal field. Should not make a practical difference. (Dawid Weiss) ======================= morfologik-stemming 2.1.4 ======================= Bug Fixes * PR #93: Case-changed words are always good suggestions (Jaume Ortolà). * GH-92: FSATraversal may return NOT_FOUND instead of AUTOMATON_HAS_PREFIX (stevendolg via Dawid Weiss) Other Changes * Updated build and test plugins to newer versions. ======================= morfologik-stemming 2.1.3 ======================= Bug Fixes * GH-86: Speller: words containing the dictionary separator are not handled properly (Jaume Ortolà via Dawid Weiss). ======================= morfologik-stemming 2.1.2 ======================= Bug Fixes * GH-85: Encoded sequences can clash with separator byte and cause assertion errors. (Daniel Naber, Dawid Weiss). ======================= morfologik-stemming 2.1.1 ======================= Bug Fixes * PR #78: Fix dependency issue in morfologik-speller (Alden Quimby). * GH-84: Dictionary resources not found with security manager. (Uwe Schindler) Other Changes * GH-79: Corrected a corner case in DictCompileTest. (Dawid Weiss) * GH-77: Trailing spaces in encoder name can lead to illegal argument exception. 
(Jaume Ortolà, Dawid Weiss) ======================= morfologik-stemming 2.1.0 ======================= New Features * GH-74: Add dict_apply tool to apply a dictionary to a file or stdin. (Dawid Weiss) * GH-73: Update Polish stemming dictionaries to polimorfologik 2.1. (Dawid Weiss) Bug Fixes * GH-76: Consolidate and fix character encoding and decoding. (Dawid Weiss) Other Changes * GH-63: BufferUtils.ensureCapacity now clears the input buffer. This also affects WordData methods that accept a reusable byte buffer -- it is now always cleared prior to being flipped and returned. (Dawid Weiss) ======================= morfologik-stemming 2.0.2 ======================= Bug Fixes * GH-68: WordData.clone() should be public. (Dawid Weiss) Other Changes * GH-64: reverted back OSGi annotations (bundle packaging). (Dawid Weiss) * GH-72: Rename tools: fsa_dump to fsa_decompile and fsa_build to fsa_compile. Existing names remain as aliases but will be removed in 2.1.0. (Dawid Weiss) ======================= morfologik-stemming 2.0.1 ======================= Bug Fixes * GH-65: Dictionary.read(URL) ends in NPE when reading from a JAR resource (Dawid Weiss) ======================= morfologik-stemming 2.0.0 ======================= This release comes with a cleanup of the API for Java 1.7. There are several aspects of the code that have been dropped (or added): - NIO is used extensively, mostly for better error reporting. - There is a simplified lookup of resources, no class-relative loading of dictionaries for example. The caller is in charge of looking up either an URL to the dictionary or providing an InputStream to it. - Removed internal caching of dictionaries from Dictionary. The Polish stemmer is initialized lazily and reuses its dictionary internally. - Numerous minor tweaks of parameters. JavaDocs. - A complete rewrite of the tools to compile (and decompile) FSA automata and complete stemming dictionaries. 
The tools now assert the validity of input data files and ensure no corrupt dictionaries can be produced. Changes in backwards compatibility policy * GH-64: Removed OSGi support because of Maven issues (forks build phases, tests, etc.). * GH-62: Recompress Polish dictionary to use ';' as the separator. (Dawid Weiss) * GH-59: Moved Dictionary.convertText utility to DictionaryLookup.applyReplacements and fixed current reliance on map ordering. (Dawid Weiss) * GH-55: Removed the "distribution" module entirely. The tools module should be self-organizing. Complete overhaul of all the tools. Examples. Simplified syntax, options and assumptions. Input sanity checks and validation. (Dawid Weiss) * GH-57: Restructured the project into FSA traversal/ reading (only) and FSA Builders (construction). This cleans up dependency structure as well (HPPC is not required for FSA traversals). (Dawid Weiss) * GH-54: Make Java 1.7 the minimum required version. Certain methods that relied on File as arguments have been removed or changed to accept Path. (Dawid Weiss) New Features * GH-53: Review library dependencies and bring them up to date. (Dawid Weiss) * Added OSGi support (Michal Hlavac) * GH-51: Remove and fail on deprecated metadata (fsa.dict.uses-*). (Dawid Weiss) Optimizations * GH-61: Refactored the code to use one encoding/ decoding routine and ByteBuffers. Removed dependency on Guava. Bug Fixes * GH-32: make replaceRunOnWords return "a lot" for "alot", etc. (Daniel Naber) * GH-34: ArrayIndexOutOfBoundsException with replacement-pairs. (Jaume Ortolà, Daniel Naber) ======================= morfologik-stemming 1.10.0 ======================= Changes in backwards compatibility policy New Features * Added OSGi support (Michal Hlavac) Bug Fixes * GH-32: make replaceRunOnWords return "a lot" for "alot", etc. (Daniel Naber) * GH-34: ArrayIndexOutOfBoundsException with replacement-pairs. 
(Jaume Ortolà, Daniel Naber) ======================= morfologik-stemming 1.9.1 ======================= Changes in backwards compatibility policy New Features Bug Fixes * Now only the longest replacement key is selected when using replacement pairs (thanks to Jaume Ortolà). This fixes a subtle regression introduced in 1.9.0. Optimizations ======================= morfologik-stemming 1.9.0 ======================= Changes in backwards compatibility policy New Features * Added capability to normalize input and output strings for dictionaries. This is useful for dictionaries that do not support ligatures, for example. To specify input conversion, use the property 'fsa.dict.input-conversion' in the .info file. The output conversion (for example, to use ligatures) is specified by 'fsa.dict.output-conversion'. Note that lengthy conversion tables may negatively affect performance. Bug Fixes Optimizations * The suggestion search for the speller is now performed directly by traversing the dictionary automaton, which makes it much more time-efficient (thanks to Jaume Ortolà). * Suggestions are generated faster by avoiding unnecessary case conversions. ======================= morfologik-stemming 1.8.3 ======================= Bug Fixes * Fixed a bug for spelling dictionaries in non-UTF encodings with separators: strings with non-encodable characters might have been accepted as spelled correctly even if they were missing in the dictionary. ======================= morfologik-stemming 1.8.2 ======================= New Features * Added the option of using frequencies of words for sorting spelling replacements. It can be used in both spelling and tagging dictionaries. 'fsa.dict.frequency-included=true' must be added to the .info file. For building the dictionary, add at the end of each entry a separator and a character between A and Z (A: less frequently used words; Z: more frequently used words). 
(Jaume Ortolà) ======================= morfologik-stemming 1.8.1 ======================= Changes in backwards compatibility policy * MorphEncodingTool will *fail* if it detects data/lines that contain the separator annotation byte. This is because such lines get encoded into something that the decoder cannot process. You can use \u0000 as the annotation byte to avoid clashes with any existing data. ======================= morfologik-stemming 1.8.0 ======================= Changes in backwards compatibility policy * Command-line option changes to MorphEncodingTool - it now accepts an explicit name of the sequence encoder, not infix/suffix/prefix booleans. * Updating dependencies to their newest versions. New Features * Dictionary .info files can specify the sequence decoder explicitly: suffix, prefix, infix, none are supported. For backwards compatibility, fsa.dict.uses-prefixes, fsa.dict.uses-infixes and fsa.dict.uses-suffixes are still supported, but will be removed in the next major version. * Command-line option changes to MorphEncodingTool - it now accepts an explicit name of the sequence encoder, not infix/suffix/prefix booleans. * Rewritten implementation of tab-separated data files (tab2morph tool). The output should yield smaller files, especially for prefix encoding and infix encoding. This does *not* necessarily mean smaller automata but we're working on getting these as well. Example output before and after refactoring: Prefix coder: postmodernizm|modernizm|xyz => [before] postmodernizm+ANmodernizm+xyz => [after ] postmodernizm+EA+xyz Infix coder: laquelle|lequel|D f s => [before] laquelle+AAHequel+D f s => [after ] laquelle+AGAquel+D f s * Changed the default format of the Polish dictionary from infix encoded to prefix encoded (smaller output size). Optimizations * A number of internal implementation cleanups and refactorings. 
======================= morfologik-stemming 1.7.2 ======================= * A quick fix for incorrect decoding of certain suffixes (long suffixes). * Increased max. recursion level in Speller to 6 from 4. (Jaume Ortolà) ======================= morfologik-stemming 1.7.1 ======================= * Fixed a couple of bugs in morfologik-speller (Jaume Ortolà). ======================= morfologik-stemming 1.7.0 ======================= * Changed DictionaryMetadata API (access methods for encoder/decoder). * Initial version of morfologik-speller component. * Minor changes to the FSADumpTool: the header block is always UTF-8 encoded, the default platform encoding does not matter. This is done to always support certain attributes that may be unicode (and would be incorrectly dumped otherwise). * Metadata *.info files can now be encoded in UTF-8 to support text attributes that otherwise would require text2ascii conversion. ======================= morfologik-stemming 1.6.0 ======================= * Update morfologik-polish data to Morfologik 2.0 PoliMorf (08.03.2013). Deprecated DICTIONARY constants (unified dictionary only). * Important! The format of encoding tags has changed and is now multiple-tags-per-lemma. The value returned from WordData#getTag may be a number of tags concatenated with a "+" character. Previously the same lemma/stem would be returned multiple times, each time with a different tag. * Moving code from SourceForge to github. ======================= morfologik-stemming 1.5.5 ======================= * Made hppc an optional component of morfologik-fsa. It is required for constructing FSA automata only and causes problems with javac. http://stackoverflow.com/questions/3800462/can-i-prevent-javac-accessing-the-class-path-from-the-manifests-of-our-third-par ======================= morfologik-stemming 1.5.4 ======================= * Replaced byte-based speller with CharBasedSpeller. * Warn about UTF-8 files with BOM. * Fixed a typo in package name (speller). 
======================= morfologik-stemming 1.5.3 ======================= * Initial release of spelling correction submodule. * Updated morfologik-polish data to morfologik 1.9 [12.06.2012] * Updated morfologik-polish licensing info to BSD (yay). ======================= morfologik-stemming 1.5.2 ======================= * An alternative Polish dictionary added (BSD licensed): SGJP (Morfeusz). PolishStemmer can now take an enum switching between the dictionary to be used or combine both. * Project split into modules. A single jar version (no external dependencies) added by transforming via proguard. * Enabled use of escaped special characters in the tab2morph tool. * Added guards against the input term having separator character somewhere (this will now return an empty list of matches). Added getSeparatorChar to DictionaryLookup so that one can check for this condition manually, if needed. ======================= morfologik-stemming 1.5.1 ======================= * Build system switch to Maven (tested with Maven2). ======================= morfologik-stemming 1.5.0 ======================= * Major size saving improvements in CFSA2. Built in Polish dictionary size decreased from 2,811,345 to 1,806,661 (CFSA2 format). * FSABuilder returns a ready-to-be-used FSA (ConstantArcSizeFSA). Construction overhead for this automaton is a round zero (it is immediately serialized in-memory). * Polish dictionary updated to Morfologik 1.7. [19.11.2010] * Added an option to serialize automaton to CFSA2 or FSA5 directly from fsa_build. * CFSA is now deprecated for serialization (the code still reads CFSA automata, but will not be able to serialize them). Use CFSA2. * Added immediate state interning. Speedup in automaton construction by about 30%, memory use decreased significantly (did not perform exact measurements, but incremental construction from presorted data should consume way less memory). * Added an option to build FSA from already sorted data (--sorted). 
Avoids in-memory sorting. Pipe the input through shell sort if building FSA from large data. * Changed the default ordering from Java signed-byte to C-like unsigned byte value. This lets one use GNU sort to sort the input using 'export LC_ALL=C; sort input'. * Added traversal routines to calculate perfect hashing based on FSA with NUMBERS. * Changed the order of serialized arcs in the binary serializer for FSA5 to lexicographic (consistent with the input). Depth-first traversal recreates the input, in other words. * Removed character-based automata. * Incompatible API changes to FSA builders (moved to morfologik.fsa). * Incompatible API changes to FSATraversalHelper. Cleaned up match types, added unit tests. * An external dependency HPPC (high performance primitive collections) is now required ======================= morfologik-stemming 1.4.1 ======================= * Upgrade of the built-in Morfologik dictionary for Polish (in CFSA format). * Added options to define custom FILLER and ANNOT_SEPARATOR bytes in the fsa_build tool. * Corrected an inconsistency with the C fsa package -- FILLER and ANNOT_SEPARATOR characters are now identical with the C version. * Cleanups to the tools' launcher -- will complain about missing JARs, if any. ======================= morfologik-stemming 1.4.0 ======================= * Added FSA5 construction in Java (on byte sequences). Added preliminary support for character sequences. Added a command line tool for FSA5 construction from unsorted data (sorting is done in-memory). * Added a tool to encode tab-delimited dictionaries to the format accepted by fsa_build and FSA5 construction tool. * Added a new version of Morfologik dictionary for Polish (in CFSA format). ======================= morfologik-stemming 1.3.0 ======================= * Added runtime checking for tools availability so that unavailable tools don't show up in the list. * Recompressed the built-in Polish dictionary to CFSA. * Cleaned up FSA/Dictionary separation. 
FSAs don't store encoding any more (because it does not make sense for them to do so). The FSA is a purely abstract class pushing functionality to sub-classes. Input stream reading cleaned up. * Added initial code for CFSA (compressed FSA). Reduces automata size about 10%. * Changes in the public API. Implementation classes renamed (FSAVer5Impl into FSA5). Major tweaks and tunes to the API. * Added support for version 5 automata built with NUMBERS flag (an extra field stored for each node). ======================= morfologik-stemming 1.2.2 ======================= * License switch to plain BSD (removed the patent clause which did not make much sense anyway). * The build ZIP now includes licenses for individual JARs (prevents confusion). ======================= morfologik-stemming 1.2.1 ======================= * Fixed tool launching routines. ======================= morfologik-stemming 1.2.0 ======================= * Package hierarchy reorganized. * Removed stempel (heuristic stemmer for Polish). * Code updated to Java 1.5. * The API has changed in many places (enums instead of constants, generics, iterables, removed explicit Arc and Node classes and replaced by int pointers). * FSA traversal in version 1.2 is implemented on top of primitive data structures (int pointers) to keep memory usage minimal. The speed boost gained from this is enormous and justifies less readable code. We strongly advise to use the provided iterators and helper functions for matching state sequences in the FSA. * Tools updated. Dumping existing FSAs is much, much faster now. ======================= morfologik-stemming 1.1.4 ======================= * Fixed a bug that caused UTF-8 dictionaries to be garbled. Now it should be relatively safe to use UTF-8 dictionaries (note: separators cannot be multibyte UTF-8 characters, yet this is probably a very rare case). 
======================= morfologik-stemming 1.1.3 ======================= * Fixed a bug causing NPE when the library is called with null context class loader (happens when JVM is invoked from a JNI-attached thread). Thanks to Patrick Luby for report and detailed analysis. * Updated the built-in dictionary to the newest version available. ======================= morfologik-stemming 1.1.2 ======================= * Fixed a bug causing JAR file locking (by implementing a workaround). * Fixed the build script (manifest file was broken). ======================= morfologik-stemming 1.1.1 ======================= * Distribution script fixes. The final JAR does not contain test classes and resources. Size trimmed almost twice compared to release 1.1. * Updated the dump tool to accept dictionary metadata files. ======================= morfologik-stemming 1.1 ========================= * Introduced auxiliary "meta" information files about compressed dictionaries. Such information include delimiter symbol, encoding and infix/prefix/postfix decoding info. * The API has changed (repackaging). Some deprecated methods have been removed. This is a major redesign/ upgrade, you will have to adjust your source code. * Cleaned up APIs and interfaces. * Added infrastructure for command-line tool launching. * Cleaned up tests. * Changed project name to morfologik-stemmers and ownership to (c) Morfologik. ======================= morfologik-stemming 1.0.7 ======================= * Removed one bug in fsa 'compression' decoding. ======================= morfologik-stemming 1.0.6 ======================= * Customized version of stempel replaced with a standard distribution. * Removed deprecated methods and classes. * Added infix and prefix encoding support for fsa dictionaries. ======================= morfologik-stemming 1.0.5 ======================= * Added filler and separator char dumps to FSADump. * A major bug in automaton traversal corrected. Upgrade when possible. 
* Certain API changes were introduced; older methods are now deprecated and will be removed in the future. ======================= morfologik-stemming 1.0.4 ======================= * Licenses for full and no-dict versions. ======================= morfologik-stemming 1.0.3 ======================= * Project code moved to SourceForge (subproject of Morfologik). LICENSE CHANGED FROM PUBLIC DOMAIN TO BSD (doesn't change much, but clarifies legal issues). ======================= morfologik-stemming 1.0.2 ======================= * Added a Lametyzator constructor which allows custom dictionary stream, field delimiters and encoding. Added an option for building stand-alone JAR that does not include the default polish dictionary. ======================= morfologik-stemming 1.0.1 ======================= * Code cleanups. Added a method that returns the third automaton's column (form). ======================= morfologik-stemming 1.0 ========================= * Initial release ================================================ FILE: CONTRIBUTING.txt ================================================ Contributions are welcome! Use a modern Java version for compilation and testing (JDK 21+ recommended). If you use Eclipse, set up project formatting and validation with: mvn -Peclipse ================================================ FILE: LICENSE.txt ================================================ Copyright (c) 2006 Dawid Weiss Copyright (c) 2007-2015 Dawid Weiss, Marcin Miłkowski All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
* Neither the name of Morfologik nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: README.txt ================================================ MORFOLOGIK ========== Tools for finite state automata construction and dictionary-based morphological dictionaries. Morphosyntactic dictionary for the Polish language. See the following for more information: Wiki: https://github.com/morfologik/morfologik-stemming/wiki Bugs: https://github.com/morfologik/morfologik-stemming/issues See CONTRIBUTING.txt if you'd like to add or change something. See LICENSE.txt to make your company's lawyer happy. See CHANGES.txt for API changes and updates. 
(c) Marcin Miłkowski, Dawid Weiss ================================================ FILE: etc/eclipse/settings/org.eclipse.jdt.core.prefs ================================================ eclipse.preferences.version=1 org.eclipse.jdt.core.compiler.annotation.inheritNullAnnotations=disabled org.eclipse.jdt.core.compiler.annotation.missingNonNullByDefaultAnnotation=ignore org.eclipse.jdt.core.compiler.annotation.nonnull=org.eclipse.jdt.annotation.NonNull org.eclipse.jdt.core.compiler.annotation.nonnullbydefault=org.eclipse.jdt.annotation.NonNullByDefault org.eclipse.jdt.core.compiler.annotation.nullable=org.eclipse.jdt.annotation.Nullable org.eclipse.jdt.core.compiler.annotation.nullanalysis=disabled org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled org.eclipse.jdt.core.compiler.codegen.methodParameters=do not generate org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve org.eclipse.jdt.core.compiler.compliance=1.7 org.eclipse.jdt.core.compiler.debug.lineNumber=generate org.eclipse.jdt.core.compiler.debug.localVariable=generate org.eclipse.jdt.core.compiler.debug.sourceFile=generate org.eclipse.jdt.core.compiler.doc.comment.support=enabled org.eclipse.jdt.core.compiler.problem.annotationSuperInterface=warning org.eclipse.jdt.core.compiler.problem.assertIdentifier=error org.eclipse.jdt.core.compiler.problem.autoboxing=ignore org.eclipse.jdt.core.compiler.problem.comparingIdentical=warning org.eclipse.jdt.core.compiler.problem.deadCode=warning org.eclipse.jdt.core.compiler.problem.deprecation=warning org.eclipse.jdt.core.compiler.problem.deprecationInDeprecatedCode=disabled org.eclipse.jdt.core.compiler.problem.deprecationWhenOverridingDeprecatedMethod=disabled org.eclipse.jdt.core.compiler.problem.discouragedReference=warning org.eclipse.jdt.core.compiler.problem.emptyStatement=ignore org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 
org.eclipse.jdt.core.compiler.problem.explicitlyClosedAutoCloseable=ignore org.eclipse.jdt.core.compiler.problem.fallthroughCase=ignore org.eclipse.jdt.core.compiler.problem.fatalOptionalError=disabled org.eclipse.jdt.core.compiler.problem.fieldHiding=ignore org.eclipse.jdt.core.compiler.problem.finalParameterBound=warning org.eclipse.jdt.core.compiler.problem.finallyBlockNotCompletingNormally=warning org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning org.eclipse.jdt.core.compiler.problem.hiddenCatchBlock=warning org.eclipse.jdt.core.compiler.problem.includeNullInfoFromAsserts=disabled org.eclipse.jdt.core.compiler.problem.incompatibleNonInheritedInterfaceMethod=warning org.eclipse.jdt.core.compiler.problem.incompleteEnumSwitch=warning org.eclipse.jdt.core.compiler.problem.indirectStaticAccess=ignore org.eclipse.jdt.core.compiler.problem.invalidJavadoc=error org.eclipse.jdt.core.compiler.problem.invalidJavadocTags=enabled org.eclipse.jdt.core.compiler.problem.invalidJavadocTagsDeprecatedRef=disabled org.eclipse.jdt.core.compiler.problem.invalidJavadocTagsNotVisibleRef=disabled org.eclipse.jdt.core.compiler.problem.invalidJavadocTagsVisibility=protected org.eclipse.jdt.core.compiler.problem.localVariableHiding=ignore org.eclipse.jdt.core.compiler.problem.methodWithConstructorName=warning org.eclipse.jdt.core.compiler.problem.missingDefaultCase=ignore org.eclipse.jdt.core.compiler.problem.missingDeprecatedAnnotation=ignore org.eclipse.jdt.core.compiler.problem.missingEnumCaseDespiteDefault=disabled org.eclipse.jdt.core.compiler.problem.missingHashCodeMethod=ignore org.eclipse.jdt.core.compiler.problem.missingJavadocComments=ignore org.eclipse.jdt.core.compiler.problem.missingJavadocCommentsOverriding=disabled org.eclipse.jdt.core.compiler.problem.missingJavadocCommentsVisibility=public org.eclipse.jdt.core.compiler.problem.missingJavadocTagDescription=return_tag org.eclipse.jdt.core.compiler.problem.missingJavadocTags=error 
org.eclipse.jdt.core.compiler.problem.missingJavadocTagsMethodTypeParameters=disabled org.eclipse.jdt.core.compiler.problem.missingJavadocTagsOverriding=disabled org.eclipse.jdt.core.compiler.problem.missingJavadocTagsVisibility=protected org.eclipse.jdt.core.compiler.problem.missingOverrideAnnotation=ignore org.eclipse.jdt.core.compiler.problem.missingOverrideAnnotationForInterfaceMethodImplementation=enabled org.eclipse.jdt.core.compiler.problem.missingSerialVersion=warning org.eclipse.jdt.core.compiler.problem.missingSynchronizedOnInheritedMethod=ignore org.eclipse.jdt.core.compiler.problem.noEffectAssignment=warning org.eclipse.jdt.core.compiler.problem.noImplicitStringConversion=warning org.eclipse.jdt.core.compiler.problem.nonExternalizedStringLiteral=ignore org.eclipse.jdt.core.compiler.problem.nonnullParameterAnnotationDropped=warning org.eclipse.jdt.core.compiler.problem.nullAnnotationInferenceConflict=error org.eclipse.jdt.core.compiler.problem.nullReference=warning org.eclipse.jdt.core.compiler.problem.nullSpecViolation=error org.eclipse.jdt.core.compiler.problem.nullUncheckedConversion=warning org.eclipse.jdt.core.compiler.problem.overridingPackageDefaultMethod=warning org.eclipse.jdt.core.compiler.problem.parameterAssignment=ignore org.eclipse.jdt.core.compiler.problem.possibleAccidentalBooleanAssignment=ignore org.eclipse.jdt.core.compiler.problem.potentialNullReference=ignore org.eclipse.jdt.core.compiler.problem.potentiallyUnclosedCloseable=ignore org.eclipse.jdt.core.compiler.problem.rawTypeReference=warning org.eclipse.jdt.core.compiler.problem.redundantNullAnnotation=warning org.eclipse.jdt.core.compiler.problem.redundantNullCheck=ignore org.eclipse.jdt.core.compiler.problem.redundantSpecificationOfTypeArguments=ignore org.eclipse.jdt.core.compiler.problem.redundantSuperinterface=ignore org.eclipse.jdt.core.compiler.problem.reportMethodCanBePotentiallyStatic=ignore org.eclipse.jdt.core.compiler.problem.reportMethodCanBeStatic=ignore 
org.eclipse.jdt.core.compiler.problem.specialParameterHidingField=disabled org.eclipse.jdt.core.compiler.problem.staticAccessReceiver=warning org.eclipse.jdt.core.compiler.problem.suppressOptionalErrors=disabled org.eclipse.jdt.core.compiler.problem.suppressWarnings=enabled org.eclipse.jdt.core.compiler.problem.syntacticNullAnalysisForFields=disabled org.eclipse.jdt.core.compiler.problem.syntheticAccessEmulation=ignore org.eclipse.jdt.core.compiler.problem.typeParameterHiding=warning org.eclipse.jdt.core.compiler.problem.unavoidableGenericTypeProblems=enabled org.eclipse.jdt.core.compiler.problem.uncheckedTypeOperation=warning org.eclipse.jdt.core.compiler.problem.unclosedCloseable=warning org.eclipse.jdt.core.compiler.problem.undocumentedEmptyBlock=ignore org.eclipse.jdt.core.compiler.problem.unhandledWarningToken=warning org.eclipse.jdt.core.compiler.problem.unnecessaryElse=ignore org.eclipse.jdt.core.compiler.problem.unnecessaryTypeCheck=ignore org.eclipse.jdt.core.compiler.problem.unqualifiedFieldAccess=ignore org.eclipse.jdt.core.compiler.problem.unusedDeclaredThrownException=ignore org.eclipse.jdt.core.compiler.problem.unusedDeclaredThrownExceptionExemptExceptionAndThrowable=enabled org.eclipse.jdt.core.compiler.problem.unusedDeclaredThrownExceptionIncludeDocCommentReference=enabled org.eclipse.jdt.core.compiler.problem.unusedDeclaredThrownExceptionWhenOverriding=disabled org.eclipse.jdt.core.compiler.problem.unusedImport=warning org.eclipse.jdt.core.compiler.problem.unusedLabel=warning org.eclipse.jdt.core.compiler.problem.unusedLocal=warning org.eclipse.jdt.core.compiler.problem.unusedObjectAllocation=ignore org.eclipse.jdt.core.compiler.problem.unusedParameter=ignore org.eclipse.jdt.core.compiler.problem.unusedParameterIncludeDocCommentReference=enabled org.eclipse.jdt.core.compiler.problem.unusedParameterWhenImplementingAbstract=disabled org.eclipse.jdt.core.compiler.problem.unusedParameterWhenOverridingConcrete=disabled 
org.eclipse.jdt.core.compiler.problem.unusedPrivateMember=warning org.eclipse.jdt.core.compiler.problem.unusedTypeParameter=ignore org.eclipse.jdt.core.compiler.problem.unusedWarningToken=warning org.eclipse.jdt.core.compiler.problem.varargsArgumentNeedCast=warning org.eclipse.jdt.core.compiler.source=1.7 org.eclipse.jdt.core.formatter.align_type_members_on_columns=false org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression=16 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_annotation=0 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant=16 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call=16 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation=16 org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression=16 org.eclipse.jdt.core.formatter.alignment_for_assignment=0 org.eclipse.jdt.core.formatter.alignment_for_binary_expression=16 org.eclipse.jdt.core.formatter.alignment_for_compact_if=16 org.eclipse.jdt.core.formatter.alignment_for_conditional_expression=80 org.eclipse.jdt.core.formatter.alignment_for_enum_constants=0 org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer=16 org.eclipse.jdt.core.formatter.alignment_for_method_declaration=0 org.eclipse.jdt.core.formatter.alignment_for_multiple_fields=16 org.eclipse.jdt.core.formatter.alignment_for_parameters_in_constructor_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_resources_in_try=80 org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation=16 org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration=16 
org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration=16 org.eclipse.jdt.core.formatter.alignment_for_union_type_in_multicatch=16 org.eclipse.jdt.core.formatter.blank_lines_after_imports=1 org.eclipse.jdt.core.formatter.blank_lines_after_package=1 org.eclipse.jdt.core.formatter.blank_lines_before_field=0 org.eclipse.jdt.core.formatter.blank_lines_before_first_class_body_declaration=0 org.eclipse.jdt.core.formatter.blank_lines_before_imports=1 org.eclipse.jdt.core.formatter.blank_lines_before_member_type=1 org.eclipse.jdt.core.formatter.blank_lines_before_method=1 org.eclipse.jdt.core.formatter.blank_lines_before_new_chunk=1 org.eclipse.jdt.core.formatter.blank_lines_before_package=0 org.eclipse.jdt.core.formatter.blank_lines_between_import_groups=1 org.eclipse.jdt.core.formatter.blank_lines_between_type_declarations=1 org.eclipse.jdt.core.formatter.brace_position_for_annotation_type_declaration=end_of_line org.eclipse.jdt.core.formatter.brace_position_for_anonymous_type_declaration=end_of_line org.eclipse.jdt.core.formatter.brace_position_for_array_initializer=end_of_line org.eclipse.jdt.core.formatter.brace_position_for_block=end_of_line org.eclipse.jdt.core.formatter.brace_position_for_block_in_case=end_of_line org.eclipse.jdt.core.formatter.brace_position_for_constructor_declaration=end_of_line org.eclipse.jdt.core.formatter.brace_position_for_enum_constant=end_of_line org.eclipse.jdt.core.formatter.brace_position_for_enum_declaration=end_of_line org.eclipse.jdt.core.formatter.brace_position_for_lambda_body=end_of_line org.eclipse.jdt.core.formatter.brace_position_for_method_declaration=end_of_line org.eclipse.jdt.core.formatter.brace_position_for_switch=end_of_line org.eclipse.jdt.core.formatter.brace_position_for_type_declaration=end_of_line org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_block_comment=false 
org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_javadoc_comment=false org.eclipse.jdt.core.formatter.comment.format_block_comments=false org.eclipse.jdt.core.formatter.comment.format_header=false org.eclipse.jdt.core.formatter.comment.format_html=true org.eclipse.jdt.core.formatter.comment.format_javadoc_comments=true org.eclipse.jdt.core.formatter.comment.format_line_comments=false org.eclipse.jdt.core.formatter.comment.format_source_code=true org.eclipse.jdt.core.formatter.comment.indent_parameter_description=true org.eclipse.jdt.core.formatter.comment.indent_root_tags=true org.eclipse.jdt.core.formatter.comment.insert_new_line_before_root_tags=insert org.eclipse.jdt.core.formatter.comment.insert_new_line_for_parameter=insert org.eclipse.jdt.core.formatter.comment.line_length=80 org.eclipse.jdt.core.formatter.comment.new_lines_at_block_boundaries=true org.eclipse.jdt.core.formatter.comment.new_lines_at_javadoc_boundaries=true org.eclipse.jdt.core.formatter.comment.preserve_white_space_between_code_and_line_comments=false org.eclipse.jdt.core.formatter.compact_else_if=true org.eclipse.jdt.core.formatter.continuation_indentation=2 org.eclipse.jdt.core.formatter.continuation_indentation_for_array_initializer=2 org.eclipse.jdt.core.formatter.disabling_tag=@formatter\:off org.eclipse.jdt.core.formatter.enabling_tag=@formatter\:on org.eclipse.jdt.core.formatter.format_guardian_clause_on_one_line=false org.eclipse.jdt.core.formatter.format_line_comment_starting_on_first_column=true org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_annotation_declaration_header=true org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_constant_header=true org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_declaration_header=true org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_type_header=true org.eclipse.jdt.core.formatter.indent_breaks_compare_to_cases=true 
org.eclipse.jdt.core.formatter.indent_empty_lines=false org.eclipse.jdt.core.formatter.indent_statements_compare_to_block=true org.eclipse.jdt.core.formatter.indent_statements_compare_to_body=true org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_cases=true org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_switch=true org.eclipse.jdt.core.formatter.indentation.size=2 org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_field=insert org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_local_variable=insert org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_method=insert org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_package=insert org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_parameter=do not insert org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_type=insert org.eclipse.jdt.core.formatter.insert_new_line_after_label=do not insert org.eclipse.jdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer=do not insert org.eclipse.jdt.core.formatter.insert_new_line_after_type_annotation=do not insert org.eclipse.jdt.core.formatter.insert_new_line_at_end_of_file_if_missing=do not insert org.eclipse.jdt.core.formatter.insert_new_line_before_catch_in_try_statement=do not insert org.eclipse.jdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer=do not insert org.eclipse.jdt.core.formatter.insert_new_line_before_else_in_if_statement=do not insert org.eclipse.jdt.core.formatter.insert_new_line_before_finally_in_try_statement=do not insert org.eclipse.jdt.core.formatter.insert_new_line_before_while_in_do_statement=do not insert org.eclipse.jdt.core.formatter.insert_new_line_in_empty_annotation_declaration=insert org.eclipse.jdt.core.formatter.insert_new_line_in_empty_anonymous_type_declaration=insert org.eclipse.jdt.core.formatter.insert_new_line_in_empty_block=insert 
org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_constant=insert org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_declaration=insert org.eclipse.jdt.core.formatter.insert_new_line_in_empty_method_body=insert org.eclipse.jdt.core.formatter.insert_new_line_in_empty_type_declaration=insert org.eclipse.jdt.core.formatter.insert_space_after_and_in_type_parameter=insert org.eclipse.jdt.core.formatter.insert_space_after_assignment_operator=insert org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation=do not insert org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation_type_declaration=do not insert org.eclipse.jdt.core.formatter.insert_space_after_binary_operator=insert org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_arguments=insert org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_parameters=insert org.eclipse.jdt.core.formatter.insert_space_after_closing_brace_in_block=insert org.eclipse.jdt.core.formatter.insert_space_after_closing_paren_in_cast=insert org.eclipse.jdt.core.formatter.insert_space_after_colon_in_assert=insert org.eclipse.jdt.core.formatter.insert_space_after_colon_in_case=insert org.eclipse.jdt.core.formatter.insert_space_after_colon_in_conditional=insert org.eclipse.jdt.core.formatter.insert_space_after_colon_in_for=insert org.eclipse.jdt.core.formatter.insert_space_after_colon_in_labeled_statement=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_allocation_expression=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_annotation=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_array_initializer=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_parameters=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_throws=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_constant_arguments=insert 
org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_declarations=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_explicitconstructorcall_arguments=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_increments=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_inits=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_throws=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_field_declarations=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_local_declarations=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_parameterized_type_reference=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_superinterfaces=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_arguments=insert org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_parameters=insert org.eclipse.jdt.core.formatter.insert_space_after_ellipsis=insert org.eclipse.jdt.core.formatter.insert_space_after_lambda_arrow=insert org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_parameterized_type_reference=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_arguments=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_parameters=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_brace_in_array_initializer=insert org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_allocation_expression=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_reference=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_annotation=do 
not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_cast=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_catch=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_constructor_declaration=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_enum_constant=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_for=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_if=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_declaration=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_invocation=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_switch=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_synchronized=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_try=do not insert org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_while=do not insert org.eclipse.jdt.core.formatter.insert_space_after_postfix_operator=do not insert org.eclipse.jdt.core.formatter.insert_space_after_prefix_operator=do not insert org.eclipse.jdt.core.formatter.insert_space_after_question_in_conditional=insert org.eclipse.jdt.core.formatter.insert_space_after_question_in_wildcard=do not insert org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_for=insert org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_try_resources=insert org.eclipse.jdt.core.formatter.insert_space_after_unary_operator=do not insert org.eclipse.jdt.core.formatter.insert_space_before_and_in_type_parameter=insert org.eclipse.jdt.core.formatter.insert_space_before_assignment_operator=insert 
org.eclipse.jdt.core.formatter.insert_space_before_at_in_annotation_type_declaration=insert org.eclipse.jdt.core.formatter.insert_space_before_binary_operator=insert org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_parameterized_type_reference=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_arguments=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_parameters=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_brace_in_array_initializer=insert org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_allocation_expression=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_reference=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_annotation=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_cast=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_catch=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_constructor_declaration=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_enum_constant=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_for=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_if=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_declaration=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_invocation=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_switch=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_synchronized=do not insert 
org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_try=do not insert org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_while=do not insert org.eclipse.jdt.core.formatter.insert_space_before_colon_in_assert=insert org.eclipse.jdt.core.formatter.insert_space_before_colon_in_case=do not insert org.eclipse.jdt.core.formatter.insert_space_before_colon_in_conditional=insert org.eclipse.jdt.core.formatter.insert_space_before_colon_in_default=do not insert org.eclipse.jdt.core.formatter.insert_space_before_colon_in_for=insert org.eclipse.jdt.core.formatter.insert_space_before_colon_in_labeled_statement=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_allocation_expression=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_annotation=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_array_initializer=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_parameters=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_throws=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_constant_arguments=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_declarations=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_explicitconstructorcall_arguments=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_increments=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_inits=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_throws=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments=do not insert 
org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_field_declarations=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_local_declarations=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_parameterized_type_reference=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_superinterfaces=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_arguments=do not insert org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_parameters=do not insert org.eclipse.jdt.core.formatter.insert_space_before_ellipsis=do not insert org.eclipse.jdt.core.formatter.insert_space_before_lambda_arrow=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_parameterized_type_reference=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_arguments=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_parameters=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_annotation_type_declaration=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_anonymous_type_declaration=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_array_initializer=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_block=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_constructor_declaration=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_constant=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_declaration=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_method_declaration=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_switch=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_type_declaration=insert 
org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_allocation_expression=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_reference=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_type_reference=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation_type_member_declaration=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_catch=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_constructor_declaration=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_enum_constant=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_for=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_if=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_declaration=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_invocation=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression=do not insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_switch=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_synchronized=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_try=insert org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_while=insert org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_return=insert org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_throw=insert org.eclipse.jdt.core.formatter.insert_space_before_postfix_operator=do not insert org.eclipse.jdt.core.formatter.insert_space_before_prefix_operator=do not insert 
org.eclipse.jdt.core.formatter.insert_space_before_question_in_conditional=insert org.eclipse.jdt.core.formatter.insert_space_before_question_in_wildcard=do not insert org.eclipse.jdt.core.formatter.insert_space_before_semicolon=do not insert org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_for=do not insert org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_try_resources=do not insert org.eclipse.jdt.core.formatter.insert_space_before_unary_operator=do not insert org.eclipse.jdt.core.formatter.insert_space_between_brackets_in_array_type_reference=do not insert org.eclipse.jdt.core.formatter.insert_space_between_empty_braces_in_array_initializer=do not insert org.eclipse.jdt.core.formatter.insert_space_between_empty_brackets_in_array_allocation_expression=do not insert org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_annotation_type_member_declaration=do not insert org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_constructor_declaration=do not insert org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_enum_constant=do not insert org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_declaration=do not insert org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_invocation=do not insert org.eclipse.jdt.core.formatter.join_lines_in_comments=true org.eclipse.jdt.core.formatter.join_wrapped_lines=true org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line=false org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line=false org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line=false org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line=false org.eclipse.jdt.core.formatter.lineSplit=120 org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column=false org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column=false 
org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body=0 org.eclipse.jdt.core.formatter.number_of_empty_lines_to_preserve=1 org.eclipse.jdt.core.formatter.put_empty_statement_on_new_line=true org.eclipse.jdt.core.formatter.tabulation.char=space org.eclipse.jdt.core.formatter.tabulation.size=2 org.eclipse.jdt.core.formatter.use_on_off_tags=true org.eclipse.jdt.core.formatter.use_tabs_only_for_leading_indentations=false org.eclipse.jdt.core.formatter.wrap_before_binary_operator=true org.eclipse.jdt.core.formatter.wrap_before_or_operator_multicatch=true org.eclipse.jdt.core.formatter.wrap_outer_expressions_when_nested=true ================================================ FILE: etc/eclipse/settings/org.eclipse.m2e.core.prefs ================================================ activeProfiles=eclipse eclipse.preferences.version=1 resolveWorkspaceProjects=true version=1 ================================================ FILE: etc/forbidden-apis/signatures.txt ================================================ @defaultMessage Convert to URI java.net.URL#getPath() java.net.URL#getFile() @defaultMessage spawns threads with vague names; use a custom thread factory and name threads so that you can tell (by its name) which executor it is associated with java.util.concurrent.Executors#newFixedThreadPool(int) java.util.concurrent.Executors#newSingleThreadExecutor() java.util.concurrent.Executors#newCachedThreadPool() java.util.concurrent.Executors#newSingleThreadScheduledExecutor() java.util.concurrent.Executors#newScheduledThreadPool(int) java.util.concurrent.Executors#defaultThreadFactory() java.util.concurrent.Executors#privilegedThreadFactory() java.lang.Character#codePointBefore(char[],int) @ Implicit start offset is error-prone when the char[] is a buffer and the first chars are random chars java.lang.Character#codePointAt(char[],int) @ Implicit end offset is error-prone when the char[] is a buffer and the last chars are random chars @defaultMessage 
Please do not try to stop the world java.lang.System#gc() @defaultMessage Use Channels.* methods to write to channels. Do not write directly. java.nio.channels.WritableByteChannel#write(java.nio.ByteBuffer) java.nio.channels.FileChannel#write(java.nio.ByteBuffer, long) java.nio.channels.GatheringByteChannel#write(java.nio.ByteBuffer[], int, int) java.nio.channels.GatheringByteChannel#write(java.nio.ByteBuffer[]) java.nio.channels.ReadableByteChannel#read(java.nio.ByteBuffer) java.nio.channels.ScatteringByteChannel#read(java.nio.ByteBuffer[]) java.nio.channels.ScatteringByteChannel#read(java.nio.ByteBuffer[], int, int) java.nio.channels.FileChannel#read(java.nio.ByteBuffer, long) @defaultMessage Filters are trappy (add suppression or make sure all read methods are redelegated). java.io.FilterInputStream#(java.io.InputStream) java.io.FilterOutputStream#(java.io.OutputStream) java.io.FilterReader#(java.io.Reader) java.io.FilterWriter#(java.io.Writer) #@defaultMessage Do not use context class loaders, prefer explicit ClassLoader argument. java.lang.Thread@getContextClassLoader() java.lang.Thread@setContextClassLoader() ================================================ FILE: morfologik-fsa/pom.xml ================================================ 4.0.0 org.carrot2 morfologik-parent 2.2.0-SNAPSHOT ../pom.xml morfologik-fsa bundle Morfologik FSA (Traversal) Morfologik Finite State Automata Traversal. ../etc/forbidden-apis/signatures.txt org.carrot2.morfologik.fsa org.apache.felix maven-bundle-plugin morfologik.fsa * ================================================ FILE: morfologik-fsa/src/main/java/morfologik/fsa/ByteSequenceIterator.java ================================================ package morfologik.fsa; import java.nio.ByteBuffer; import java.util.*; /** * An iterator that traverses the right language of a given node (all sequences reachable from a * given node). 
*/ public final class ByteSequenceIterator implements Iterator { /** * Default expected depth of the recursion stack (estimated longest sequence in the automaton). * Buffers expand by the same value if exceeded. */ private static final int EXPECTED_MAX_STATES = 15; /** The FSA to which this iterator belongs. */ private final FSA fsa; /** An internal cache for the next element in the FSA */ private ByteBuffer nextElement; /** A buffer for the current sequence of bytes from the current node to the root. */ private byte[] buffer = new byte[EXPECTED_MAX_STATES]; /** Reusable byte buffer wrapper around {@link #buffer}. */ private ByteBuffer bufferWrapper = ByteBuffer.wrap(buffer); /** An arc stack for DFS when processing the automaton. */ private int[] arcs = new int[EXPECTED_MAX_STATES]; /** Current processing depth in {@link #arcs}. */ private int position; /** * Create an instance of the iterator iterating over all automaton sequences. * * @param fsa The automaton to iterate over. */ public ByteSequenceIterator(FSA fsa) { this(fsa, fsa.getRootNode()); } /** * Create an instance of the iterator for a given node. * * @param fsa The automaton to iterate over. * @param node The starting node's identifier (can be the {@link FSA#getRootNode()}). */ public ByteSequenceIterator(FSA fsa, int node) { this.fsa = fsa; if (fsa.getFirstArc(node) != 0) { restartFrom(node); } } /** * Restart walking from node. Allows iterator reuse. * * @param node Restart the iterator from node. * @return Returns this for call chaining. */ public ByteSequenceIterator restartFrom(int node) { position = 0; bufferWrapper.clear(); nextElement = null; pushNode(node); return this; } /** Returns true if there are still elements in this iterator. */ @Override public boolean hasNext() { if (nextElement == null) { nextElement = advance(); } return nextElement != null; } /** * @return Returns a {@link ByteBuffer} with the sequence corresponding to the next final state in * the automaton. 
*/ @Override public ByteBuffer next() { if (nextElement != null) { final ByteBuffer cache = nextElement; nextElement = null; return cache; } else { final ByteBuffer cache = advance(); if (cache == null) { throw new NoSuchElementException(); } return cache; } } /** Advances to the next available final state. */ private final ByteBuffer advance() { if (position == 0) { return null; } while (position > 0) { final int lastIndex = position - 1; final int arc = arcs[lastIndex]; if (arc == 0) { // Remove the current node from the queue. position--; continue; } // Go to the next arc, but leave it on the stack // so that we keep the recursion depth level accurate. arcs[lastIndex] = fsa.getNextArc(arc); // Expand buffer if needed. final int bufferLength = this.buffer.length; if (lastIndex >= bufferLength) { this.buffer = Arrays.copyOf(buffer, bufferLength + EXPECTED_MAX_STATES); this.bufferWrapper = ByteBuffer.wrap(buffer); } buffer[lastIndex] = fsa.getArcLabel(arc); if (!fsa.isArcTerminal(arc)) { // Recursively descend into the arc's node. pushNode(fsa.getEndNode(arc)); } if (fsa.isArcFinal(arc)) { bufferWrapper.clear(); bufferWrapper.limit(lastIndex + 1); return bufferWrapper; } } return null; } /** Not implemented in this iterator. */ @Override public void remove() { throw new UnsupportedOperationException("Read-only iterator."); } /** Descends to a given node, adds its arcs to the stack to be traversed. */ private void pushNode(int node) { // Expand buffers if needed. if (position == arcs.length) { arcs = Arrays.copyOf(arcs, arcs.length + EXPECTED_MAX_STATES); } arcs[position++] = fsa.getFirstArc(node); } } ================================================ FILE: morfologik-fsa/src/main/java/morfologik/fsa/CFSA.java ================================================ package morfologik.fsa; import static morfologik.fsa.FSAFlags.*; import java.io.*; import java.util.*; /** * CFSA (Compact Finite State Automaton) binary format implementation. 
This is a slightly * reorganized version of {@link FSA5} offering smaller automata size at some (minor) performance * penalty. * *

Note: Serialize to {@link CFSA2} for new code. * *

The encoding of automaton body is as follows. * *

 * ---- FSA header (standard)
 * Byte                            Description
 *       +-+-+-+-+-+-+-+-+\
 *     0 | | | | | | | | | +------ '\'
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     1 | | | | | | | | | +------ 'f'
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     2 | | | | | | | | | +------ 's'
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     3 | | | | | | | | | +------ 'a'
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     4 | | | | | | | | | +------ version (fixed 0xc5)
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     5 | | | | | | | | | +------ filler character
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     6 | | | | | | | | | +------ annot character
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     7 |C|C|C|C|G|G|G|G| +------ C - node data size (ctl), G - address size (gotoLength)
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *  8-32 | | | | | | | | | +------ labels mapped for type (1) of arc encoding.
 *       : : : : : : : : : |
 *       +-+-+-+-+-+-+-+-+/
 *
 * ---- Start of a node; only if automaton was compiled with NUMBERS option.
 *
 * Byte
 *        +-+-+-+-+-+-+-+-+\
 *      0 | | | | | | | | | \  LSB
 *        +-+-+-+-+-+-+-+-+  +
 *      1 | | | | | | | | |  |      number of strings recognized
 *        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
 *        : : : : : : : : :  |      from this node.
 *        +-+-+-+-+-+-+-+-+  +
 *  ctl-1 | | | | | | | | | /  MSB
 *        +-+-+-+-+-+-+-+-+/
 *
 * ---- A vector of node's arcs. Conditional format, depending on flags.
 *
 * 1) NEXT bit set, mapped arc label.
 *
 *                +--------------- arc's label mapped in M bits if M's field value > 0
 *                | +------------- node pointed to is next
 *                | | +----------- the last arc of the node
 *         _______| | | +--------- the arc is final
 *        /       | | | |
 *       +-+-+-+-+-+-+-+-+\
 *     0 |M|M|M|M|M|1|L|F| +------ flags + (M) index of the mapped label.
 *       +-+-+-+-+-+-+-+-+/
 *
 * 2) NEXT bit set, label separate.
 *
 *                +--------------- arc's label stored separately (M's field is zero).
 *                | +------------- node pointed to is next
 *                | | +----------- the last arc of the node
 *                | | | +--------- the arc is final
 *                | | | |
 *       +-+-+-+-+-+-+-+-+\
 *     0 |0|0|0|0|0|1|L|F| +------ flags
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     1 | | | | | | | | | +------ label
 *       +-+-+-+-+-+-+-+-+/
 *
 * 3) NEXT bit not set. Full arc.
 *
 *                  +------------- node pointed to is next
 *                  | +----------- the last arc of the node
 *                  | | +--------- the arc is final
 *                  | | |
 *       +-+-+-+-+-+-+-+-+\
 *     0 |A|A|A|A|A|0|L|F| +------ flags + (A) address field, lower bits
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     1 | | | | | | | | | +------ label
 *       +-+-+-+-+-+-+-+-+/
 *       : : : : : : : : :
 *       +-+-+-+-+-+-+-+-+\
 * gtl-1 |A|A|A|A|A|A|A|A| +------ address, continuation (MSB)
 *       +-+-+-+-+-+-+-+-+/
 * 
*/ public final class CFSA extends FSA { /** Automaton header version value. */ public static final byte VERSION = (byte) 0xC5; /** * Bitmask indicating that an arc corresponds to the last character of a sequence available when * building the automaton. */ public static final int BIT_FINAL_ARC = 1 << 0; /** * Bitmask indicating that an arc is the last one of the node's list and the following one belongs * to another node. */ public static final int BIT_LAST_ARC = 1 << 1; /** * Bitmask indicating that the target node of this arc follows it in the compressed automaton * structure (no goto field). */ public static final int BIT_TARGET_NEXT = 1 << 2; /** * An array of bytes with the internal representation of the automaton. Please see the * documentation of this class for more information on how this structure is organized. */ public byte[] arcs; /** * The length of the node header structure (if the automaton was compiled with NUMBERS * option). Otherwise zero. */ public final int nodeDataLength; /** Flags for this automaton version. */ private final Set flags; /** Number of bytes each address takes in full, expanded form (goto length). */ public final int gtl; /** * Label mapping for arcs of type (1) (see class documentation). The array is indexed by mapped * label's value and contains the original label. */ public final byte[] labelMapping; /** Creates a new automaton, reading it from a file in FSA format, version 5. */ CFSA(InputStream stream) throws IOException { DataInputStream in = new DataInputStream(stream); // Skip legacy header fields. in.readByte(); // filler in.readByte(); // annotation final byte hgtl = in.readByte(); /* * Determine if the automaton was compiled with NUMBERS. If so, modify * ctl and goto fields accordingly. 
*/ flags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT); if ((hgtl & 0xf0) != 0) { this.nodeDataLength = (hgtl >>> 4) & 0x0f; this.gtl = hgtl & 0x0f; flags.add(NUMBERS); } else { this.nodeDataLength = 0; this.gtl = hgtl & 0x0f; } /* * Read mapping dictionary. */ labelMapping = new byte[1 << 5]; in.readFully(labelMapping); /* * Read arcs' data. */ arcs = readRemaining(in); } /** * Returns the start node of this automaton. May return 0 if the start node is also * an end node. */ @Override public int getRootNode() { // Skip dummy node marking terminating state. final int epsilonNode = skipArc(getFirstArc(0)); // And follow the epsilon node's first (and only) arc. return getDestinationNodeOffset(getFirstArc(epsilonNode)); } /** {@inheritDoc} */ @Override public final int getFirstArc(int node) { return nodeDataLength + node; } /** {@inheritDoc} */ @Override public final int getNextArc(int arc) { if (isArcLast(arc)) return 0; else return skipArc(arc); } /** {@inheritDoc} */ @Override public int getArc(int node, byte label) { for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { if (getArcLabel(arc) == label) return arc; } // An arc labeled with "label" not found. 
return 0; } /** {@inheritDoc} */ @Override public int getEndNode(int arc) { final int nodeOffset = getDestinationNodeOffset(arc); if (0 == nodeOffset) { throw new RuntimeException("This is a terminal arc [" + arc + "]"); } return nodeOffset; } /** {@inheritDoc} */ @Override public byte getArcLabel(int arc) { if (isNextSet(arc) && isLabelCompressed(arc)) { return this.labelMapping[(arcs[arc] >>> 3) & 0x1f]; } else { return arcs[arc + 1]; } } /** {@inheritDoc} */ @Override public int getRightLanguageCount(int node) { assert getFlags().contains(FSAFlags.NUMBERS) : "This FSA was not compiled with NUMBERS."; return FSA5.decodeFromBytes(arcs, node, nodeDataLength); } /** {@inheritDoc} */ @Override public boolean isArcFinal(int arc) { return (arcs[arc] & BIT_FINAL_ARC) != 0; } /** {@inheritDoc} */ @Override public boolean isArcTerminal(int arc) { return (0 == getDestinationNodeOffset(arc)); } /** * Returns true if this arc has NEXT bit set. * * @see #BIT_LAST_ARC * @param arc The node's arc identifier. * @return Returns true if the argument is the last arc of a node. */ public boolean isArcLast(int arc) { return (arcs[arc] & BIT_LAST_ARC) != 0; } /** * @see #BIT_TARGET_NEXT * @param arc The node's arc identifier. * @return Returns true if {@link #BIT_TARGET_NEXT} is set for this arc. */ public boolean isNextSet(int arc) { return (arcs[arc] & BIT_TARGET_NEXT) != 0; } /** * @param arc The node's arc identifier. * @return Returns true if the label is compressed inside flags byte. */ public boolean isLabelCompressed(int arc) { assert isNextSet(arc) : "Only applicable to arcs with NEXT bit."; return (arcs[arc] & (-1 << 3)) != 0; } /** * {@inheritDoc} * *

For this automaton version, an additional {@link FSAFlags#NUMBERS} flag may be set to * indicate the automaton contains extra fields for each node. */ public Set getFlags() { return Collections.unmodifiableSet(flags); } /** Returns the address of the node pointed to by this arc. */ final int getDestinationNodeOffset(int arc) { if (isNextSet(arc)) { /* The destination node follows this arc in the array. */ return skipArc(arc); } else { /* * The destination node address has to be extracted from the arc's * goto field. */ int r = 0; for (int i = gtl; --i >= 1; ) { r = r << 8 | (arcs[arc + 1 + i] & 0xff); } r = r << 8 | (arcs[arc] & 0xff); return r >>> 3; } } /** Read the arc's layout and skip as many bytes, as needed, to skip it. */ private int skipArc(int offset) { if (isNextSet(offset)) { if (isLabelCompressed(offset)) { offset++; } else { offset += 1 + 1; } } else { offset += 1 + gtl; } return offset; } } ================================================ FILE: morfologik-fsa/src/main/java/morfologik/fsa/CFSA2.java ================================================ package morfologik.fsa; import java.io.DataInputStream; import java.io.IOException; import java.io.InputStream; import java.util.EnumSet; import java.util.Set; /** * CFSA (Compact Finite State Automaton) binary format implementation, version 2: * *

    *
  • {@link #BIT_TARGET_NEXT} applicable on all arcs, not necessarily the last one. *
  • v-coded goto field *
  • v-coded perfect hashing numbers, if any *
  • 31 most frequent labels integrated with flags byte *
* *

The encoding of automaton body is as follows. * *

 * ---- CFSA header
 * Byte                            Description
 *       +-+-+-+-+-+-+-+-+\
 *     0 | | | | | | | | | +------ '\'
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     1 | | | | | | | | | +------ 'f'
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     2 | | | | | | | | | +------ 's'
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     3 | | | | | | | | | +------ 'a'
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     4 | | | | | | | | | +------ version (fixed 0xc6)
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     5 | | | | | | | | | +----\
 *       +-+-+-+-+-+-+-+-+/      \ flags [MSB first]
 *       +-+-+-+-+-+-+-+-+\      /
 *     6 | | | | | | | | | +----/
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     7 | | | | | | | | | +------ label lookup table size
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *  8-32 | | | | | | | | | +------ label value lookup table
 *       : : : : : : : : : |
 *       +-+-+-+-+-+-+-+-+/
 *
 * ---- Start of a node; only if automaton was compiled with NUMBERS option.
 *
 * Byte
 *        +-+-+-+-+-+-+-+-+\
 *      0 | | | | | | | | | \
 *        +-+-+-+-+-+-+-+-+  +
 *      1 | | | | | | | | |  |      number of strings recognized
 *        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
 *        : : : : : : : : :  |      from this node. v-coding
 *        +-+-+-+-+-+-+-+-+  +
 *        | | | | | | | | | /
 *        +-+-+-+-+-+-+-+-+/
 *
 * ---- A vector of this node's arcs. An arc's layout depends on the combination of flags.
 *
 * 1) NEXT bit set, mapped arc label.
 *
 *        +----------------------- node pointed to is next
 *        | +--------------------- the last arc of the node
 *        | | +------------------- this arc leads to a final state (acceptor)
 *        | | |  _______+--------- arc's label; indexed if M > 0, otherwise explicit label follows
 *        | | | / | | | |
 *       +-+-+-+-+-+-+-+-+\
 *     0 |N|L|F|M|M|M|M|M| +------ flags + (M) index of the mapped label.
 *       +-+-+-+-+-+-+-+-+/
 *       +-+-+-+-+-+-+-+-+\
 *     1 | | | | | | | | | +------ optional label if M == 0
 *       +-+-+-+-+-+-+-+-+/
 *       : : : : : : : : :
 *       +-+-+-+-+-+-+-+-+\
 *       |A|A|A|A|A|A|A|A| +------ v-coded goto address
 *       +-+-+-+-+-+-+-+-+/
 * 
*/ public final class CFSA2 extends FSA { /** Automaton header version value. */ public static final byte VERSION = (byte) 0xc6; /** The target node of this arc follows the last arc of the current state (no goto field). */ public static final int BIT_TARGET_NEXT = 1 << 7; /** The arc is the last one from the current node's arcs list. */ public static final int BIT_LAST_ARC = 1 << 6; /** * The arc corresponds to the last character of a sequence available when building the automaton * (acceptor transition). */ public static final int BIT_FINAL_ARC = 1 << 5; /** The count of bits assigned to storing an indexed label. */ static final int LABEL_INDEX_BITS = 5; /** Masks only the M bits of a flag byte. */ static final int LABEL_INDEX_MASK = (1 << LABEL_INDEX_BITS) - 1; /** Maximum size of the labels index. */ public static final int LABEL_INDEX_SIZE = (1 << LABEL_INDEX_BITS) - 1; /** * An array of bytes with the internal representation of the automaton. Please see the * documentation of this class for more information on how this structure is organized. */ public byte[] arcs; /** Flags for this automaton version. */ private final EnumSet flags; /** Label mapping for M-indexed labels. */ public final byte[] labelMapping; /** If true states are prepended with numbers. */ private final boolean hasNumbers; /** Epsilon node's offset. */ private final int epsilon = 0; /** Reads an automaton from a byte stream. */ CFSA2(InputStream stream) throws IOException { DataInputStream in = new DataInputStream(stream); // Read flags. short flagBits = in.readShort(); flags = EnumSet.noneOf(FSAFlags.class); for (FSAFlags f : FSAFlags.values()) { if (f.isSet(flagBits)) { flags.add(f); } } if (flagBits != FSAFlags.asShort(flags)) { throw new IOException("Unrecognized flags: 0x" + Integer.toHexString(flagBits)); } this.hasNumbers = flags.contains(FSAFlags.NUMBERS); /* * Read mapping dictionary. 
*/ int labelMappingSize = in.readByte() & 0xff; labelMapping = new byte[labelMappingSize]; in.readFully(labelMapping); /* * Read arcs' data. */ arcs = readRemaining(in); } /** {@inheritDoc} */ @Override public int getRootNode() { // Skip dummy node marking terminating state. return getDestinationNodeOffset(getFirstArc(epsilon)); } /** {@inheritDoc} */ @Override public final int getFirstArc(int node) { if (hasNumbers) { return skipVInt(node); } else { return node; } } /** {@inheritDoc} */ @Override public final int getNextArc(int arc) { if (isArcLast(arc)) { return 0; } else { return skipArc(arc); } } /** {@inheritDoc} */ @Override public int getArc(int node, byte label) { for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { if (getArcLabel(arc) == label) { return arc; } } // An arc labeled with "label" not found. return 0; } /** {@inheritDoc} */ @Override public int getEndNode(int arc) { final int nodeOffset = getDestinationNodeOffset(arc); assert nodeOffset != 0 : "Can't follow a terminal arc: " + arc; assert nodeOffset < arcs.length : "Node out of bounds."; return nodeOffset; } /** {@inheritDoc} */ @Override public byte getArcLabel(int arc) { int index = arcs[arc] & LABEL_INDEX_MASK; if (index > 0) { return this.labelMapping[index]; } else { return arcs[arc + 1]; } } /** {@inheritDoc} */ @Override public int getRightLanguageCount(int node) { assert getFlags().contains(FSAFlags.NUMBERS) : "This FSA was not compiled with NUMBERS."; return readVInt(arcs, node); } /** {@inheritDoc} */ @Override public boolean isArcFinal(int arc) { return (arcs[arc] & BIT_FINAL_ARC) != 0; } /** {@inheritDoc} */ @Override public boolean isArcTerminal(int arc) { return (0 == getDestinationNodeOffset(arc)); } /** * Returns true if this arc has NEXT bit set. * * @see #BIT_LAST_ARC * @param arc The node's arc identifier. * @return Returns true if the argument is the last arc of a node. 
*/ public boolean isArcLast(int arc) { return (arcs[arc] & BIT_LAST_ARC) != 0; } /** * @see #BIT_TARGET_NEXT * @param arc The node's arc identifier. * @return Returns true if {@link #BIT_TARGET_NEXT} is set for this arc. */ public boolean isNextSet(int arc) { return (arcs[arc] & BIT_TARGET_NEXT) != 0; } /** {@inheritDoc} */ public Set getFlags() { return flags; } /** Returns the address of the node pointed to by this arc. */ final int getDestinationNodeOffset(int arc) { if (isNextSet(arc)) { /* Follow until the last arc of this state. */ while (!isArcLast(arc)) { arc = getNextArc(arc); } /* And return the byte right after it. */ return skipArc(arc); } else { /* * The destination node address is v-coded. v-code starts either * at the next byte (label indexed) or after the next byte (label explicit). */ return readVInt(arcs, arc + ((arcs[arc] & LABEL_INDEX_MASK) == 0 ? 2 : 1)); } } /** Read the arc's layout and skip as many bytes, as needed, to skip it. */ private int skipArc(int offset) { int flag = arcs[offset++]; // Explicit label? if ((flag & LABEL_INDEX_MASK) == 0) { offset++; } // Explicit goto? if ((flag & BIT_TARGET_NEXT) == 0) { offset = skipVInt(offset); } assert offset < this.arcs.length; return offset; } /** Read a v-int. */ static int readVInt(byte[] array, int offset) { byte b = array[offset]; int value = b & 0x7F; for (int shift = 7; b < 0; shift += 7) { b = array[++offset]; value |= (b & 0x7F) << shift; } return value; } /** Return the byte-length of a v-coded int. */ static int vIntLength(int value) { assert value >= 0 : "Can't v-code negative ints."; int bytes; for (bytes = 1; value >= 0x80; bytes++) { value >>= 7; } return bytes; } /** Skip a v-int. */ private int skipVInt(int offset) { while (arcs[offset++] < 0) { // Do nothing. 
} return offset; } } ================================================ FILE: morfologik-fsa/src/main/java/morfologik/fsa/FSA.java ================================================ package morfologik.fsa; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.nio.ByteBuffer; import java.util.BitSet; import java.util.Collections; import java.util.Iterator; import java.util.Locale; import java.util.Set; /** * This is a top abstract class for handling finite state automata. These automata are arc-based, a * design described in Jan Daciuk's Incremental Construction of Finite-State Automata and * Transducers, and Their Use in the Natural Language Processing (PhD thesis, Technical * University of Gdansk). */ public abstract class FSA implements Iterable { /** * @return Returns the identifier of the root node of this automaton. Returns 0 if the start node * is also the end node (the automaton is empty). */ public abstract int getRootNode(); /** * @param node Identifier of the node. * @return Returns the identifier of the first arc leaving node or 0 if the node has * no outgoing arcs. */ public abstract int getFirstArc(int node); /** * @param arc The arc's identifier. * @return Returns the identifier of the next arc after arc and leaving node * . Zero is returned if no more arcs are available for the node. */ public abstract int getNextArc(int arc); /** * @param node Identifier of the node. * @param label The arc's label. * @return Returns the identifier of an arc leaving node and labeled with label * . An identifier equal to 0 means the node has no outgoing arc labeled label * . */ public abstract int getArc(int node, byte label); /** * @param arc The arc's identifier. * @return Return the label associated with a given arc. */ public abstract byte getArcLabel(int arc); /** * @param arc The arc's identifier. 
* @return Returns true if the destination node at the end of this arc * corresponds to an input sequence created when building this automaton. */ public abstract boolean isArcFinal(int arc); /** * @param arc The arc's identifier. * @return Returns true if this arc does not have a terminating node * (@link {@link #getEndNode(int)} will throw an exception). Implies {@link #isArcFinal(int)}. */ public abstract boolean isArcTerminal(int arc); /** * @param arc The arc's identifier. * @return Return the end node pointed to by a given arc. Terminal arcs (those that * point to a terminal state) have no end node representation and throw a runtime exception. */ public abstract int getEndNode(int arc); /** * @return Returns a set of flags for this FSA instance. */ public abstract Set getFlags(); /** * @param node Identifier of the node. * @return Calculates and returns the number of arcs of a given node. */ public int getArcCount(int node) { int count = 0; for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { count++; } return count; } /** * @param node Identifier of the node. * @return Returns the number of sequences reachable from the given state if the automaton was * compiled with {@link FSAFlags#NUMBERS}. The size of the right language of the state, in * other words. * @throws UnsupportedOperationException If the automaton was not compiled with {@link * FSAFlags#NUMBERS}. The value can then be computed by manual count of {@link #getSequences}. */ public int getRightLanguageCount(int node) { throw new UnsupportedOperationException("Automaton not compiled with " + FSAFlags.NUMBERS); } /** * Returns an iterator over all binary sequences starting at the given FSA state (node) and ending * in final nodes. This corresponds to a set of suffixes of a given prefix from all sequences * stored in the automaton. * *

The returned iterator is a {@link ByteBuffer} whose contents changes on each call to {@link * Iterator#next()}. The keep the contents between calls to {@link Iterator#next()}, one must copy * the buffer to some other location. * *

Important. It is guaranteed that the returned byte buffer is backed by a byte array * and that the content of the byte buffer starts at the array's index 0. * * @param node Identifier of the starting node from which to return subsequences. * @return An iterable over all sequences encoded starting at the given node. */ public Iterable getSequences(final int node) { if (node == 0) { return Collections.emptyList(); } return new Iterable() { public Iterator iterator() { return new ByteSequenceIterator(FSA.this, node); } }; } /** * An alias of calling {@link #iterator} directly ({@link FSA} is also {@link Iterable}). * * @return Returns all sequences encoded in the automaton. */ public final Iterable getSequences() { return getSequences(getRootNode()); } /** * Returns an iterator over all binary sequences starting from the initial FSA state (node) and * ending in final nodes. The returned iterator is a {@link ByteBuffer} whose contents changes on * each call to {@link Iterator#next()}. The keep the contents between calls to {@link * Iterator#next()}, one must copy the buffer to some other location. * *

Important. It is guaranteed that the returned byte buffer is backed by a byte array * and that the content of the byte buffer starts at the array's index 0. */ public final Iterator iterator() { return getSequences().iterator(); } /** * Visit all states. The order of visiting is undefined. This method may be faster than traversing * the automaton in post or preorder since it can scan states linearly. Returning false from * {@link StateVisitor#accept(int)} immediately terminates the traversal. * * @param v Visitor to receive traversal calls. * @param A subclass of {@link StateVisitor}. * @return Returns the argument (for access to anonymous class fields). */ public T visitAllStates(T v) { return visitInPostOrder(v); } /** * Same as {@link #visitInPostOrder(StateVisitor, int)}, starting from root automaton node. * * @param v Visitor to receive traversal calls. * @param A subclass of {@link StateVisitor}. * @return Returns the argument (for access to anonymous class fields). */ public T visitInPostOrder(T v) { return visitInPostOrder(v, getRootNode()); } /** * Visits all states reachable from node in postorder. Returning false from {@link * StateVisitor#accept(int)} immediately terminates the traversal. * * @param v Visitor to receive traversal calls. * @param A subclass of {@link StateVisitor}. * @param node Identifier of the node. * @return Returns the argument (for access to anonymous class fields). */ public T visitInPostOrder(T v, int node) { visitInPostOrder(v, node, new BitSet()); return v; } /** Private recursion. */ private boolean visitInPostOrder(StateVisitor v, int node, BitSet visited) { if (visited.get(node)) return true; visited.set(node); for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { if (!isArcTerminal(arc)) { if (!visitInPostOrder(v, getEndNode(arc), visited)) return false; } } return v.accept(node); } /** * Same as {@link #visitInPreOrder(StateVisitor, int)}, starting from root automaton node. 
* * @param v Visitor to receive traversal calls. * @param A subclass of {@link StateVisitor}. * @return Returns the argument (for access to anonymous class fields). */ public T visitInPreOrder(T v) { return visitInPreOrder(v, getRootNode()); } /** * Visits all states in preorder. Returning false from {@link StateVisitor#accept(int)} skips * traversal of all sub-states of a given state. * * @param v Visitor to receive traversal calls. * @param A subclass of {@link StateVisitor}. * @param node Identifier of the node. * @return Returns the argument (for access to anonymous class fields). */ public T visitInPreOrder(T v, int node) { visitInPreOrder(v, node, new BitSet()); return v; } /** * @param in The input stream. * @return Reads all remaining bytes from an input stream and returns them as a byte array. * @throws IOException Rethrown if an I/O exception occurs. */ protected static final byte[] readRemaining(InputStream in) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); byte[] buffer = new byte[1024 * 8]; int len; while ((len = in.read(buffer)) >= 0) { baos.write(buffer, 0, len); } return baos.toByteArray(); } /** Private recursion. */ private void visitInPreOrder(StateVisitor v, int node, BitSet visited) { if (visited.get(node)) { return; } visited.set(node); if (v.accept(node)) { for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { if (!isArcTerminal(arc)) { visitInPreOrder(v, getEndNode(arc), visited); } } } } /** * A factory for reading automata in any of the supported versions. * * @param stream The input stream to read automaton data from. The stream is not closed. * @return Returns an instantiated automaton. Never null. * @throws IOException If the input stream does not represent an automaton or is otherwise * invalid. 
*/ public static FSA read(InputStream stream) throws IOException { final FSAHeader header = FSAHeader.read(stream); switch (header.version) { case FSA5.VERSION: return new FSA5(stream); case CFSA.VERSION: return new CFSA(stream); case CFSA2.VERSION: return new CFSA2(stream); default: throw new IOException( String.format( Locale.ROOT, "Unsupported automaton version: 0x%02x", header.version & 0xFF)); } } /** * A factory for reading a specific FSA subclass, including proper casting. * * @param stream The input stream to read automaton data from. The stream is not closed. * @param clazz A subclass of {@link FSA} to cast the read automaton to. * @param A subclass of {@link FSA} to cast the read automaton to. * @return Returns an instantiated automaton. Never null. * @throws IOException If the input stream does not represent an automaton, is otherwise invalid * or the class of the automaton read from the input stream is not assignable to clazz * . */ public static T read(InputStream stream, Class clazz) throws IOException { FSA fsa = read(stream); if (!clazz.isInstance(fsa)) { throw new IOException( String.format( Locale.ROOT, "Expected FSA type %s, but read an incompatible type %s.", clazz.getName(), fsa.getClass().getName())); } return clazz.cast(fsa); } } ================================================ FILE: morfologik-fsa/src/main/java/morfologik/fsa/FSA5.java ================================================ package morfologik.fsa; import static morfologik.fsa.FSAFlags.*; import java.io.DataInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Collections; import java.util.EnumSet; import java.util.Set; /** * FSA binary format implementation for version 5. * *

Version 5 indicates the dictionary was built with these flags: {@link FSAFlags#FLEXIBLE}, * {@link FSAFlags#STOPBIT} and {@link FSAFlags#NEXTBIT}. The internal representation of the FSA * must therefore follow this description (please note this format describes only a single * transition (arc), not the entire dictionary file). * *

 * ---- this node header present only if automaton was compiled with NUMBERS option.
 * Byte
 *        +-+-+-+-+-+-+-+-+\
 *      0 | | | | | | | | | \  LSB
 *        +-+-+-+-+-+-+-+-+  +
 *      1 | | | | | | | | |  |      number of strings recognized
 *        +-+-+-+-+-+-+-+-+  +----- by the automaton starting
 *        : : : : : : : : :  |      from this node.
 *        +-+-+-+-+-+-+-+-+  +
 *  ctl-1 | | | | | | | | | /  MSB
 *        +-+-+-+-+-+-+-+-+/
 *
 * ---- remaining part of the node
 *
 * Byte
 *       +-+-+-+-+-+-+-+-+\
 *     0 | | | | | | | | | +------ label
 *       +-+-+-+-+-+-+-+-+/
 *
 *                  +------------- node pointed to is next
 *                  | +----------- the last arc of the node
 *                  | | +--------- the arc is final
 *                  | | |
 *             +-----------+
 *             |    | | |  |
 *         ___+___  | | |  |
 *        /       \ | | |  |
 *       MSB           LSB |
 *        7 6 5 4 3 2 1 0  |
 *       +-+-+-+-+-+-+-+-+ |
 *     1 | | | | | | | | | \ \
 *       +-+-+-+-+-+-+-+-+  \ \  LSB
 *       +-+-+-+-+-+-+-+-+     +
 *     2 | | | | | | | | |     |
 *       +-+-+-+-+-+-+-+-+     |
 *     3 | | | | | | | | |     +----- target node address (in bytes)
 *       +-+-+-+-+-+-+-+-+     |      (not present except for the byte
 *       : : : : : : : : :     |       with flags if the node pointed to
 *       +-+-+-+-+-+-+-+-+     +       is next)
 *   gtl | | | | | | | | |    /  MSB
 *       +-+-+-+-+-+-+-+-+   /
 * gtl+1                           (gtl = gotoLength)
 * 
*/ public final class FSA5 extends FSA { /** Default filler byte. */ public static final byte DEFAULT_FILLER = '_'; /** Default annotation byte. */ public static final byte DEFAULT_ANNOTATION = '+'; /** Automaton version as in the file header. */ public static final byte VERSION = 5; /** * Bit indicating that an arc corresponds to the last character of a sequence available when * building the automaton. */ public static final int BIT_FINAL_ARC = 1 << 0; /** * Bit indicating that an arc is the last one of the node's list and the following one belongs to * another node. */ public static final int BIT_LAST_ARC = 1 << 1; /** * Bit indicating that the target node of this arc follows it in the compressed automaton * structure (no goto field). */ public static final int BIT_TARGET_NEXT = 1 << 2; /** * An offset in the arc structure, where the address and flags field begins. In version 5 of FSA * automata, this value is constant (1, skip label). */ public static final int ADDRESS_OFFSET = 1; /** * An array of bytes with the internal representation of the automaton. Please see the * documentation of this class for more information on how this structure is organized. */ public final byte[] arcs; /** * The length of the node header structure (if the automaton was compiled with NUMBERS * option). Otherwise zero. */ public final int nodeDataLength; /** Flags for this automaton version. */ private Set flags; /** Number of bytes each address takes in full, expanded form (goto length). */ public final int gtl; /** Filler character. */ public final byte filler; /** Annotation character. */ public final byte annotation; /** Read and wrap a binary automaton in FSA version 5. */ FSA5(InputStream stream) throws IOException { DataInputStream in = new DataInputStream(stream); this.filler = in.readByte(); this.annotation = in.readByte(); final byte hgtl = in.readByte(); /* * Determine if the automaton was compiled with NUMBERS. If so, modify * ctl and goto fields accordingly. 
*/ flags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT); if ((hgtl & 0xf0) != 0) { flags.add(NUMBERS); } flags = Collections.unmodifiableSet(flags); this.nodeDataLength = (hgtl >>> 4) & 0x0f; this.gtl = hgtl & 0x0f; arcs = readRemaining(in); } /** Returns the start node of this automaton. */ @Override public int getRootNode() { // Skip dummy node marking terminating state. final int epsilonNode = skipArc(getFirstArc(0)); // And follow the epsilon node's first (and only) arc. return getDestinationNodeOffset(getFirstArc(epsilonNode)); } /** {@inheritDoc} */ @Override public final int getFirstArc(int node) { return nodeDataLength + node; } /** {@inheritDoc} */ @Override public final int getNextArc(int arc) { if (isArcLast(arc)) return 0; else return skipArc(arc); } /** {@inheritDoc} */ @Override public int getArc(int node, byte label) { for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) { if (getArcLabel(arc) == label) return arc; } // An arc labeled with "label" not found. return 0; } /** {@inheritDoc} */ @Override public int getEndNode(int arc) { final int nodeOffset = getDestinationNodeOffset(arc); assert nodeOffset != 0 : "No target node for terminal arcs."; return nodeOffset; } /** {@inheritDoc} */ @Override public byte getArcLabel(int arc) { return arcs[arc]; } /** {@inheritDoc} */ @Override public boolean isArcFinal(int arc) { return (arcs[arc + ADDRESS_OFFSET] & BIT_FINAL_ARC) != 0; } /** {@inheritDoc} */ @Override public boolean isArcTerminal(int arc) { return (0 == getDestinationNodeOffset(arc)); } /** * Returns the number encoded at the given node. The number equals the count of the set of * suffixes reachable from node (called its right language). */ @Override public int getRightLanguageCount(int node) { assert getFlags().contains(FSAFlags.NUMBERS) : "This FSA was not compiled with NUMBERS."; return decodeFromBytes(arcs, node, nodeDataLength); } /** * {@inheritDoc} * *

For this automaton version, an additional {@link FSAFlags#NUMBERS} flag may be set to * indicate the automaton contains extra fields for each node. */ @Override public Set getFlags() { return flags; } /** * Returns true if this arc has NEXT bit set. * * @see #BIT_LAST_ARC * @param arc The node's arc identifier. * @return Returns true if the argument is the last arc of a node. */ public boolean isArcLast(int arc) { return (arcs[arc + ADDRESS_OFFSET] & BIT_LAST_ARC) != 0; } /** * @see #BIT_TARGET_NEXT * @param arc The node's arc identifier. * @return Returns true if {@link #BIT_TARGET_NEXT} is set for this arc. */ public boolean isNextSet(int arc) { return (arcs[arc + ADDRESS_OFFSET] & BIT_TARGET_NEXT) != 0; } /** Returns an n-byte integer encoded in byte-packed representation. */ static final int decodeFromBytes(final byte[] arcs, final int start, final int n) { int r = 0; for (int i = n; --i >= 0; ) { r = r << 8 | (arcs[start + i] & 0xff); } return r; } /** Returns the address of the node pointed to by this arc. */ final int getDestinationNodeOffset(int arc) { if (isNextSet(arc)) { /* The destination node follows this arc in the array. */ return skipArc(arc); } else { /* * The destination node address has to be extracted from the arc's * goto field. */ return decodeFromBytes(arcs, arc + ADDRESS_OFFSET, gtl) >>> 3; } } /** Read the arc's layout and skip as many bytes, as needed. */ private int skipArc(int offset) { return offset + (isNextSet(offset) ? 1 + 1 /* label + flags */ : 1 + gtl /* label + flags/address */); } } ================================================ FILE: morfologik-fsa/src/main/java/morfologik/fsa/FSAFlags.java ================================================ package morfologik.fsa; import java.util.Set; /** FSA automaton flags. Where applicable, flags follow Daciuk's fsa package. */ public enum FSAFlags { /** Daciuk: flexible FSA encoding. */ FLEXIBLE(1 << 0), /** Daciuk: stop bit in use. */ STOPBIT(1 << 1), /** Daciuk: next bit in use. 
*/ NEXTBIT(1 << 2), /** Daciuk: tails compression. */ TAILS(1 << 3), /* * These flags are outside of byte range (never occur in Daciuk's FSA). */ /** * The FSA contains right-language count numbers on states. * * @see FSA#getRightLanguageCount(int) */ NUMBERS(1 << 8), /** * The FSA supports legacy built-in separator and filler characters (Daciuk's FSA package * compatibility). */ SEPARATORS(1 << 9); /** Bit mask for the corresponding flag. */ public final int bits; /** */ private FSAFlags(int bits) { this.bits = bits; } /** * @param flags The bitset with flags. * @return Returns true iff this flag is set in flags. */ public boolean isSet(int flags) { return (flags & bits) != 0; } /** * @param flags A set of flags to encode. * @return Returns the set of flags encoded as packed short. */ public static short asShort(Set flags) { short value = 0; for (FSAFlags f : flags) { value |= f.bits; } return value; } } ================================================ FILE: morfologik-fsa/src/main/java/morfologik/fsa/FSAHeader.java ================================================ package morfologik.fsa; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; /** Standard FSA file header, as described in fsa package documentation. */ public final class FSAHeader { /** FSA magic (4 bytes). */ static final int FSA_MAGIC = ('\\' << 24) | ('f' << 16) | ('s' << 8) | ('a'); /** Maximum length of the header block. */ static final int MAX_HEADER_LENGTH = 4 + 8; /** FSA version number. */ final byte version; FSAHeader(byte version) { this.version = version; } /** * Read FSA header and version from a stream, consuming read bytes. * * @param in The input stream to read data from. * @return Returns a valid {@link FSAHeader} with version information. * @throws IOException If the stream ends prematurely or if it contains invalid data. 
*/ public static FSAHeader read(InputStream in) throws IOException { if (in.read() != ((FSA_MAGIC >>> 24)) || in.read() != ((FSA_MAGIC >>> 16) & 0xff) || in.read() != ((FSA_MAGIC >>> 8) & 0xff) || in.read() != ((FSA_MAGIC) & 0xff)) { throw new IOException("Invalid file header, probably not an FSA."); } int version = in.read(); if (version == -1) { throw new IOException("Truncated file, no version number."); } return new FSAHeader((byte) version); } /** * Writes FSA magic bytes and version information. * * @param os The stream to write to. * @param version Automaton version. * @throws IOException Rethrown if writing fails. */ public static void write(OutputStream os, byte version) throws IOException { os.write(FSA_MAGIC >> 24); os.write(FSA_MAGIC >> 16); os.write(FSA_MAGIC >> 8); os.write(FSA_MAGIC); os.write(version); } } ================================================ FILE: morfologik-fsa/src/main/java/morfologik/fsa/FSATraversal.java ================================================ package morfologik.fsa; import static morfologik.fsa.MatchResult.*; /** This class implements some common matching and scanning operations on a generic FSA. */ public final class FSATraversal { /** Target automaton. */ private final FSA fsa; /** * Traversals of the given FSA. * * @param fsa The target automaton for traversals. */ public FSATraversal(FSA fsa) { this.fsa = fsa; } /** * Calculate perfect hash for a given input sequence of bytes. The perfect hash requires that * {@link FSA} is built with {@link FSAFlags#NUMBERS} and corresponds to the sequential order of * input sequences used at automaton construction time. * * @param sequence The byte sequence to calculate perfect hash for. * @param start Start index in the sequence array. * @param length Length of the byte sequence, must be at least 1. * @param node The node to start traversal from, typically the {@linkplain FSA#getRootNode() root * node}. 
   * @return Returns a unique integer assigned to the input sequence in the automaton (reflecting
   *     the number of that sequence in the input used to build the automaton). Returns a negative
   *     integer if the input sequence was not part of the input from which the automaton was
   *     created. The type of mismatch is a constant defined in {@link MatchResult}.
   */
  public int perfectHash(byte[] sequence, int start, int length, int node) {
    assert fsa.getFlags().contains(FSAFlags.NUMBERS) : "FSA not built with NUMBERS option.";
    assert length > 0 : "Must be a non-empty sequence.";

    int hash = 0;
    final int end = start + length - 1;

    int seqIndex = start;
    byte label = sequence[seqIndex];

    // Seek through the current node's labels, looking for 'label', update hash.
    for (int arc = fsa.getFirstArc(node); arc != 0; ) {
      if (fsa.getArcLabel(arc) == label) {
        if (fsa.isArcFinal(arc)) {
          if (seqIndex == end) {
            return hash;
          }
          // A shorter final sequence shares this path and precedes ours in input order.
          hash++;
        }

        if (fsa.isArcTerminal(arc)) {
          /* The automaton contains a prefix of the input sequence. */
          return AUTOMATON_HAS_PREFIX;
        }

        // The sequence is a prefix of one of the sequences stored in the automaton.
        if (seqIndex == end) {
          return SEQUENCE_IS_A_PREFIX;
        }

        // Make a transition along the arc, go the target node's first arc.
        arc = fsa.getFirstArc(fsa.getEndNode(arc));
        label = sequence[++seqIndex];
        continue;
      } else {
        // Arcs skipped before the match contribute their right-language sizes to the hash.
        if (fsa.isArcFinal(arc)) {
          hash++;
        }
        if (!fsa.isArcTerminal(arc)) {
          hash += fsa.getRightLanguageCount(fsa.getEndNode(arc));
        }
      }

      arc = fsa.getNextArc(arc);
    }

    if (seqIndex > start) {
      return AUTOMATON_HAS_PREFIX;
    } else {
      // Labels of this node ended without a match on the sequence.
      // Perfect hash does not exist.
      return NO_MATCH;
    }
  }

  /**
   * @param sequence The byte sequence to calculate perfect hash for.
   * @see #perfectHash(byte[], int, int, int)
   * @return Returns a unique integer assigned to the input sequence in the automaton (reflecting
   *     the number of that sequence in the input used to build the automaton).
   *     Returns a negative
   *     integer if the input sequence was not part of the input from which the automaton was
   *     created. The type of mismatch is a constant defined in {@link MatchResult}.
   */
  public int perfectHash(byte[] sequence) {
    // Hash the whole sequence, starting at the automaton's root node.
    return perfectHash(sequence, 0, sequence.length, fsa.getRootNode());
  }

  /**
   * Same as {@link #match(byte[], int, int, int)}, but allows passing a reusable {@link
   * MatchResult} object so that no intermediate garbage is produced.
   *
   * @param reuse The {@link MatchResult} to reuse.
   * @param sequence Input sequence to look for in the automaton.
   * @param start Start index in the sequence array.
   * @param length Length of the byte sequence, must be at least 1.
   * @param node The node to start traversal from, typically the {@linkplain FSA#getRootNode() root
   *     node}.
   * @return The same object as reuse, but with updated match {@link MatchResult#kind}
   *     and other relevant fields.
   */
  public MatchResult match(MatchResult reuse, byte[] sequence, int start, int length, int node) {
    if (node == 0) {
      // Node 0 is not a valid start state.
      reuse.reset(NO_MATCH, start, node);
      return reuse;
    }

    final FSA fsa = this.fsa;
    final int end = start + length;
    for (int i = start; i < end; i++) {
      final int arc = fsa.getArc(node, sequence[i]);
      if (arc != 0) {
        if (i + 1 == end && fsa.isArcFinal(arc)) {
          /* The automaton has an exact match of the input sequence. */
          reuse.reset(EXACT_MATCH, i, node);
          return reuse;
        }

        if (fsa.isArcTerminal(arc)) {
          /* The automaton contains a prefix of the input sequence. */
          reuse.reset(AUTOMATON_HAS_PREFIX, i + 1, node);
          return reuse;
        }

        // Make a transition along the arc.
        node = fsa.getEndNode(arc);
      } else {
        // No arc for this label: either some characters matched already (prefix) or none did.
        if (i > start) {
          reuse.reset(AUTOMATON_HAS_PREFIX, i, node);
        } else {
          reuse.reset(NO_MATCH, i, node);
        }
        return reuse;
      }
    }

    /* The sequence is a prefix of at least one sequence in the automaton. */
    reuse.reset(SEQUENCE_IS_A_PREFIX, 0, node);
    return reuse;
  }

  /**
   * Finds a matching path in the dictionary for a given sequence of labels from sequence
   * and starting at node node.
   *
   * @param sequence Input sequence to look for in the automaton.
   * @param start Start index in the sequence array.
   * @param length Length of the byte sequence, must be at least 1.
   * @param node The node to start traversal from, typically the {@linkplain FSA#getRootNode() root
   *     node}.
   * @see #match(byte [], int)
   * @return {@link MatchResult} with updated match {@link MatchResult#kind}.
   */
  public MatchResult match(byte[] sequence, int start, int length, int node) {
    // Convenience overload: allocates a fresh MatchResult for each call.
    return match(new MatchResult(), sequence, start, length, node);
  }

  /**
   * @param sequence Input sequence to look for in the automaton.
   * @param node The node to start traversal from, typically the {@linkplain FSA#getRootNode() root
   *     node}.
   * @see #match(byte [], int)
   * @return {@link MatchResult} with updated match {@link MatchResult#kind}.
   */
  public MatchResult match(byte[] sequence, int node) {
    // Match the entire sequence starting at the given node.
    return match(sequence, 0, sequence.length, node);
  }

  /**
   * @param sequence Input sequence to look for in the automaton.
   * @see #match(byte [], int)
   * @return {@link MatchResult} with updated match {@link MatchResult#kind}.
   */
  public MatchResult match(byte[] sequence) {
    // Match the entire sequence from the automaton's root node.
    return match(sequence, fsa.getRootNode());
  }
}

================================================
FILE: morfologik-fsa/src/main/java/morfologik/fsa/MatchResult.java
================================================
package morfologik.fsa;

/**
 * A matching result returned from {@link FSATraversal}.
 *
 * @see FSATraversal
 */
public final class MatchResult {
  /** The automaton has exactly one match for the input sequence. */
  public static final int EXACT_MATCH = 0;

  /**
   * The automaton has no match for the input sequence and no sequence in the automaton is a prefix
   * of the input.
   *
   *

Note that to check for a general "input does not exist in the automaton" you have to check * for both {@link #NO_MATCH} and {@link #AUTOMATON_HAS_PREFIX}. */ public static final int NO_MATCH = -1; /** * The automaton contains a prefix of the input sequence (but the full sequence does not exist). * This translates to: one of the input sequences used to build the automaton is a prefix of the * input sequence, but the input sequence contains a non-existent suffix. * *

   * {@link MatchResult#index} will contain an index of the first character of the input sequence
   * not present in the dictionary.
   */
  public static final int AUTOMATON_HAS_PREFIX = -3;

  /**
   * The sequence is a prefix of at least one sequence in the automaton. {@link MatchResult#node}
   * returns the node from which all sequences with the given prefix start in the automaton.
   */
  public static final int SEQUENCE_IS_A_PREFIX = -4;

  /**
   * One of the match types defined in this class.
   *
   * @see #NO_MATCH
   * @see #EXACT_MATCH
   * @see #AUTOMATON_HAS_PREFIX
   * @see #SEQUENCE_IS_A_PREFIX
   */
  public int kind;

  /** Input sequence's index, interpretation depends on {@link #kind}. */
  public int index;

  /** Automaton node, interpretation depends on the {@link #kind}. */
  public int node;

  // Package-private constructor used by traversal code.
  MatchResult(int kind, int index, int node) {
    reset(kind, index, node);
  }

  // Package-private constructor; index and node default to zero.
  MatchResult(int kind) {
    reset(kind, 0, 0);
  }

  // Creates a result initialized to NO_MATCH.
  public MatchResult() {
    reset(NO_MATCH, 0, 0);
  }

  // Reinitializes all fields in place (supports allocation-free reuse by callers).
  final void reset(int kind, int index, int node) {
    this.kind = kind;
    this.index = index;
    this.node = node;
  }
}

================================================
FILE: morfologik-fsa/src/main/java/morfologik/fsa/StateVisitor.java
================================================
package morfologik.fsa;

/**
 * State visitor.
 *
 * @see FSA#visitInPostOrder(StateVisitor)
 * @see FSA#visitInPreOrder(StateVisitor)
 */
public interface StateVisitor {
  /**
   * Invoked for each visited state.
   *
   * @param state Identifier of the visited state.
   * @return In preorder traversals, returning false skips traversal of the state's sub-states.
   */
  public boolean accept(int state);
}

================================================
FILE: morfologik-fsa-builders/pom.xml
================================================
4.0.0 org.carrot2 morfologik-parent 2.2.0-SNAPSHOT ../pom.xml morfologik-fsa-builders bundle Morfologik FSA (Builder) Morfologik Finite State Automata Builder ../etc/forbidden-apis/signatures.txt org.carrot2.morfologik.fsa_builders org.carrot2 morfologik-fsa ${project.version} com.carrotsearch hppc org.apache.felix maven-bundle-plugin morfologik.fsa.builders *

================================================
FILE: morfologik-fsa-builders/src/main/java/morfologik/fsa/builders/CFSA2Serializer.java
================================================
package morfologik.fsa.builders;

import static morfologik.fsa.CFSA2.*;
import static morfologik.fsa.FSAFlags.*;

import com.carrotsearch.hppc.BoundedProportionalArraySizingStrategy;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.IntIntHashMap;
import com.carrotsearch.hppc.IntStack;
import com.carrotsearch.hppc.cursors.IntCursor;
import com.carrotsearch.hppc.cursors.IntIntCursor;
import java.io.IOException;
import java.io.OutputStream;
import java.util.BitSet;
import java.util.Comparator;
import java.util.EnumSet;
import java.util.Locale;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;
import morfologik.fsa.CFSA2;
import morfologik.fsa.FSA;
import morfologik.fsa.FSAFlags;
import morfologik.fsa.FSAHeader;
import morfologik.fsa.StateVisitor;
import morfologik.fsa.builders.FSAUtils.IntIntHolder;

/**
 * Serializes in-memory {@link FSA} graphs to {@link CFSA2}.
 *
 *

 * It is possible to serialize the automaton with numbers required for perfect hashing. See
 * {@link #withNumbers()} method.
 *
 * @see CFSA2
 */
public final class CFSA2Serializer implements FSASerializer {
  // Logger for compaction-progress diagnostics (see log()).
  private final Logger logger = Logger.getLogger(getClass().getName());

  /** Supported flags. */
  // NOTE(review): the element type (presumably EnumSet<FSAFlags>) appears lost in text
  // extraction — restore the generic parameter when merging.
  private static final EnumSet flags = EnumSet.of(NUMBERS, FLEXIBLE, STOPBIT, NEXTBIT);

  /** No-state id. */
  private static final int NO_STATE = -1;

  /**
   * true if we should serialize with numbers.
   *
   * @see #withNumbers()
   */
  private boolean withNumbers;

  /** A hash map of [state, offset] pairs. */
  private IntIntHashMap offsets = new IntIntHashMap();

  /** A hash map of [state, right-language-count] pairs. */
  private IntIntHashMap numbers = new IntIntHashMap();

  /** Scratch array for serializing vints. */
  private final byte[] scratch = new byte[5];

  /** The most frequent labels for integrating with the flags field. */
  private byte[] labelsIndex;

  /**
   * Inverted index of labels to be integrated with flags field. A label at
   * index i has the index or zero (no integration).
   */
  private int[] labelsInvIndex;

  /**
   * Serialize the automaton with the number of right-language sequences in each node. This is
   * required to implement perfect hashing. The numbering also preserves the order of input
   * sequences.
   *
   * @return Returns the same object for easier call chaining.
   */
  public CFSA2Serializer withNumbers() {
    withNumbers = true;
    return this;
  }

  /**
   * Serializes any {@link FSA} to {@link CFSA2} stream.
   *
   * @see #withNumbers()
   * @return Returns os for chaining.
   */
  // NOTE(review): the <T extends OutputStream> type parameter appears lost in text
  // extraction — restore it when merging.
  @Override
  public T serialize(final FSA fsa, T os) throws IOException {
    /*
     * Calculate the most frequent labels and build indexed labels dictionary.
     */
    computeLabelsIndex(fsa);

    /*
     * Calculate the number of bytes required for the node data, if
     * serializing with numbers.
     */
    if (withNumbers) {
      this.numbers = FSAUtils.rightLanguageForAllStates(fsa);
    }

    /*
     * Linearize all the states, optimizing their layout.
     */
    IntArrayList linearized = linearize(fsa);

    /*
     * Emit the header.
     */
    FSAHeader.write(os, CFSA2.VERSION);

    EnumSet fsaFlags = EnumSet.of(FLEXIBLE, STOPBIT, NEXTBIT);
    if (withNumbers) {
      fsaFlags.add(NUMBERS);
    }

    // Flags are written big-endian as a 16-bit value.
    final short sflags = FSAFlags.asShort(fsaFlags);
    os.write((sflags >> 8) & 0xFF);
    os.write((sflags) & 0xFF);

    /*
     * Emit labels index.
     */
    os.write(labelsIndex.length);
    os.write(labelsIndex);

    /*
     * Emit the automaton.
     */
    int size = emitNodes(fsa, os, linearized);
    // emitNodes returns non-zero only while offsets are still moving; they must be stable now.
    assert size == 0 : "Size changed in the final pass?";

    return os;
  }

  /** Compute a set of labels to be integrated with the flags field. */
  private void computeLabelsIndex(final FSA fsa) {
    // Compute labels count.
    final int[] countByValue = new int[256];
    fsa.visitAllStates(
        new StateVisitor() {
          public boolean accept(int state) {
            for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc))
              countByValue[fsa.getArcLabel(arc) & 0xff]++;
            return true;
          }
        });

    // Order by descending frequency of counts and increasing label value.
    Comparator comparator =
        new Comparator() {
          public int compare(IntIntHolder o1, IntIntHolder o2) {
            int countDiff = o2.b - o1.b;
            if (countDiff == 0) {
              // Tie-break on label value so TreeSet never collapses distinct labels.
              countDiff = o1.a - o2.a;
            }
            return countDiff;
          }
        };

    TreeSet labelAndCount = new TreeSet(comparator);
    for (int label = 0; label < countByValue.length; label++) {
      if (countByValue[label] > 0) {
        labelAndCount.add(new IntIntHolder(label, countByValue[label]));
      }
    }

    // Index 0 is reserved to mean "no integration"; fill remaining slots with the most
    // frequent labels, highest index first.
    labelsIndex = new byte[1 + Math.min(labelAndCount.size(), CFSA2.LABEL_INDEX_SIZE)];
    labelsInvIndex = new int[256];
    for (int i = labelsIndex.length - 1; i > 0 && !labelAndCount.isEmpty(); i--) {
      IntIntHolder p = labelAndCount.first();
      labelAndCount.remove(p);
      labelsInvIndex[p.a] = i;
      labelsIndex[i] = (byte) p.a;
    }
  }

  /** Return supported flags. */
  @Override
  public Set getFlags() {
    return flags;
  }

  /** Linearization of states. */
  private IntArrayList linearize(final FSA fsa) throws IOException {
    /*
     * Compute the states with most inlinks.
     * These should be placed as close to the
     * start of the automaton, as possible so that v-coded addresses are tiny.
     */
    final IntIntHashMap inlinkCount = computeInlinkCount(fsa);

    /*
     * An array of ordered states for serialization.
     */
    final IntArrayList linearized =
        new IntArrayList(0, new BoundedProportionalArraySizingStrategy(1000, 10000, 1.5f));

    /*
     * Determine which states should be linearized first (at fixed positions) so as to
     * minimize the place occupied by goto fields.
     */
    int maxStates = Integer.MAX_VALUE;
    int minInlinkCount = 2;
    int[] states = computeFirstStates(inlinkCount, maxStates, minInlinkCount);

    /*
     * Compute initial addresses, without node rearrangements.
     */
    int serializedSize = linearizeAndCalculateOffsets(fsa, new IntArrayList(), linearized, offsets);

    /*
     * Probe for better node arrangements by selecting between [lower, upper]
     * nodes from the potential candidate nodes list.
     */
    IntArrayList sublist = new IntArrayList();
    sublist.buffer = states;
    sublist.elementsCount = states.length;

    /*
     * Probe the initial region a little bit, looking for optimal cut. It can't be binary search
     * because the result isn't monotonic.
     */
    log(Level.FINE, "Compacting, initial output size: %,d", serializedSize);
    int cutAt = 0;
    for (int cut = Math.min(25, states.length); cut <= Math.min(150, states.length); cut += 25) {
      sublist.elementsCount = cut;
      int newSize = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets);
      log(Level.FINE, "Moved %,d states, output size: %,d", sublist.size(), newSize);
      // Stop probing as soon as moving more states no longer shrinks the output.
      if (newSize >= serializedSize) {
        break;
      }
      cutAt = cut;
    }

    /*
     * Cut at the calculated point and repeat linearization.
     */
    sublist.elementsCount = cutAt;
    int size = linearizeAndCalculateOffsets(fsa, sublist, linearized, offsets);
    log(Level.FINE, "%,d states moved, final size: %,d", sublist.size(), size);

    return linearized;
  }

  // Formats and logs a message at the given level (Locale.ROOT for stable number formatting).
  private void log(Level level, String msg, Object...
      args) {
    logger.log(level, String.format(Locale.ROOT, msg, args));
  }

  /**
   * Linearize all states, putting states in front of the automaton and calculating
   * stable state offsets.
   */
  private int linearizeAndCalculateOffsets(
      FSA fsa, IntArrayList states, IntArrayList linearized, IntIntHashMap offsets)
      throws IOException {
    final BitSet visited = new BitSet();
    final IntStack nodes = new IntStack();
    linearized.clear();

    /*
     * Linearize states with most inlinks first.
     */
    for (int i = 0; i < states.size(); i++) {
      linearizeState(fsa, nodes, linearized, visited, states.get(i));
    }

    /*
     * Linearize the remaining states by chaining them one after another, in depth-order.
     */
    nodes.push(fsa.getRootNode());
    while (!nodes.isEmpty()) {
      final int node = nodes.pop();
      if (visited.get(node)) continue;

      linearizeState(fsa, nodes, linearized, visited, node);
    }

    /*
     * Calculate new state offsets. This is iterative. We start with
     * maximum potential offsets and recalculate until converged.
     */
    int MAX_OFFSET = Integer.MAX_VALUE;
    for (IntCursor c : linearized) {
      offsets.put(c.value, MAX_OFFSET);
    }

    // emitNodes with a null stream is a dry run; repeat until offsets stop changing
    // (emitNodes returns 0 once the layout has converged).
    int i, j = 0;
    while ((i = emitNodes(fsa, null, linearized)) > 0) {
      j = i;
    }
    return j;
  }

  /** Add a state to linearized list. */
  private void linearizeState(
      final FSA fsa, IntStack nodes, IntArrayList linearized, BitSet visited, int node) {
    linearized.add(node);
    visited.set(node);
    // Queue any unvisited targets of this node's arcs for later processing.
    for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
      if (!fsa.isArcTerminal(arc)) {
        final int target = fsa.getEndNode(arc);
        if (!visited.get(target)) nodes.push(target);
      }
    }
  }

  /**
   * Compute the set of states that should be linearized first to minimize other states goto length.
   */
  private int[] computeFirstStates(IntIntHashMap inlinkCount, int maxStates, int minInlinkCount) {
    // Orders holders by (inlink count, state id), ascending — used as a min-heap below.
    Comparator comparator =
        new Comparator() {
          public int compare(IntIntHolder o1, IntIntHolder o2) {
            int v = o1.a - o2.a;
            return v == 0 ?
                (o1.b - o2.b) : v;
          }
        };

    PriorityQueue stateInlink = new PriorityQueue(1, comparator);
    IntIntHolder scratch = new IntIntHolder();

    // Keep up to maxStates states with the highest inlink counts in a min-heap;
    // a candidate replaces the heap's minimum when the heap is full.
    for (IntIntCursor c : inlinkCount) {
      if (c.value > minInlinkCount) {
        scratch.a = c.value;
        scratch.b = c.key;

        if (stateInlink.size() < maxStates || comparator.compare(scratch, stateInlink.peek()) > 0) {
          stateInlink.add(new IntIntHolder(c.value, c.key));
          if (stateInlink.size() > maxStates) {
            stateInlink.remove();
          }
        }
      }
    }

    // Drain the min-heap back-to-front so the result is ordered by descending inlink count.
    int[] states = new int[stateInlink.size()];
    for (int position = states.length; !stateInlink.isEmpty(); ) {
      IntIntHolder i = stateInlink.remove();
      states[--position] = i.b;
    }
    return states;
  }

  /** Compute in-link count for each state. */
  private IntIntHashMap computeInlinkCount(final FSA fsa) {
    IntIntHashMap inlinkCount = new IntIntHashMap();
    BitSet visited = new BitSet();
    IntStack nodes = new IntStack();
    nodes.push(fsa.getRootNode());

    // Depth-first sweep over all reachable states; each state's outgoing arcs are
    // counted exactly once thanks to the visited check.
    while (!nodes.isEmpty()) {
      final int node = nodes.pop();
      if (visited.get(node)) continue;

      visited.set(node);
      for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) {
        if (!fsa.isArcTerminal(arc)) {
          final int target = fsa.getEndNode(arc);
          inlinkCount.putOrAdd(target, 1, 1);
          if (!visited.get(target)) nodes.push(target);
        }
      }
    }

    return inlinkCount;
  }

  /**
   * Emits all nodes of the linearized automaton; with a null stream this is a dry run that
   * only recomputes state offsets. Returns the total size when offsets changed, zero otherwise.
   */
  private int emitNodes(FSA fsa, OutputStream os, IntArrayList linearized) throws IOException {
    int offset = 0;

    // Add epsilon state.
    offset += emitNodeData(os, 0);
    if (fsa.getRootNode() != 0)
      offset += emitArc(os, BIT_LAST_ARC, (byte) '^', offsets.get(fsa.getRootNode()));
    else offset += emitArc(os, BIT_LAST_ARC, (byte) '^', 0);

    boolean offsetsChanged = false;
    final int max = linearized.size();
    for (IntCursor c : linearized) {
      final int state = c.value;
      final int nextState = c.index + 1 < max ?
              linearized.get(c.index + 1) : NO_STATE;

      if (os == null) {
        // Dry run: recompute this state's offset and remember whether anything moved.
        offsetsChanged |= (offsets.get(state) != offset);
        offsets.put(state, offset);
      } else {
        // Real emit: offsets must already have converged.
        assert offsets.get(state) == offset : state + " " + offsets.get(state) + " " + offset;
      }

      offset += emitNodeData(os, withNumbers ? numbers.get(state) : 0);
      offset += emitNodeArcs(fsa, os, state, nextState);
    }

    return offsetsChanged ? offset : 0;
  }

  /** Emit all arcs of a single node. */
  private int emitNodeArcs(FSA fsa, OutputStream os, final int state, final int nextState)
      throws IOException {
    int offset = 0;
    for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) {
      int targetOffset;
      final int target;

      if (fsa.isArcTerminal(arc)) {
        target = 0;
        targetOffset = 0;
      } else {
        target = fsa.getEndNode(arc);
        targetOffset = offsets.get(target);
      }

      int flags = 0;
      if (fsa.isArcFinal(arc)) {
        flags |= BIT_FINAL_ARC;
      }
      if (fsa.getNextArc(arc) == 0) {
        flags |= BIT_LAST_ARC;
      }
      // If the target is laid out immediately after this state, drop the goto field.
      if (targetOffset != 0 && target == nextState) {
        flags |= BIT_TARGET_NEXT;
        targetOffset = 0;
      }

      offset += emitArc(os, flags, fsa.getArcLabel(arc), targetOffset);
    }
    return offset;
  }

  /** Emits one arc and returns the number of bytes it occupies (os may be null for a dry run). */
  private int emitArc(OutputStream os, int flags, byte label, int targetOffset) throws IOException {
    int length = 0;

    // Frequent labels are folded into the flags byte via the labels index.
    int labelIndex = labelsInvIndex[label & 0xff];
    if (labelIndex > 0) {
      if (os != null) os.write(flags | labelIndex);
      length++;
    } else {
      if (os != null) {
        os.write(flags);
        os.write(label);
      }
      length += 2;
    }

    if ((flags & BIT_TARGET_NEXT) == 0) {
      int len = writeVInt(scratch, 0, targetOffset);
      if (os != null) {
        os.write(scratch, 0, len);
      }
      length += len;
    }

    return length;
  }

  /** Emits the node's v-coded number field, if serializing with numbers; returns its size. */
  private int emitNodeData(OutputStream os, int number) throws IOException {
    int size = 0;

    if (withNumbers) {
      size = writeVInt(scratch, 0, number);
      if (os != null) {
        os.write(scratch, 0, size);
      }
    }

    return size;
  }

  /** CFSA2 stores no filler byte; this serializer rejects attempts to set one. */
  @Override
  public CFSA2Serializer withFiller(byte filler) {
    throw new UnsupportedOperationException("CFSA2 does not support filler. Use .info file.");
  }

  /** CFSA2 stores no annotation separator; this serializer rejects attempts to set one. */
  @Override
  public CFSA2Serializer withAnnotationSeparator(byte annotationSeparator) {
    throw new UnsupportedOperationException("CFSA2 does not support separator. Use .info file.");
  }

  /** Write a v-int to a byte array. */
  static int writeVInt(byte[] array, int offset, int value) {
    assert value >= 0 : "Can't v-code negative ints.";

    // 7 data bits per byte, high bit marks continuation; least-significant group first.
    while (value > 0x7F) {
      array[offset++] = (byte) (0x80 | (value & 0x7F));
      value >>= 7;
    }
    array[offset++] = (byte) value;

    return offset;
  }
}

================================================
FILE: morfologik-fsa-builders/src/main/java/morfologik/fsa/builders/ConstantArcSizeFSA.java
================================================
package morfologik.fsa.builders;

import java.util.Collections;
import java.util.Set;
import morfologik.fsa.FSA;
import morfologik.fsa.FSAFlags;

/**
 * An FSA with constant-size arc representation produced directly by {@link FSABuilder}.
 *
 * @see FSABuilder
 */
final class ConstantArcSizeFSA extends FSA {
  /** Size of the target address field (constant for the builder). */
  public static final int TARGET_ADDRESS_SIZE = 4;

  /** Size of the flags field (constant for the builder). */
  public static final int FLAGS_SIZE = 1;

  /** Size of the label field (constant for the builder). */
  public static final int LABEL_SIZE = 1;

  /** Size of a single arc structure. */
  public static final int ARC_SIZE = FLAGS_SIZE + LABEL_SIZE + TARGET_ADDRESS_SIZE;

  /** Offset of the flags field inside an arc. */
  public static final int FLAGS_OFFSET = 0;

  /** Offset of the label field inside an arc. */
  public static final int LABEL_OFFSET = FLAGS_SIZE;

  /** Offset of the address field inside an arc. */
  public static final int ADDRESS_OFFSET = LABEL_OFFSET + LABEL_SIZE;

  /** A dummy address of the terminal state. */
  static final int TERMINAL_STATE = 0;

  /** An arc flag indicating the target node of an arc corresponds to a final state.
*/
  public static final int BIT_ARC_FINAL = 1 << 1;

  /** An arc flag indicating the arc is last within its state. */
  public static final int BIT_ARC_LAST = 1 << 0;

  /**
   * An epsilon state. The first and only arc of this state points either to the root or to the
   * terminal state, indicating an empty automaton.
   */
  private final int epsilon;

  /** FSA data, serialized as a byte array. */
  private final byte[] data;

  /**
   * @param data FSA data. There must be no trailing bytes after the last state.
   * @param epsilon Offset of the epsilon state inside {@code data}; the builder always emits it
   *     at offset 0 (enforced by the assertion below).
   */
  ConstantArcSizeFSA(byte[] data, int epsilon) {
    assert epsilon == 0 : "Epsilon is not zero?";

    this.epsilon = epsilon;
    this.data = data;
  }

  /** The root is the target of the single arc leaving the epsilon state. */
  @Override
  public int getRootNode() {
    return getEndNode(getFirstArc(epsilon));
  }

  /** A node's identifier is the byte offset of its first arc, so this is the identity. */
  @Override
  public int getFirstArc(int node) {
    return node;
  }

  /** Linear scan over the node's arcs; returns 0 if no arc carries {@code label}. */
  @Override
  public int getArc(int node, byte label) {
    for (int arc = getFirstArc(node); arc != 0; arc = getNextArc(arc)) {
      if (getArcLabel(arc) == label) return arc;
    }
    return 0;
  }

  /** Arcs of a node are laid out contiguously, each exactly {@link #ARC_SIZE} bytes. */
  @Override
  public int getNextArc(int arc) {
    if (isArcLast(arc)) return 0;
    return arc + ARC_SIZE;
  }

  @Override
  public byte getArcLabel(int arc) {
    return data[arc + LABEL_OFFSET];
  }

  /** Returns the target state address stored in an arc.
*/
  private int getArcTarget(int arc) {
    arc += ADDRESS_OFFSET;
    // Big-endian reassembly of the 4-byte address. The top byte is taken without
    // masking: its sign extension only affects bits that the << 24 fills anyway.
    return (data[arc]) << 24
        | (data[arc + 1] & 0xff) << 16
        | (data[arc + 2] & 0xff) << 8
        | (data[arc + 3] & 0xff);
  }

  @Override
  public boolean isArcFinal(int arc) {
    return (data[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0;
  }

  /** A terminal arc is one whose stored target is the dummy address 0 (TERMINAL_STATE). */
  @Override
  public boolean isArcTerminal(int arc) {
    return getArcTarget(arc) == 0;
  }

  private boolean isArcLast(int arc) {
    return (data[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0;
  }

  @Override
  public int getEndNode(int arc) {
    return getArcTarget(arc);
  }

  /** This intermediate in-memory representation advertises no optional automaton flags. */
  @Override
  public Set getFlags() {
    return Collections.emptySet();
  }
}

================================================
FILE: morfologik-fsa-builders/src/main/java/morfologik/fsa/builders/FSA5Serializer.java
================================================
package morfologik.fsa.builders;

import static morfologik.fsa.FSAFlags.*;

import com.carrotsearch.hppc.IntIntHashMap;
import com.carrotsearch.hppc.IntStack;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.BitSet;
import java.util.EnumSet;
import java.util.Set;
import morfologik.fsa.FSA;
import morfologik.fsa.FSA5;
import morfologik.fsa.FSA5;
import morfologik.fsa.FSAFlags;
import morfologik.fsa.FSAHeader;

/**
 * Serializes in-memory {@link FSA} graphs to a binary format compatible with Jan Daciuk's fsa
 * package's FSA5 format.
 *
 *

It is possible to serialize the automaton with numbers required for perfect hashing. See * {@link #withNumbers()} method. * * @see FSA5 * @see FSA#read(java.io.InputStream) */ public final class FSA5Serializer implements FSASerializer { /** Maximum number of bytes for a serialized arc. */ private static final int MAX_ARC_SIZE = 1 + 5; /** Maximum number of bytes for per-node data. */ private static final int MAX_NODE_DATA_SIZE = 16; /** Number of bytes for the arc's flags header (arc representation without the goto address). */ private static final int SIZEOF_FLAGS = 1; /** Supported flags. */ private static final EnumSet flags = EnumSet.of(NUMBERS, SEPARATORS, FLEXIBLE, STOPBIT, NEXTBIT); /** * @see FSA5#filler */ public byte fillerByte = FSA5.DEFAULT_FILLER; /** * @see FSA5#annotation */ public byte annotationByte = FSA5.DEFAULT_ANNOTATION; /** * true if we should serialize with numbers. * * @see #withNumbers() */ private boolean withNumbers; /** A hash map of [state, offset] pairs. */ private IntIntHashMap offsets = new IntIntHashMap(); /** A hash map of [state, right-language-count] pairs. */ private IntIntHashMap numbers = new IntIntHashMap(); /** * Serialize the automaton with the number of right-language sequences in each node. This is * required to implement perfect hashing. The numbering also preserves the order of input * sequences. * * @return Returns the same object for easier call chaining. */ public FSA5Serializer withNumbers() { withNumbers = true; return this; } /** {@inheritDoc} */ @Override public FSA5Serializer withFiller(byte filler) { this.fillerByte = filler; return this; } /** {@inheritDoc} */ @Override public FSA5Serializer withAnnotationSeparator(byte annotationSeparator) { this.annotationByte = annotationSeparator; return this; } /** * Serialize root state s to an output stream in FSA5 format. * * @see #withNumbers() * @return Returns os for chaining. 
*/ @Override public T serialize(final FSA fsa, T os) throws IOException { // Prepare space for arc offsets and linearize all the states. int[] linearized = linearize(fsa); /* * Calculate the number of bytes required for the node data, if * serializing with numbers. */ int nodeDataLength = 0; if (withNumbers) { this.numbers = FSAUtils.rightLanguageForAllStates(fsa); int maxNumber = numbers.get(fsa.getRootNode()); while (maxNumber > 0) { nodeDataLength++; maxNumber >>>= 8; } } // Calculate minimal goto length. int gtl = 1; while (true) { // First pass: calculate offsets of states. if (!emitArcs(fsa, null, linearized, gtl, nodeDataLength)) { gtl++; continue; } // Second pass: check if goto overflows anywhere. if (emitArcs(fsa, null, linearized, gtl, nodeDataLength)) break; gtl++; } /* * Emit the header. */ FSAHeader.write(os, FSA5.VERSION); os.write(fillerByte); os.write(annotationByte); os.write((nodeDataLength << 4) | gtl); /* * Emit the automaton. */ boolean gtlUnchanged = emitArcs(fsa, os, linearized, gtl, nodeDataLength); assert gtlUnchanged : "gtl changed in the final pass."; return os; } /** Return supported flags. */ @Override public Set getFlags() { return flags; } /** Linearization of states. */ private int[] linearize(final FSA fsa) { int[] linearized = new int[0]; int last = 0; BitSet visited = new BitSet(); IntStack nodes = new IntStack(); nodes.push(fsa.getRootNode()); while (!nodes.isEmpty()) { final int node = nodes.pop(); if (visited.get(node)) { continue; } if (last >= linearized.length) { linearized = Arrays.copyOf(linearized, linearized.length + 100000); } visited.set(node); linearized[last++] = node; for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { if (!fsa.isArcTerminal(arc)) { int target = fsa.getEndNode(arc); if (!visited.get(target)) nodes.push(target); } } } return Arrays.copyOf(linearized, last); } /** Update arc offsets assuming the given goto length. 
*/ private boolean emitArcs(FSA fsa, OutputStream os, int[] linearized, int gtl, int nodeDataLength) throws IOException { final ByteBuffer bb = ByteBuffer.allocate(Math.max(MAX_NODE_DATA_SIZE, MAX_ARC_SIZE)); int offset = 0; // Add dummy terminal state. offset += emitNodeData(bb, os, nodeDataLength, 0); offset += emitArc(bb, os, gtl, 0, (byte) 0, 0); // Add epsilon state. offset += emitNodeData(bb, os, nodeDataLength, 0); if (fsa.getRootNode() != 0) offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC | FSA5.BIT_TARGET_NEXT, (byte) '^', 0); else offset += emitArc(bb, os, gtl, FSA5.BIT_LAST_ARC, (byte) '^', 0); int maxStates = linearized.length; for (int j = 0; j < maxStates; j++) { final int s = linearized[j]; if (os == null) { offsets.put(s, offset); } else { assert offsets.get(s) == offset : s + " " + offsets.get(s) + " " + offset; } offset += emitNodeData(bb, os, nodeDataLength, withNumbers ? numbers.get(s) : 0); for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) { int targetOffset; final int target; if (fsa.isArcTerminal(arc)) { targetOffset = 0; target = 0; } else { target = fsa.getEndNode(arc); targetOffset = offsets.get(target); } int flags = 0; if (fsa.isArcFinal(arc)) { flags |= FSA5.BIT_FINAL_ARC; } if (fsa.getNextArc(arc) == 0) { flags |= FSA5.BIT_LAST_ARC; if (j + 1 < maxStates && target == linearized[j + 1] && targetOffset != 0) { flags |= FSA5.BIT_TARGET_NEXT; targetOffset = 0; } } int bytes = emitArc(bb, os, gtl, flags, fsa.getArcLabel(arc), targetOffset); if (bytes < 0) // gtl too small. interrupt eagerly. return false; offset += bytes; } } return true; } /** */ private int emitArc( ByteBuffer bb, OutputStream os, int gtl, int flags, byte label, int targetOffset) throws IOException { int arcBytes = (flags & FSA5.BIT_TARGET_NEXT) != 0 ? SIZEOF_FLAGS : gtl; flags |= (targetOffset << 3); bb.put(label); for (int b = 0; b < arcBytes; b++) { bb.put((byte) flags); flags >>>= 8; } if (flags != 0) { // gtl too small. interrupt eagerly. 
return -1; } bb.flip(); int bytes = bb.remaining(); if (os != null) { os.write(bb.array(), bb.position(), bb.remaining()); } bb.clear(); return bytes; } /** */ private int emitNodeData(ByteBuffer bb, OutputStream os, int nodeDataLength, int number) throws IOException { if (nodeDataLength > 0 && os != null) { for (int i = 0; i < nodeDataLength; i++) { bb.put((byte) number); number >>>= 8; } bb.flip(); os.write(bb.array(), bb.position(), bb.remaining()); bb.clear(); } return nodeDataLength; } } ================================================ FILE: morfologik-fsa-builders/src/main/java/morfologik/fsa/builders/FSABuilder.java ================================================ package morfologik.fsa.builders; import static morfologik.fsa.builders.ConstantArcSizeFSA.*; import java.util.*; import morfologik.fsa.FSA; /** * Fast, memory-conservative finite state automaton builder, returning an in-memory {@link FSA} that * is a tradeoff between construction speed and memory consumption. Use serializers to compress the * returned automaton into more compact form. * * @see FSASerializer */ public final class FSABuilder { /** * Debug and information constants. * * @see FSABuilder#getInfo() */ public enum InfoEntry { SERIALIZATION_BUFFER_SIZE("Serialization buffer size"), SERIALIZATION_BUFFER_REALLOCATIONS("Serialization buffer reallocs"), CONSTANT_ARC_AUTOMATON_SIZE("Constant arc FSA size"), MAX_ACTIVE_PATH_LENGTH("Max active path"), STATE_REGISTRY_TABLE_SLOTS("Registry hash slots"), STATE_REGISTRY_SIZE("Registry hash entries"), ESTIMATED_MEMORY_CONSUMPTION_MB("Estimated mem consumption (MB)"); private final String stringified; InfoEntry(String stringified) { this.stringified = stringified; } @Override public String toString() { return stringified; } } /** A megabyte. */ private static final int MB = 1024 * 1024; /** Internal serialized FSA buffer expand ratio. */ private static final int BUFFER_GROWTH_SIZE = 5 * MB; /** Maximum number of labels from a single state. 
*/ private static final int MAX_LABELS = 256; /** A comparator comparing full byte arrays. Unsigned byte comparisons ('C'-locale). */ public static final Comparator LEXICAL_ORDERING = new Comparator() { public int compare(byte[] o1, byte[] o2) { return FSABuilder.compare(o1, 0, o1.length, o2, 0, o2.length); } }; /** Internal serialized FSA buffer expand ratio. */ private final int bufferGrowthSize; /** * Holds serialized and mutable states. Each state is a sequential list of arcs, the last arc is * marked with {@link #BIT_ARC_LAST}. */ private byte[] serialized = new byte[0]; /** * Number of bytes already taken in {@link #serialized}. Start from 1 to keep 0 a sentinel value * (for the hash set and final state). */ private int size; /** * States on the "active path" (still mutable). Values are addresses of each state's first arc. */ private int[] activePath = new int[0]; /** Current length of the active path. */ private int activePathLen; /** The next offset at which an arc will be added to the given state on {@link #activePath}. */ private int[] nextArcOffset = new int[0]; /** Root state. If negative, the automaton has been built already and cannot be extended. */ private int root; /** * An epsilon state. The first and only arc of this state points either to the root or to the * terminal state, indicating an empty automaton. */ private int epsilon; /** * Hash set of state addresses in {@link #serialized}, hashed by {@link #hash(int, int)}. Zero * reserved for an unoccupied slot. */ private int[] hashSet = new int[2]; /** Number of entries currently stored in {@link #hashSet}. */ private int hashSize = 0; /** * Previous sequence added to the automaton in {@link #add(byte[], int, int)}. Used in assertions * only. */ private byte[] previous; /** Information about the automaton and its compilation. */ private TreeMap info; /** {@link #previous} sequence's length, used in assertions only. 
*/ private int previousLength; /** */ public FSABuilder() { this(BUFFER_GROWTH_SIZE); } /** * @param bufferGrowthSize Buffer growth size (in bytes) when constructing the automaton. */ public FSABuilder(int bufferGrowthSize) { this.bufferGrowthSize = Math.max(bufferGrowthSize, ARC_SIZE * MAX_LABELS); // Allocate epsilon state. epsilon = allocateState(1); serialized[epsilon + FLAGS_OFFSET] |= BIT_ARC_LAST; // Allocate root, with an initial empty set of output arcs. expandActivePath(1); root = activePath[0]; } /** * Add a single sequence of bytes to the FSA. The input must be lexicographically greater than any * previously added sequence. * * @param sequence The array holding input sequence of bytes. * @param start Starting offset (inclusive) * @param len Length of the input sequence (at least 1 byte). */ public void add(byte[] sequence, int start, int len) { assert serialized != null : "Automaton already built."; assert previous == null || len == 0 || compare(previous, 0, previousLength, sequence, start, len) <= 0 : "Input must be sorted: " + Arrays.toString(Arrays.copyOf(previous, previousLength)) + " >= " + Arrays.toString(Arrays.copyOfRange(sequence, start, len)); assert setPrevious(sequence, start, len); // Determine common prefix length. final int commonPrefix = commonPrefix(sequence, start, len); // Make room for extra states on active path, if needed. expandActivePath(len); // Freeze all the states after the common prefix. for (int i = activePathLen - 1; i > commonPrefix; i--) { final int frozenState = freezeState(i); setArcTarget(nextArcOffset[i - 1] - ARC_SIZE, frozenState); nextArcOffset[i] = activePath[i]; } // Create arcs to new suffix states. for (int i = commonPrefix + 1, j = start + commonPrefix; i <= len; i++) { final int p = nextArcOffset[i - 1]; serialized[p + FLAGS_OFFSET] = (byte) (i == len ? BIT_ARC_FINAL : 0); serialized[p + LABEL_OFFSET] = sequence[j++]; setArcTarget(p, i == len ? 
TERMINAL_STATE : activePath[i]); nextArcOffset[i - 1] = p + ARC_SIZE; } // Save last sequence's length so that we don't need to calculate it again. this.activePathLen = len; } /** Number of serialization buffer reallocations. */ private int serializationBufferReallocations; /** * @return Finalizes the construction of the automaton and returns it. */ public FSA complete() { add(new byte[0], 0, 0); if (nextArcOffset[0] - activePath[0] == 0) { // An empty FSA. setArcTarget(epsilon, TERMINAL_STATE); } else { // An automaton with at least a single arc from root. root = freezeState(0); setArcTarget(epsilon, root); } info = new TreeMap(); info.put(InfoEntry.SERIALIZATION_BUFFER_SIZE, serialized.length); info.put(InfoEntry.SERIALIZATION_BUFFER_REALLOCATIONS, serializationBufferReallocations); info.put(InfoEntry.CONSTANT_ARC_AUTOMATON_SIZE, size); info.put(InfoEntry.MAX_ACTIVE_PATH_LENGTH, activePath.length); info.put(InfoEntry.STATE_REGISTRY_TABLE_SLOTS, hashSet.length); info.put(InfoEntry.STATE_REGISTRY_SIZE, hashSize); info.put( InfoEntry.ESTIMATED_MEMORY_CONSUMPTION_MB, (this.serialized.length + this.hashSet.length * 4) / (double) MB); final FSA fsa = new ConstantArcSizeFSA(java.util.Arrays.copyOf(this.serialized, this.size), epsilon); this.serialized = null; this.hashSet = null; return fsa; } /** * Build a minimal, deterministic automaton from a sorted list of byte sequences. * * @param input Input sequences to build automaton from. * @return Returns the automaton encoding all input sequences. */ public static FSA build(byte[][] input) { final FSABuilder builder = new FSABuilder(); for (byte[] chs : input) { builder.add(chs, 0, chs.length); } return builder.complete(); } /** * Build a minimal, deterministic automaton from an iterable list of byte sequences. * * @param input Input sequences to build automaton from. * @return Returns the automaton encoding all input sequences. 
*/ public static FSA build(Iterable input) { final FSABuilder builder = new FSABuilder(); for (byte[] chs : input) { builder.add(chs, 0, chs.length); } return builder.complete(); } /** * @return Returns various statistics concerning the FSA and its compilation. * @see InfoEntry */ public Map getInfo() { return info; } /** Is this arc the state's last? */ private boolean isArcLast(int arc) { return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_LAST) != 0; } /** Is this arc final? */ private boolean isArcFinal(int arc) { return (serialized[arc + FLAGS_OFFSET] & BIT_ARC_FINAL) != 0; } /** Get label's arc. */ private byte getArcLabel(int arc) { return serialized[arc + LABEL_OFFSET]; } /** Fills the target state address of an arc. */ private void setArcTarget(int arc, int state) { arc += ADDRESS_OFFSET + TARGET_ADDRESS_SIZE; for (int i = 0; i < TARGET_ADDRESS_SIZE; i++) { serialized[--arc] = (byte) state; state >>>= 8; } } /** Returns the address of an arc. */ private int getArcTarget(int arc) { arc += ADDRESS_OFFSET; return (serialized[arc]) << 24 | (serialized[arc + 1] & 0xff) << 16 | (serialized[arc + 2] & 0xff) << 8 | (serialized[arc + 3] & 0xff); } /** * @return The number of common prefix characters with the previous sequence. */ private int commonPrefix(byte[] sequence, int start, int len) { // Empty root state case. final int max = Math.min(len, activePathLen); int i; for (i = 0; i < max; i++) { final int lastArc = nextArcOffset[i] - ARC_SIZE; if (sequence[start++] != getArcLabel(lastArc)) { break; } } return i; } /** * Freeze a state: try to find an equivalent state in the interned states dictionary first, if * found, return it, otherwise, serialize the mutable state at activePathIndex and * return it. */ private int freezeState(final int activePathIndex) { final int start = activePath[activePathIndex]; final int end = nextArcOffset[activePathIndex]; final int len = end - start; // Set the last arc flag on the current active path's state. 
serialized[end - ARC_SIZE + FLAGS_OFFSET] |= BIT_ARC_LAST; // Try to locate a state with an identical content in the hash set. final int bucketMask = (hashSet.length - 1); int slot = hash(start, len) & bucketMask; for (int i = 0; ; ) { int state = hashSet[slot]; if (state == 0) { state = hashSet[slot] = serialize(activePathIndex); if (++hashSize > hashSet.length / 2) expandAndRehash(); return state; } else if (equivalent(state, start, len)) { return state; } slot = (slot + (++i)) & bucketMask; } } /** Reallocate and rehash the hash set. */ private void expandAndRehash() { final int[] newHashSet = new int[hashSet.length * 2]; final int bucketMask = (newHashSet.length - 1); for (int j = 0; j < hashSet.length; j++) { final int state = hashSet[j]; if (state > 0) { int slot = hash(state, stateLength(state)) & bucketMask; for (int i = 0; newHashSet[slot] > 0; ) { slot = (slot + (++i)) & bucketMask; } newHashSet[slot] = state; } } this.hashSet = newHashSet; } /** The total length of the serialized state data (all arcs). */ private int stateLength(int state) { int arc = state; while (!isArcLast(arc)) { arc += ARC_SIZE; } return arc - state + ARC_SIZE; } /** Return true if two regions in {@link #serialized} are identical. */ private boolean equivalent(int start1, int start2, int len) { if (start1 + len > size || start2 + len > size) return false; while (len-- > 0) if (serialized[start1++] != serialized[start2++]) return false; return true; } /** Serialize a given state on the active path. */ private int serialize(final int activePathIndex) { expandBuffers(); final int newState = size; final int start = activePath[activePathIndex]; final int len = nextArcOffset[activePathIndex] - start; System.arraycopy(serialized, start, serialized, newState, len); size += len; return newState; } /** Hash code of a fragment of {@link #serialized} array. 
*/
  private int hash(int start, int byteCount) {
    assert byteCount % ARC_SIZE == 0 : "Not an arc multiply?";

    int h = 0;
    // Mix each arc's label, target address and final-flag into the hash.
    for (int arcs = byteCount / ARC_SIZE; --arcs >= 0; start += ARC_SIZE) {
      h = 17 * h + getArcLabel(start);
      h = 17 * h + getArcTarget(start);
      if (isArcFinal(start)) h += 17;
    }
    return h;
  }

  /** Append a new mutable state to the active path. */
  private void expandActivePath(int size) {
    if (activePath.length < size) {
      final int p = activePath.length;
      activePath = java.util.Arrays.copyOf(activePath, size);
      nextArcOffset = java.util.Arrays.copyOf(nextArcOffset, size);
      // Preallocate full-width storage for each new state (worst case: all labels used).
      for (int i = p; i < size; i++) {
        nextArcOffset[i] = activePath[i] = allocateState(/* assume max labels count */ MAX_LABELS);
      }
    }
  }

  /** Expand internal buffers for the next state. */
  private void expandBuffers() {
    if (this.serialized.length < size + ARC_SIZE * MAX_LABELS) {
      serialized = java.util.Arrays.copyOf(serialized, serialized.length + bufferGrowthSize);
      serializationBufferReallocations++;
    }
  }

  /**
   * Allocate space for a state with the given number of outgoing labels.
   *
   * @return state offset
   */
  private int allocateState(int labels) {
    expandBuffers();
    final int state = size;
    size += labels * ARC_SIZE;
    return state;
  }

  /**
   * Copy the current sequence into the internal {@code previous} buffer. Always returns
   * {@code true} so the call can be wrapped in an assert (it then only runs with -ea).
   */
  private boolean setPrevious(byte[] sequence, int start, int length) {
    if (previous == null || previous.length < length) {
      previous = new byte[length];
    }
    System.arraycopy(sequence, start, previous, 0, length);
    previousLength = length;
    return true;
  }

  /**
   * Lexicographic order of input sequences. By default, consistent with the "C" sort (absolute
   * value of bytes, 0-255).
*/ private static int compare(byte[] s1, int start1, int lens1, byte[] s2, int start2, int lens2) { final int max = Math.min(lens1, lens2); for (int i = 0; i < max; i++) { final byte c1 = s1[start1++]; final byte c2 = s2[start2++]; if (c1 != c2) return (c1 & 0xff) - (c2 & 0xff); } return lens1 - lens2; } } ================================================ FILE: morfologik-fsa-builders/src/main/java/morfologik/fsa/builders/FSAInfo.java ================================================ package morfologik.fsa.builders; import com.carrotsearch.hppc.IntIntHashMap; import java.util.BitSet; import morfologik.fsa.FSA; import morfologik.fsa.FSA5; /** Compute additional information about an FSA: number of arcs, nodes, etc. */ public final class FSAInfo { /** Computes the exact number of states and nodes by recursively traversing the FSA. */ private static class NodeVisitor { final BitSet visitedArcs = new BitSet(); final BitSet visitedNodes = new BitSet(); int nodes; int arcs; int totalArcs; private final FSA fsa; NodeVisitor(FSA fsa) { this.fsa = fsa; } public void visitNode(final int node) { if (visitedNodes.get(node)) { return; } visitedNodes.set(node); nodes++; for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { if (!visitedArcs.get(arc)) { arcs++; } totalArcs++; visitedArcs.set(arc); if (!fsa.isArcTerminal(arc)) { visitNode(fsa.getEndNode(arc)); } } } } /** Computes the exact number of final states. 
*/ private static class FinalStateVisitor { final IntIntHashMap visitedNodes = new IntIntHashMap(); private final FSA fsa; FinalStateVisitor(FSA fsa) { this.fsa = fsa; } public int visitNode(int node) { int index = visitedNodes.indexOf(node); if (index >= 0) { return visitedNodes.indexGet(index); } int fromHere = 0; for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { if (fsa.isArcFinal(arc)) fromHere++; if (!fsa.isArcTerminal(arc)) { fromHere += visitNode(fsa.getEndNode(arc)); } } visitedNodes.put(node, fromHere); return fromHere; } } /** Number of nodes in the automaton. */ public final int nodeCount; /** * Number of arcs in the automaton, excluding an arcs from the zero node (initial) and an arc from * the start node to the root node. */ public final int arcsCount; /** Total number of arcs, counting arcs that physically overlap due to merging. */ public final int arcsCountTotal; /** Number of final states (number of input sequences stored in the automaton). */ public final int finalStatesCount; /** Arcs size (in serialized form). 
*/ public final int size; /* * */ public FSAInfo(FSA fsa) { final NodeVisitor w = new NodeVisitor(fsa); int root = fsa.getRootNode(); if (root > 0) { w.visitNode(root); } this.nodeCount = 1 + w.nodes; this.arcsCount = 1 + w.arcs; this.arcsCountTotal = 1 + w.totalArcs; final FinalStateVisitor fsv = new FinalStateVisitor(fsa); this.finalStatesCount = fsv.visitNode(fsa.getRootNode()); if (fsa instanceof FSA5) { this.size = ((FSA5) fsa).arcs.length; } else { this.size = 0; } } /* * */ public FSAInfo(int nodeCount, int arcsCount, int arcsCountTotal, int finalStatesCount) { this.nodeCount = nodeCount; this.arcsCount = arcsCount; this.arcsCountTotal = arcsCountTotal; this.finalStatesCount = finalStatesCount; this.size = 0; } /* * */ @Override public String toString() { return "Nodes: " + nodeCount + ", arcs visited: " + arcsCount + ", arcs total: " + arcsCountTotal + ", final states: " + finalStatesCount + ", size: " + size; } } ================================================ FILE: morfologik-fsa-builders/src/main/java/morfologik/fsa/builders/FSASerializer.java ================================================ package morfologik.fsa.builders; import java.io.IOException; import java.io.OutputStream; import java.util.Set; import morfologik.fsa.FSA; import morfologik.fsa.FSAFlags; /** All FSA serializers (to binary formats) will implement this interface. */ public interface FSASerializer { /** * Serialize a finite state automaton to an output stream. * * @param fsa The automaton to serialize. * @param os The output stream to serialize to. * @param A subclass of {@link OutputStream}, returned for chaining. * @return Returns T for chaining. * @throws IOException Rethrown if an I/O error occurs. */ public T serialize(FSA fsa, T os) throws IOException; /** * @return Returns the set of flags supported by the serializer (and the output automaton). */ public Set getFlags(); /** * Sets the filler separator (only if {@link #getFlags()} returns {@link FSAFlags#SEPARATORS}). 
* * @param filler The filler separator byte. * @return Returns this for call chaining. */ public FSASerializer withFiller(byte filler); /** * Sets the annotation separator (only if {@link #getFlags()} returns {@link * FSAFlags#SEPARATORS}). * * @param annotationSeparator The filler separator byte. * @return Returns this for call chaining. */ public FSASerializer withAnnotationSeparator(byte annotationSeparator); /** * Enables support for right language count on nodes, speeding up perfect hash counts (only if * {@link #getFlags()} returns {@link FSAFlags#NUMBERS}). * * @return Returns this for call chaining. */ public FSASerializer withNumbers(); } ================================================ FILE: morfologik-fsa-builders/src/main/java/morfologik/fsa/builders/FSAUtils.java ================================================ package morfologik.fsa.builders; import com.carrotsearch.hppc.IntIntHashMap; import java.io.IOException; import java.io.StringWriter; import java.io.Writer; import java.util.BitSet; import java.util.TreeMap; import morfologik.fsa.FSA; import morfologik.fsa.FSA5; import morfologik.fsa.FSAFlags; import morfologik.fsa.StateVisitor; /** Other FSA-related utilities not directly associated with the class hierarchy. */ public final class FSAUtils { public static final class IntIntHolder { public int a; public int b; public IntIntHolder(int a, int b) { this.a = a; this.b = b; } public IntIntHolder() {} } /** * Returns the right-language reachable from a given FSA node, formatted as an input for the * graphviz package (expressed in the dot language). * * @param fsa The automaton to visualize. * @param node Starting node (subgraph will be visualized unless it's the automaton's root node). * @return Returns the dot language description of the automaton. 
*/ public static String toDot(FSA fsa, int node) { try { StringWriter w = new StringWriter(); toDot(w, fsa, node); return w.toString(); } catch (IOException e) { throw new RuntimeException(e); } } /** * Saves the right-language reachable from a given FSA node, formatted as an input for the * graphviz package (expressed in the dot language), to the given writer. * * @param w The writer to write dot language description of the automaton. * @param fsa The automaton to visualize. * @param node Starting node (subgraph will be visualized unless it's the automaton's root node). * @throws IOException Rethrown if an I/O exception occurs. */ public static void toDot(Writer w, FSA fsa, int node) throws IOException { w.write("digraph Automaton {\n"); w.write(" rankdir = LR;\n"); final BitSet visited = new BitSet(); w.write(" stop [shape=doublecircle,label=\"\"];\n"); w.write(" initial [shape=plaintext,label=\"\"];\n"); w.write(" initial -> " + node + "\n\n"); visitNode(w, 0, fsa, node, visited); w.write("}\n"); } private static void visitNode(Writer w, int d, FSA fsa, int s, BitSet visited) throws IOException { visited.set(s); w.write(" "); w.write(Integer.toString(s)); if (fsa.getFlags().contains(FSAFlags.NUMBERS)) { int nodeNumber = fsa.getRightLanguageCount(s); w.write(" [shape=circle,label=\"" + nodeNumber + "\"];\n"); } else { w.write(" [shape=circle,label=\"\"];\n"); } for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) { w.write(" "); w.write(Integer.toString(s)); w.write(" -> "); if (fsa.isArcTerminal(arc)) { w.write("stop"); } else { w.write(Integer.toString(fsa.getEndNode(arc))); } final byte label = fsa.getArcLabel(arc); w.write(" [label=\""); if (Character.isLetterOrDigit(label)) w.write((char) label); else { w.write("0x"); w.write(Integer.toHexString(label & 0xFF)); } w.write("\""); if (fsa.isArcFinal(arc)) w.write(" arrowhead=\"tee\""); if (fsa instanceof FSA5) { if (((FSA5) fsa).isNextSet(arc)) { w.write(" color=\"blue\""); } } 
w.write("]\n"); } for (int arc = fsa.getFirstArc(s); arc != 0; arc = fsa.getNextArc(arc)) { if (!fsa.isArcTerminal(arc)) { int endNode = fsa.getEndNode(arc); if (!visited.get(endNode)) { visitNode(w, d + 1, fsa, endNode, visited); } } } } /** * Calculate fan-out ratio (how many nodes have a given number of outgoing arcs). * * @param fsa The automaton to calculate fanout for. * @param root The starting node for calculations. * @return The returned map contains keys for the number of outgoing arcs and an associated value * being the number of nodes with that arc number. */ public static TreeMap calculateFanOuts(final FSA fsa, int root) { final int[] result = new int[256]; fsa.visitInPreOrder( new StateVisitor() { public boolean accept(int state) { int count = 0; for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) { count++; } result[count]++; return true; } }); TreeMap output = new TreeMap(); int low = 1; // Omit #0, there is always a single node like that (dummy). while (low < result.length && result[low] == 0) { low++; } int high = result.length - 1; while (high >= 0 && result[high] == 0) { high--; } for (int i = low; i <= high; i++) { output.put(i, result[i]); } return output; } /** * Calculate the size of "right language" for each state in an FSA. The right language is the * number of sequences encoded from a given node in the automaton. * * @param fsa The automaton to calculate right language for. * @return Returns a map with node identifiers as keys and their right language counts as * associated values. */ public static IntIntHashMap rightLanguageForAllStates(final FSA fsa) { final IntIntHashMap numbers = new IntIntHashMap(); fsa.visitInPostOrder( new StateVisitor() { public boolean accept(int state) { int thisNodeNumber = 0; for (int arc = fsa.getFirstArc(state); arc != 0; arc = fsa.getNextArc(arc)) { thisNodeNumber += (fsa.isArcFinal(arc) ? 1 : 0) + (fsa.isArcTerminal(arc) ? 
0 : numbers.get(fsa.getEndNode(arc))); } numbers.put(state, thisNodeNumber); return true; } }); return numbers; } } ================================================ FILE: morfologik-fsa-builders/src/test/java/morfologik/fsa/builders/CFSA2SerializerTest.java ================================================ package morfologik.fsa.builders; /** */ public class CFSA2SerializerTest extends SerializerTestBase { protected CFSA2Serializer createSerializer() { return new CFSA2Serializer(); } } ================================================ FILE: morfologik-fsa-builders/src/test/java/morfologik/fsa/builders/FSA5SerializerTest.java ================================================ package morfologik.fsa.builders; /** */ public class FSA5SerializerTest extends SerializerTestBase { protected FSA5Serializer createSerializer() { return new FSA5Serializer(); } } ================================================ FILE: morfologik-fsa-builders/src/test/java/morfologik/fsa/builders/FSA5Test.java ================================================ package morfologik.fsa.builders; import static morfologik.fsa.FSAFlags.*; import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import morfologik.fsa.FSA; import morfologik.fsa.FSA5; import morfologik.fsa.FSAFlags; import org.junit.jupiter.api.Test; /** Additional tests for {@link FSA5}. 
*/ public final class FSA5Test extends TestBase { public List expected = Arrays.asList("a", "aba", "ac", "b", "ba", "c"); @Test public void testVersion5() throws IOException { final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa")); assertFalse(fsa.getFlags().contains(FSAFlags.NUMBERS)); verifyContent(expected, fsa); } @Test public void testVersion5WithNumbers() throws IOException { final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa")); verifyContent(expected, fsa); assertTrue(fsa.getFlags().contains(FSAFlags.NUMBERS)); } @Test public void testArcsAndNodes() throws IOException { final FSA fsa1 = FSA.read(this.getClass().getResourceAsStream("abc.fsa")); final FSA fsa2 = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa")); FSAInfo info1 = new FSAInfo(fsa1); FSAInfo info2 = new FSAInfo(fsa2); assertEquals(info1.arcsCount, info2.arcsCount); assertEquals(info1.nodeCount, info2.nodeCount); assertEquals(4, info2.nodeCount); assertEquals(7, info2.arcsCount); } @Test public void testNumbers() throws IOException { final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc-numbers.fsa")); assertTrue(fsa.getFlags().contains(NEXTBIT)); // Get all numbers for nodes. 
byte[] buffer = new byte[128]; final ArrayList result = new ArrayList(); walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result); Collections.sort(result); assertEquals(Arrays.asList("0 c", "1 b", "2 ba", "3 a", "4 ac", "5 aba"), result); } public static void walkNode( byte[] buffer, int depth, FSA fsa, int node, int cnt, List result) throws IOException { for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { buffer[depth] = fsa.getArcLabel(arc); if (fsa.isArcFinal(arc) || fsa.isArcTerminal(arc)) { result.add(cnt + " " + new String(buffer, 0, depth + 1, "UTF-8")); } if (fsa.isArcFinal(arc)) { cnt++; } if (!fsa.isArcTerminal(arc)) { walkNode(buffer, depth + 1, fsa, fsa.getEndNode(arc), cnt, result); cnt += fsa.getRightLanguageCount(fsa.getEndNode(arc)); } } } private static void verifyContent(List expected, FSA fsa) throws IOException { final ArrayList actual = new ArrayList(); int count = 0; for (ByteBuffer bb : fsa.getSequences()) { assertEquals(0, bb.arrayOffset()); assertEquals(0, bb.position()); actual.add(new String(bb.array(), 0, bb.remaining(), "UTF-8")); count++; } assertEquals(expected.size(), count); Collections.sort(actual); assertEquals(expected, actual); } } ================================================ FILE: morfologik-fsa-builders/src/test/java/morfologik/fsa/builders/FSABuilderTest.java ================================================ package morfologik.fsa.builders; import static morfologik.fsa.builders.FSATestUtils.*; import static org.junit.jupiter.api.Assertions.assertEquals; import java.io.IOException; import java.util.Arrays; import java.util.Random; import morfologik.fsa.FSA; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; public class FSABuilderTest extends TestBase { private static byte[][] input; private static byte[][] input2; @BeforeAll public static void prepareByteInput(Random rnd) { input = generateRandom(rnd, 25000, new MinMax(1, 20), new MinMax(0, 255)); input2 = generateRandom(rnd, 
40, new MinMax(1, 20), new MinMax(0, 3)); } @Test public void testEmptyInput() { byte[][] input = {}; checkCorrect(input, FSABuilder.build(input)); } @Test public void testHashResizeBug() throws Exception { byte[][] input = { {0, 1}, {0, 2}, {1, 1}, {2, 1}, }; FSA fsa = FSABuilder.build(input); checkCorrect(input, FSABuilder.build(input)); checkMinimal(fsa); } @Test public void testSmallInput() throws Exception { byte[][] input = { "abc".getBytes("UTF-8"), "bbc".getBytes("UTF-8"), "d".getBytes("UTF-8"), }; checkCorrect(input, FSABuilder.build(input)); } @Test public void testLexicographicOrder() throws IOException { byte[][] input = { {0}, {1}, {(byte) 0xff}, }; Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); // Check if lexical ordering is consistent with absolute byte value. assertEquals(0, input[0][0]); assertEquals(1, input[1][0]); assertEquals((byte) 0xff, input[2][0]); final FSA fsa; checkCorrect(input, fsa = FSABuilder.build(input)); int arc = fsa.getFirstArc(fsa.getRootNode()); assertEquals(0, fsa.getArcLabel(arc)); arc = fsa.getNextArc(arc); assertEquals(1, fsa.getArcLabel(arc)); arc = fsa.getNextArc(arc); assertEquals((byte) 0xff, fsa.getArcLabel(arc)); } @Test public void testRandom25000_largerAlphabet() { FSA fsa = FSABuilder.build(input); checkCorrect(input, fsa); checkMinimal(fsa); } @Test public void testRandom25000_smallAlphabet() throws IOException { FSA fsa = FSABuilder.build(input2); checkCorrect(input2, fsa); checkMinimal(fsa); } } ================================================ FILE: morfologik-fsa-builders/src/test/java/morfologik/fsa/builders/FSATestUtils.java ================================================ package morfologik.fsa.builders; import static org.junit.jupiter.api.Assertions.*; import java.nio.ByteBuffer; import java.util.*; import morfologik.fsa.FSA; import morfologik.fsa.StateVisitor; public class FSATestUtils { /* * Generate a sorted list of random sequences. 
*/ public static byte[][] generateRandom(Random rnd, int count, MinMax length, MinMax alphabet) { final byte[][] input = new byte[count][]; for (int i = 0; i < count; i++) { input[i] = randomByteSequence(rnd, length, alphabet); } Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); return input; } /** Generate a random string. */ private static byte[] randomByteSequence(Random rnd, MinMax length, MinMax alphabet) { byte[] bytes = new byte[length.min + rnd.nextInt(length.range())]; for (int i = 0; i < bytes.length; i++) { bytes[i] = (byte) (alphabet.min + rnd.nextInt(alphabet.range())); } return bytes; } /* * Check if the DFSA is correct with respect to the given input. */ public static void checkCorrect(byte[][] input, FSA fsa) { // (1) All input sequences are in the right language. HashSet rl = new HashSet(); for (ByteBuffer bb : fsa) { rl.add(ByteBuffer.wrap(Arrays.copyOf(bb.array(), bb.remaining()))); } HashSet uniqueInput = new HashSet(); for (byte[] sequence : input) { uniqueInput.add(ByteBuffer.wrap(sequence)); } for (ByteBuffer sequence : uniqueInput) { if (!rl.remove(sequence)) { fail("Not present in the right language: " + SerializerTestBase.toString(sequence)); } } // (2) No other sequence _other_ than the input is in the right language. assertEquals(0, rl.size()); } /* * Check if the DFSA reachable from a given state is minimal. This means no * two states have the same right language. 
*/ public static void checkMinimal(final FSA fsa) { final HashMap stateLanguages = new HashMap(); fsa.visitInPostOrder( new StateVisitor() { private StringBuilder b = new StringBuilder(); public boolean accept(int state) { List rightLanguage = allSequences(fsa, state); Collections.sort(rightLanguage, FSABuilder.LEXICAL_ORDERING); b.setLength(0); for (byte[] seq : rightLanguage) { b.append(Arrays.toString(seq)); b.append(','); } String full = b.toString(); assertFalse( stateLanguages.containsKey(full), "State exists: " + state + " " + full + " " + stateLanguages.get(full)); stateLanguages.put(full, state); return true; } }); } static List allSequences(FSA fsa, int state) { ArrayList seq = new ArrayList(); for (ByteBuffer bb : fsa.getSequences(state)) { seq.add(Arrays.copyOf(bb.array(), bb.remaining())); } return seq; } /* * Check if two FSAs are identical. */ public static void checkIdentical(FSA fsa1, FSA fsa2) { ArrayDeque fromRoot = new ArrayDeque(); checkIdentical( fromRoot, fsa1, fsa1.getRootNode(), new BitSet(), fsa2, fsa2.getRootNode(), new BitSet()); } /* * */ static void checkIdentical( ArrayDeque fromRoot, FSA fsa1, int node1, BitSet visited1, FSA fsa2, int node2, BitSet visited2) { int arc1 = fsa1.getFirstArc(node1); int arc2 = fsa2.getFirstArc(node2); if (visited1.get(node1) != visited2.get(node2)) { throw new RuntimeException( "Two nodes should either be visited or not visited: " + Arrays.toString(fromRoot.toArray()) + " " + " node1: " + node1 + " " + " node2: " + node2); } visited1.set(node1); visited2.set(node2); TreeSet labels1 = new TreeSet(); TreeSet labels2 = new TreeSet(); while (true) { labels1.add((char) fsa1.getArcLabel(arc1)); labels2.add((char) fsa2.getArcLabel(arc2)); arc1 = fsa1.getNextArc(arc1); arc2 = fsa2.getNextArc(arc2); if (arc1 == 0 || arc2 == 0) { if (arc1 != arc2) { throw new RuntimeException( "Different number of labels at path: " + Arrays.toString(fromRoot.toArray())); } break; } } if (!labels1.equals(labels2)) { throw new 
RuntimeException( "Different sets of labels at path: " + Arrays.toString(fromRoot.toArray()) + ":\n" + labels1 + "\n" + labels2); } // recurse. for (char chr : labels1) { byte label = (byte) chr; fromRoot.push( Character.isLetterOrDigit(chr) ? Character.toString(chr) : Integer.toString(chr)); arc1 = fsa1.getArc(node1, label); arc2 = fsa2.getArc(node2, label); if (fsa1.isArcFinal(arc1) != fsa2.isArcFinal(arc2)) { throw new RuntimeException( "Different final flag on arcs at: " + Arrays.toString(fromRoot.toArray()) + ", label: " + label); } if (fsa1.isArcTerminal(arc1) != fsa2.isArcTerminal(arc2)) { throw new RuntimeException( "Different terminal flag on arcs at: " + Arrays.toString(fromRoot.toArray()) + ", label: " + label); } if (!fsa1.isArcTerminal(arc1)) { checkIdentical( fromRoot, fsa1, fsa1.getEndNode(arc1), visited1, fsa2, fsa2.getEndNode(arc2), visited2); } fromRoot.pop(); } } } ================================================ FILE: morfologik-fsa-builders/src/test/java/morfologik/fsa/builders/FSATraversalTest.java ================================================ package morfologik.fsa.builders; import static java.nio.charset.StandardCharsets.*; import static morfologik.fsa.MatchResult.*; import static org.junit.jupiter.api.Assertions.*; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.util.Arrays; import java.util.HashSet; import morfologik.fsa.FSA; import morfologik.fsa.FSA5; import morfologik.fsa.FSATraversal; import morfologik.fsa.MatchResult; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; /** Tests {@link FSATraversal}. 
*/ public final class FSATraversalTest extends TestBase { private FSA fsa; @BeforeEach public void setUp() throws Exception { fsa = FSA.read(this.getClass().getResourceAsStream("en_tst.dict")); } @Test public void testAutomatonHasPrefixBug() throws Exception { FSA fsa = FSABuilder.build( Arrays.asList( "a".getBytes(UTF_8), "ab".getBytes(UTF_8), "abc".getBytes(UTF_8), "ad".getBytes(UTF_8), "bcd".getBytes(UTF_8), "bce".getBytes(UTF_8))); FSATraversal fsaTraversal = new FSATraversal(fsa); assertEquals(EXACT_MATCH, fsaTraversal.match("a".getBytes(UTF_8)).kind); assertEquals(EXACT_MATCH, fsaTraversal.match("ab".getBytes(UTF_8)).kind); assertEquals(EXACT_MATCH, fsaTraversal.match("abc".getBytes(UTF_8)).kind); assertEquals(EXACT_MATCH, fsaTraversal.match("ad".getBytes(UTF_8)).kind); assertEquals(SEQUENCE_IS_A_PREFIX, fsaTraversal.match("b".getBytes(UTF_8)).kind); assertEquals(SEQUENCE_IS_A_PREFIX, fsaTraversal.match("bc".getBytes(UTF_8)).kind); MatchResult m; m = fsaTraversal.match("abcd".getBytes(UTF_8)); assertEquals(AUTOMATON_HAS_PREFIX, m.kind); assertEquals(3, m.index); m = fsaTraversal.match("ade".getBytes(UTF_8)); assertEquals(AUTOMATON_HAS_PREFIX, m.kind); assertEquals(2, m.index); m = fsaTraversal.match("ax".getBytes(UTF_8)); assertEquals(AUTOMATON_HAS_PREFIX, m.kind); assertEquals(1, m.index); assertEquals(NO_MATCH, fsaTraversal.match("d".getBytes(UTF_8)).kind); } @Test public void testTraversalWithIterable() { int count = 0; for (ByteBuffer bb : fsa.getSequences()) { assertEquals(0, bb.arrayOffset()); assertEquals(0, bb.position()); count++; } assertEquals(346773, count); } @Test public void testPerfectHash() throws IOException { byte[][] input = new byte[][] { {'a'}, {'a', 'b', 'a'}, {'a', 'c'}, {'b'}, {'b', 'a'}, {'c'}, }; Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); FSA s = FSABuilder.build(input); final byte[] fsaData = new FSA5Serializer().withNumbers().serialize(s, new ByteArrayOutputStream()).toByteArray(); final FSA5 fsa = FSA.read(new 
ByteArrayInputStream(fsaData), FSA5.class); final FSATraversal traversal = new FSATraversal(fsa); int i = 0; for (byte[] seq : input) { Assertions.assertEquals(i++, traversal.perfectHash(seq)); } // Check if the total number of sequences is encoded at the root node. assertEquals(6, fsa.getRightLanguageCount(fsa.getRootNode())); // Check sub/super sequence scenarios. assertEquals(AUTOMATON_HAS_PREFIX, traversal.perfectHash("abax".getBytes(UTF_8))); assertEquals(AUTOMATON_HAS_PREFIX, traversal.perfectHash("abx".getBytes(UTF_8))); assertEquals(SEQUENCE_IS_A_PREFIX, traversal.perfectHash("ab".getBytes(UTF_8))); assertEquals(NO_MATCH, traversal.perfectHash("d".getBytes(UTF_8))); assertEquals(NO_MATCH, traversal.perfectHash(new byte[] {0})); assertTrue(AUTOMATON_HAS_PREFIX < 0); assertTrue(SEQUENCE_IS_A_PREFIX < 0); assertTrue(NO_MATCH < 0); } /** */ @Test public void testRecursiveTraversal() { final int[] counter = new int[] {0}; class Recursion { public void dumpNode(final int node) { int arc = fsa.getFirstArc(node); do { if (fsa.isArcFinal(arc)) { counter[0]++; } if (!fsa.isArcTerminal(arc)) { dumpNode(fsa.getEndNode(arc)); } arc = fsa.getNextArc(arc); } while (arc != 0); } } new Recursion().dumpNode(fsa.getRootNode()); assertEquals(346773, counter[0]); } @Test public void testMatch() throws IOException { final FSA fsa = FSA.read(this.getClass().getResourceAsStream("abc.fsa")); final FSATraversal traversalHelper = new FSATraversal(fsa); MatchResult m = traversalHelper.match("ax".getBytes()); assertEquals(AUTOMATON_HAS_PREFIX, m.kind); assertEquals(1, m.index); assertEquals(new HashSet(Arrays.asList("ba", "c")), suffixes(fsa, m.node)); assertEquals(EXACT_MATCH, traversalHelper.match("aba".getBytes()).kind); m = traversalHelper.match("abalonger".getBytes()); assertEquals(AUTOMATON_HAS_PREFIX, m.kind); assertEquals("longer", "abalonger".substring(m.index)); m = traversalHelper.match("ab".getBytes()); assertEquals(SEQUENCE_IS_A_PREFIX, m.kind); assertEquals(new 
HashSet(Arrays.asList("a")), suffixes(fsa, m.node)); } /** Return all sequences reachable from a given node, as strings. */ private HashSet suffixes(FSA fsa, int node) { HashSet result = new HashSet(); for (ByteBuffer bb : fsa.getSequences(node)) { result.add(new String(bb.array(), bb.position(), bb.remaining(), UTF_8)); } return result; } } ================================================ FILE: morfologik-fsa-builders/src/test/java/morfologik/fsa/builders/MinMax.java ================================================ package morfologik.fsa.builders; /** Minimum/maximum and range. */ final class MinMax { public final int min; public final int max; MinMax(int min, int max) { this.min = Math.min(min, max); this.max = Math.max(min, max); } public int range() { return max - min; } } ================================================ FILE: morfologik-fsa-builders/src/test/java/morfologik/fsa/builders/SerializerTestBase.java ================================================ package morfologik.fsa.builders; import static morfologik.fsa.FSAFlags.*; import static org.junit.jupiter.api.Assertions.*; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import morfologik.fsa.FSA; import morfologik.fsa.FSAFlags; import org.junit.jupiter.api.Assumptions; import org.junit.jupiter.api.Test; public abstract class SerializerTestBase extends TestBase { @Test public void testA() throws IOException { byte[][] input = new byte[][] { {'a'}, }; Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); FSA s = FSABuilder.build(input); checkSerialization(input, s); } @Test public void testArcsSharing() throws IOException { byte[][] input = new byte[][] { {'a', 'c', 'f'}, {'a', 'd', 'g'}, {'a', 'e', 'h'}, {'b', 'd', 'g'}, {'b', 'e', 'h'}, }; Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); FSA s = FSABuilder.build(input); 
checkSerialization(input, s); } @Test public void testFSA5SerializerSimple() throws IOException { byte[][] input = new byte[][] { {'a'}, {'a', 'b', 'a'}, {'a', 'c'}, {'b'}, {'b', 'a'}, {'c'}, }; Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); FSA s = FSABuilder.build(input); checkSerialization(input, s); } @Test public void testNotMinimal() throws IOException { byte[][] input = new byte[][] { {'a', 'b', 'a'}, {'b'}, {'b', 'a'} }; Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); FSA s = FSABuilder.build(input); checkSerialization(input, s); } @Test public void testFSA5Bug0() throws IOException { checkCorrect( new String[] { "3-D+A+JJ", "3-D+A+NN", "4-F+A+NN", "z+A+NN", }); } @Test public void testFSA5Bug1() throws IOException { checkCorrect( new String[] { "+NP", "n+N", "n+NP", }); } private void checkCorrect(String[] strings) throws IOException { byte[][] input = new byte[strings.length][]; for (int i = 0; i < strings.length; i++) { input[i] = strings[i].getBytes("ISO8859-1"); } Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); FSA s = FSABuilder.build(input); checkSerialization(input, s); } @Test public void testEmptyInput() throws IOException { byte[][] input = new byte[][] {}; FSA s = FSABuilder.build(input); checkSerialization(input, s); } @Test public void test_abc() throws IOException { testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("abc.fsa"))); } @Test public void test_minimal() throws IOException { testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal.fsa"))); } @Test public void test_minimal2() throws IOException { testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("minimal2.fsa"))); } @Test public void test_en_tst() throws IOException { testBuiltIn(FSA.read(FSA5Test.class.getResourceAsStream("en_tst.dict"))); } private void testBuiltIn(FSA fsa) throws IOException { final ArrayList sequences = new ArrayList(); sequences.clear(); for (ByteBuffer bb : fsa) { sequences.add(Arrays.copyOf(bb.array(), bb.remaining())); } 
Collections.sort(sequences, FSABuilder.LEXICAL_ORDERING); final byte[][] in = sequences.toArray(new byte[sequences.size()][]); FSA root = FSABuilder.build(in); // Check if the DFSA is correct first. FSATestUtils.checkCorrect(in, root); // Check serialization. checkSerialization(in, root); } private void checkSerialization(byte[][] input, FSA root) throws IOException { checkSerialization0(createSerializer(), input, root); if (createSerializer().getFlags().contains(FSAFlags.NUMBERS)) { checkSerialization0(createSerializer().withNumbers(), input, root); } } private void checkSerialization0(FSASerializer serializer, final byte[][] in, FSA root) throws IOException { final byte[] fsaData = serializer.serialize(root, new ByteArrayOutputStream()).toByteArray(); FSA fsa = FSA.read(new ByteArrayInputStream(fsaData)); checkCorrect(in, fsa); } /* * Check if the FSA is correct with respect to the given input. */ protected void checkCorrect(byte[][] input, FSA fsa) { // (1) All input sequences are in the right language. HashSet rl = new HashSet(); for (ByteBuffer bb : fsa) { byte[] array = bb.array(); int length = bb.remaining(); rl.add(ByteBuffer.wrap(Arrays.copyOf(array, length))); } HashSet uniqueInput = new HashSet(); for (byte[] sequence : input) { uniqueInput.add(ByteBuffer.wrap(sequence)); } for (ByteBuffer sequence : uniqueInput) { if (!rl.remove(sequence)) { fail("Not present in the right language: " + toString(sequence)); } } // (2) No other sequence _other_ than the input is in the right // language. 
assertEquals(0, rl.size()); } @Test public void testAutomatonWithNodeNumbers() throws IOException { Assumptions.assumeTrue(createSerializer().getFlags().contains(FSAFlags.NUMBERS)); byte[][] input = new byte[][] { {'a'}, {'a', 'b', 'a'}, {'a', 'c'}, {'b'}, {'b', 'a'}, {'c'}, }; Arrays.sort(input, FSABuilder.LEXICAL_ORDERING); FSA s = FSABuilder.build(input); final byte[] fsaData = createSerializer().withNumbers().serialize(s, new ByteArrayOutputStream()).toByteArray(); FSA fsa = FSA.read(new ByteArrayInputStream(fsaData)); // Ensure we have the NUMBERS flag set. assertTrue(fsa.getFlags().contains(NUMBERS)); // Get all numbers from nodes. byte[] buffer = new byte[128]; final ArrayList result = new ArrayList(); FSA5Test.walkNode(buffer, 0, fsa, fsa.getRootNode(), 0, result); Collections.sort(result); assertEquals(Arrays.asList("0 a", "1 aba", "2 ac", "3 b", "4 ba", "5 c"), result); } protected abstract FSASerializer createSerializer(); /* * Drain bytes from a byte buffer to a string. */ public static String toString(ByteBuffer sequence) { byte[] bytes = new byte[sequence.remaining()]; sequence.get(bytes); return Arrays.toString(bytes); } } ================================================ FILE: morfologik-fsa-builders/src/test/java/morfologik/fsa/builders/TestBase.java ================================================ package morfologik.fsa.builders; import com.carrotsearch.randomizedtesting.jupiter.DetectThreadLeaks; import com.carrotsearch.randomizedtesting.jupiter.Randomized; import java.util.function.Predicate; @Randomized @DetectThreadLeaks(scope = DetectThreadLeaks.Scope.SUITE) @DetectThreadLeaks.LingerTime(millis = 5 * 1000) @DetectThreadLeaks.ExcludeThreads(TestBase.CustomThreadFilter.class) public abstract class TestBase { /** Any custom thread filters we should ignore. */ public static class CustomThreadFilter implements Predicate { @Override public boolean test(Thread t) { // IBM J9 bogus threads. 
String threadName = t.getName(); if ("Attach API wait loop".equals(threadName) || "file lock watchdog".equals(threadName) || "ClassCache Reaper".equals(threadName)) { return true; } return false; } } } ================================================ FILE: morfologik-fsa-builders/src/test/resources/morfologik/fsa/builders/abc.in ================================================ a aba ac b ba c ================================================ FILE: morfologik-fsa-builders/src/test/resources/morfologik/fsa/builders/minimal.in ================================================ +NP n+N n+NP ================================================ FILE: morfologik-fsa-builders/src/test/resources/morfologik/fsa/builders/minimal2.in ================================================ 3-D+A+JJ 3-D+A+NN 4-F+A+NN 4-H+A+JJ z+A+NN z-axis+A+NN zB+A+NN zZt+A+NNP za-zen+A+NN zabaglione+A+NN zabagliones+B+NNS zabajone+A+NN zabajones+B+NNS zabaione+A+NN zabaiones+B+NNS zabra+A+NN zabras+B+NNS zack+A+NN zacaton+A+NN zacatons+B+NNS zacatun+A+NN zaddik+A+NN zaddiks+B+NNS zaffar+A+NN ================================================ FILE: morfologik-polish/pom.xml ================================================ 4.0.0 org.carrot2 morfologik-parent 2.2.0-SNAPSHOT ../pom.xml morfologik-polish bundle Morfologik Stemming (Polish Dictionary) Morfologik Stemming (Polish Dictionary) ../etc/forbidden-apis/signatures.txt org.carrot2.morfologik.polish org.carrot2 morfologik-stemming ${project.version} org.apache.felix maven-bundle-plugin morfologik.stemming.polish * ================================================ FILE: morfologik-polish/src/main/java/morfologik/stemming/polish/PolishStemmer.java ================================================ package morfologik.stemming.polish; import java.io.IOException; import java.net.URL; import java.security.AccessController; import java.security.PrivilegedActionException; import java.security.PrivilegedExceptionAction; import java.util.Iterator; import java.util.List; 
import morfologik.stemming.Dictionary; import morfologik.stemming.DictionaryLookup; import morfologik.stemming.IStemmer; import morfologik.stemming.WordData; /** * A dictionary-based stemmer for the Polish language. Instances of this class are not thread safe. * * @see morfologik.stemming.DictionaryLookup */ public final class PolishStemmer implements IStemmer, Iterable { /** The underlying dictionary, loaded once (lazily). */ private static Dictionary dictionary; /** Dictionary lookup delegate. */ private final DictionaryLookup lookup; public PolishStemmer() { synchronized (getClass()) { if (dictionary == null) { try { dictionary = AccessController.doPrivileged( new PrivilegedExceptionAction() { @Override public Dictionary run() throws Exception { URL dictResource = getClass().getResource("polish.dict"); if (dictResource == null) { throw new IOException("Polish dictionary resource not found."); } return Dictionary.read(dictResource); } }); } catch (PrivilegedActionException e) { throw new RuntimeException("Could not read dictionary data.", e.getException()); } } } lookup = new DictionaryLookup(dictionary); } /** * @return Return the underlying {@link Dictionary} driving the stemmer. */ public Dictionary getDictionary() { return dictionary; } /** {@inheritDoc} */ public List lookup(CharSequence word) { return lookup.lookup(word); } /** Iterates over all dictionary forms stored in this stemmer. */ public Iterator iterator() { return lookup.iterator(); } } ================================================ FILE: morfologik-polish/src/main/resources/morfologik/stemming/polish/polish.LICENSE.Polish.txt ================================================ Morfologik VERSION: 2.1 PoliMorf BUILD: 2016-02-13 19:37:51+01:00 GIT: 6e63b53 Copyright (c) 2016, Marcin Miłkowski Wszelkie prawa zastrzeżone Redystrybucja i używanie, czy to w formie kodu źródłowego, czy w formie kodu wykonawczego, są dozwolone pod warunkiem spełnienia poniższych warunków: 1. 
Redystrybucja kodu źródłowego musi zawierać powyższą notę copyrightową, niniejszą listę warunków oraz poniższe oświadczenie o wyłączeniu odpowiedzialności. 2. Redystrybucja kodu wykonawczego musi zawierać powyższą notę copyrightową, niniejszą listę warunków oraz poniższe oświadczenie o wyłączeniu odpowiedzialności w dokumentacji i/lub w innych materiałach dostarczanych wraz z kopią oprogramowania. TO OPROGRAMOWANIE JEST DOSTARCZONE PRZEZ „TAKIM, JAKIE JEST”. KAŻDA, DOROZUMIANA LUB BEZPOŚREDNIO WYRAŻONA GWARANCJA, NIE WYŁĄCZAJĄC DOROZUMIANEJ GWARANCJI PRZYDATNOŚCI HANDLOWEJ I PRZYDATNOŚCI DO OKREŚLONEGO ZASTOSOWANIA, JEST WYŁĄCZONA. W ŻADNYM WYPADKU NIE MOGĄ BYĆ ODPOWIEDZIALNI ZA JAKIEKOLWIEK BEZPOŚREDNIE, POŚREDNIE, INCYDENTALNE, SPECJALNE, UBOCZNE I WTÓRNE SZKODY (NIE WYŁĄCZAJĄC OBOWIĄZKU DOSTARCZENIA PRODUKTU ZASTĘPCZEGO LUB SERWISU, ODPOWIEDZIALNOŚCI Z TYTUŁU UTRATY WALORÓW UŻYTKOWYCH, UTRATY DANYCH LUB KORZYŚCI, A TAKŻE PRZERW W PRACY PRZEDSIĘBIORSTWA) SPOWODOWANE W JAKIKOLWIEK SPOSÓB I NA PODSTAWIE ISTNIEJĄCEJ W TEORII ODPOWIEDZIALNOŚCI KONTRAKTOWEJ, CAŁKOWITEJ LUB DELIKTOWEJ (WYNIKŁEJ ZARÓWNO Z NIEDBALSTWA JAK INNYCH POSTACI WINY), POWSTAŁE W JAKIKOLWIEK SPOSÓB W WYNIKU UŻYWANIA LUB MAJĄCE ZWIĄZEK Z UŻYWANIEM OPROGRAMOWANIA, NAWET JEŚLI O MOŻLIWOŚCI POWSTANIA TAKICH SZKÓD OSTRZEŻONO. ================================================ FILE: morfologik-polish/src/main/resources/morfologik/stemming/polish/polish.LICENSE.txt ================================================ Morfologik VERSION: 2.1 PoliMorf BUILD: 2016-02-13 19:37:50+01:00 GIT: 6e63b53 Copyright (c) 2016, Marcin Miłkowski All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. 
Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: morfologik-polish/src/main/resources/morfologik/stemming/polish/polish.README.Polish.txt ================================================ Morfologik to projekt tworzenia polskich słowników morfosyntaktycznych (stąd nazwa) służących do znakowania morfosyntaktycznego i syntezy gramatycznej. WERSJA: 2.1 PoliMorf UTWORZONA: 2016-02-15 17:46:00+01:00 GIT: d3b2fe7 ŹRÓDŁO ====== Dane pochodzą ze słownika sjp.pl oraz słownika PoliMorf i są licencjonowane na licencji zawartej w pliku LICENSE.Polish.txt. Dane źródłowe pochodzą z polskiego słownika ispell, następnie redagowanego na stronach kurnik.pl/slownik i sjp.pl, a także Słownika gramatycznego języka polskiego. Autorzy: (1) ispell: Mirosław Prywata, Piotr Gackiewicz, Włodzimierz Macewicz, Łukasz Szałkiewicz, Marek Futrega. (2) SGJP: Zygmunt Saloni, Włodzimierz Gruszczyński, Marcin Woliński, Robert Wołosz. Wersja PoliMorf została opracowana w ramach projektu CESAR realizowanego w Zespole Inżynierii Lingwistycznej IPI PAN. 
W przygotowaniu ostatecznej wersji 2.0 dopomogli Jan Szejko i Adam Radziszewski. PLIKI ===== 1. polish.dict oraz polish.info to pliki słownika morfologicznego dla programu morfologik-stemming (zob. [3]), wykorzystywanego również przez projekt LanguageTool (zob. [2]). 2. polish_synth.dict oraz polish_synth.info to pliki słownika syntezy gramatycznej dla LanguageTool (zob. [2]). Aby uzyskać formę odmienioną, należy używać następującej składni "zapytania" do słownika: | Przykład: niemiecki|adjp daje "niemiecku". 3. fsa_morph/polish.dict i fsa_morph/polish_synth.dict to pliki słowników jak powyżej, ale przeznaczone dla programu fsa_morph z pakietu fsa Janka Daciuka (zob. [1]). Słowniki te zawierają te same dane, co słowniki powyżej, różnią się jednak metodą kompresji oraz: - mają separator w automacie ustawiony na sztywno na '+', - mają znaczniki morfosyntaktyczne rozdzielone znakiem '|', - mają kodowanie "prefiksowe", które wymaga podania flagi "-P" do fsa_morph, - znaki diakrytyczne są kodowane w UTF-8 (ma znaczenie, jeśli terminal ma ustawione inne). Przykład: $ echo "krowami" | ./fsa_morph -P -d polish.dict krowami: krowa+subst:pl:inst:f $ echo "zamek" | ./fsa_morph -P -d polish.dict zamek: zamek+subst:sg:acc:m3|subst:sg:nom:m3 Synteza: $ echo "niemiecki|adjp" | ./fsa_morph -P -d polish_synth.dict niemiecki|adjp: niemiecku 4. polimorfologik-2.1 PoliMorf.txt to zwykły plik tekstowy w kodowaniu UTF-8 o formacie: forma podstawowa;forma odmieniona;znaczniki gramatyczne [1] http://www.eti.pg.gda.pl/katedry/kiw/pracownicy/Jan.Daciuk/personal/fsa.html [2] https://languagetool.org/ [3] https://github.com/morfologik/morfologik-stemming ZNACZNIKI MORFOSYNTAKTYCZNE =========================== Zestaw znaczników jest zbliżony do zestawu korpusu NKJP (www.nkjp.pl). * adj - przymiotnik (np. „niemiecki”) * adja - przymiotnik przyprzymiotnikowy (np. „niemiecko”, w wyrażeniach typu „niemiecko-chiński”) * adjc - przymiotnik predykatywny (np. 
„ciekaw”, „dłużen”) * adjp - przymiotnik poprzyimkowy (np. „niemiecku”) * adv - przysłówek (np. „głupio”) * burk - burkinostka (np. „Burkina Faso”) * depr - forma deprecjatywna * ger - rzeczownik odsłowny * conj - spójnik łączący zdania współrzędne * comp - spójnik wprowadzający zdanie podrzędne * num - liczebnik * pact - imiesłów przymiotnikowy czynny * pant - imiesłów przysłówkowy uprzedni * pcon - imiesłów przysłówkowy współczesny * ppas - imiesłów przymiotnikowy bierny * ppron12 - zaimek nietrzecioosobowy * ppron3 - zaimek trzecioosobowy * pred - predykatyw (np. „trzeba”) * prep - przyimek * siebie - zaimek "siebie" * subst - rzeczownik * verb - czasownik * brev - skrót * interj - wykrzyknienie * qub - kublik (np. „nie” lub „tak”) Atrybuty podstawowych form: * sg / pl - liczba pojedyncza / liczba mnoga * nom - mianownik * gen - dopełniacz * acc - biernik * dat - celownik * inst - narzędnik * loc - miejscownik * voc - wołacz * pos - stopień równy * com - stopień wyższy * sup - stopień najwyższy * m1, m2, m3 - rodzaje męskie * n1, n2 - rodzaje nijakie * p1, p2, p3 - rodzaje rzeczowników mających tylko liczbę mnogą (pluralium tantum) * f - rodzaj żeński * pri - pierwsza osoba * sec - druga osoba * ter - trzecia osoba * aff - forma niezanegowana * neg - forma zanegowana * refl - forma zwrotna czasownika * nonrefl - forma niezwrotna czasownika * refl.nonrefl - forma może być zwrotna lub niezwrotna * perf - czasownik dokonany * imperf - czasownik niedokonany * imperf.perf - czasownik, który może występować zarówno jako dokonany, jak i jako niedokonany * nakc - forma nieakcentowana zaimka (ppron lub siebie) * akc - forma akcentowana zaimka * praep - forma poprzyimkowa * npraep - forma niepoprzyimkowa * ger - rzeczownik odsłowny * imps - forma bezosobowa * impt - tryb rozkazujący * inf - bezokolicznik * fin - forma nieprzeszła * bedzie - forma przyszła "być" * praet - forma przeszła czasownika (pseudoimiesłów) * pot - tryb przypuszczający [nie występuje w znacznikach 
NKJP] * pun - skrót z kropką [za NKJP] * npun - bez kropki [za NKJP] * wok / nwok: forma wokaliczna / niewokaliczna Uwaga: formy trybu przypuszczającego są jednolicie oznaczone tylko znacznikiem pot, bez znacznika praet. W znacznikach Morfologika nie występuje i nie będzie występować znacznik aglt, a to ze względu na inną zasadę segmentacji wyrazów. ================================================ FILE: morfologik-polish/src/main/resources/morfologik/stemming/polish/polish.README.txt ================================================ Morfologik is a project aiming at generating Polish morphosyntactic dictionaries (hence the name) used for part-of-speech tagging and part-of-speech synthesis. See LICENSE.txt for license restrictions. See README.Polish.txt for more information concerning authorship and dictionary data format. VERSION: 2.1 PoliMorf BUILD: 2016-02-13 19:37:50+01:00 GIT: 6e63b53 ================================================ FILE: morfologik-polish/src/main/resources/morfologik/stemming/polish/polish.info ================================================ # # Morfologik Polish (stemming dictionary) # Version: 2.1 PoliMorf # Date: 2016-02-13 19:32:15+01:00 # Git: 6e63b53 # # Copyright (c) 2016, Marcin Miłkowski # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, this # list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. 
# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR # ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # fsa.dict.author=morfologik.blogspot.com fsa.dict.created=2016-02-13 19:32:15+01:00 fsa.dict.license=BSD. http://morfologik.blogspot.com fsa.dict.separator=; fsa.dict.encoding=UTF-8 fsa.dict.encoder=PREFIX ================================================ FILE: morfologik-polish/src/test/java/morfologik/stemming/polish/Gh27Test.java ================================================ package morfologik.stemming.polish; import java.io.IOException; import java.util.Locale; import morfologik.stemming.WordData; import org.junit.jupiter.api.Test; /* * */ public class Gh27Test { /* */ @Test public void gh27() throws IOException { PolishStemmer stemmer = new PolishStemmer(); String in = "Nie zabrakło oczywiście wpadek. Największym zaskoczeniem okazał się dla nas strój" + " Katarzyny Zielińskiej, której ewidentnie o coś chodziło, ale wciąż nie wiemy o co."; for (String t : in.toLowerCase(new Locale("pl")).split("[\\s\\.\\,]+")) { System.out.println("> '" + t + "'"); for (WordData wd : stemmer.lookup(t)) { System.out.print( " - " + (wd.getStem() == null ? 
"" : wd.getStem()) + ", " + wd.getTag()); } System.out.println(); } } } ================================================ FILE: morfologik-polish/src/test/java/morfologik/stemming/polish/PolishMorfologikStemmerTest.java ================================================ package morfologik.stemming.polish; import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import morfologik.stemming.IStemmer; import morfologik.stemming.WordData; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.Test; /* * */ public class PolishMorfologikStemmerTest { /* */ @Test public void testLexemes() { PolishStemmer s = new PolishStemmer(); assertEquals("żywotopisarstwo", stem(s, "żywotopisarstwie")[0]); assertEquals("abradować", stem(s, "abradowałoby")[0]); assertArrayEquals( new String[] {"żywotopisarstwo", "subst:sg:loc:n2"}, stem(s, "żywotopisarstwie")); assertArrayEquals(new String[] {"bazia", "subst:pl:inst:f"}, stem(s, "baziami")); // This word is not in the dictionary. 
assertNoStemFor(s, "martygalski"); } /* */ @Test public void listUniqueTags() { HashSet forms = new HashSet<>(); boolean hadMissing = false; for (WordData wd : new PolishStemmer()) { final CharSequence chs = wd.getTag(); if (chs == null) { System.err.println("Missing tag for: " + wd.getWord()); hadMissing = true; continue; } forms.add(chs.toString()); } Assertions.assertThat(hadMissing).isFalse(); } /* */ @Test public void testWordDataFields() throws IOException { final IStemmer s = new PolishStemmer(); final String word = "liga"; final List response = s.lookup(word); assertEquals(2, response.size()); final HashSet stems = new HashSet(); final HashSet tags = new HashSet(); for (WordData wd : response) { stems.add(wd.getStem().toString()); tags.add(wd.getTag().toString()); assertSame(word, wd.getWord()); } assertTrue(stems.contains("ligać")); assertTrue(stems.contains("liga")); assertTrue(tags.contains("subst:sg:nom:f")); assertTrue(tags.contains("verb:fin:sg:ter:imperf:nonrefl+verb:fin:sg:ter:imperf:refl.nonrefl")); // Repeat to make sure we get the same values consistently. for (WordData wd : response) { stems.contains(wd.getStem().toString()); tags.contains(wd.getTag().toString()); } final String ENCODING = "UTF-8"; // Run the same consistency check for the returned buffers. final ByteBuffer temp = ByteBuffer.allocate(100); for (WordData wd : response) { // Buffer should be copied. final ByteBuffer copy = wd.getStemBytes(null); final String stem = new String( copy.array(), copy.arrayOffset() + copy.position(), copy.remaining(), ENCODING); // The buffer should be present in stems set. Assertions.assertThat(stems.contains(stem)).as(stem).isTrue(); // Buffer large enough to hold the contents. assertSame(temp, wd.getStemBytes(temp)); // The copy and the clone should be identical. assertEquals(0, copy.compareTo(temp)); } for (WordData wd : response) { // Buffer should be copied. 
final ByteBuffer copy = wd.getTagBytes(null); final String tag = new String( copy.array(), copy.arrayOffset() + copy.position(), copy.remaining(), ENCODING); // The buffer should be present in tags set. Assertions.assertThat(tags.contains(tag)).as(tag).isTrue(); // Buffer large enough to hold the contents. temp.clear(); assertSame(temp, wd.getTagBytes(temp)); // The copy and the clone should be identical. assertEquals(0, copy.compareTo(temp)); } for (WordData wd : response) { // Buffer should be copied. final ByteBuffer copy = wd.getWordBytes(null); assertNotNull(copy); assertEquals(0, copy.compareTo(ByteBuffer.wrap(word.getBytes(ENCODING)))); } } /* */ public static String asString(CharSequence s) { if (s == null) return null; return s.toString(); } /* */ public static String[] stem(IStemmer s, String word) { ArrayList result = new ArrayList<>(); for (WordData wd : s.lookup(word)) { result.add(asString(wd.getStem())); result.add(asString(wd.getTag())); } return result.toArray(new String[result.size()]); } /* */ public static void assertNoStemFor(IStemmer s, String word) { assertArrayEquals(new String[] {}, stem(s, word)); } } ================================================ FILE: morfologik-speller/pom.xml ================================================ 4.0.0 org.carrot2 morfologik-parent 2.2.0-SNAPSHOT ../pom.xml morfologik-speller bundle Morfologik Speller Morfologik Speller ../etc/forbidden-apis/signatures.txt org.carrot2.morfologik.speller org.carrot2 morfologik-stemming ${project.version} org.apache.felix maven-bundle-plugin morfologik.speller * ================================================ FILE: morfologik-speller/src/main/java/morfologik/speller/HMatrix.java ================================================ package morfologik.speller; import java.util.Arrays; /** * Keeps track of already computed values of edit distance. Remarks: To save space, the matrix is * kept in a vector. 
/**
 * Keeps track of already computed values of edit distance (Oflazer's H matrix).
 *
 * <p>Remarks: to save space, only a diagonal band of width {@code 2 * editDistance + 3} around the
 * main diagonal is stored, flattened into a single vector. The band is padded with one extra row
 * and column initialized to "distance too big" / boundary values so that no bounds checking is
 * needed on access.
 */
public class HMatrix {
  /** Flattened storage for the diagonal band. */
  private int[] p;
  /** Length of one stored row ({@code maxLength + 2}). */
  private int rowLength;
  /** Number of stored rows ({@code 2 * editDistance + 3}). */
  int columnHeight;
  /** Maximum edit distance allowed for candidates. */
  int editDistance;

  /**
   * Allocates and initializes the matrix.
   *
   * @param distance max edit distance allowed for candidates.
   * @param maxLength max length of words.
   */
  public HMatrix(final int distance, final int maxLength) {
    editDistance = distance;
    rowLength = maxLength + 2;
    columnHeight = 2 * distance + 3;
    p = new int[rowLength * columnHeight];
    init();
  }

  /** Writes the boundary values of the band (see class comment). */
  private void init() {
    final int total = p.length;
    final int tooFar = editDistance + 1; // any cell holding this value is "distance exceeded"

    // Outermost diagonals of the band (j = i - e - 1 and j = i + e + 1):
    // mark as exceeding the allowed distance.
    final int edgeLength = rowLength - editDistance - 1;
    for (int k = 0; k < edgeLength; k++) {
      p[k] = tooFar;
      p[total - 1 - k] = tooFar;
    }

    // Cells with at least one zero index get |i - j|:
    // H(i, 0) = i and H(0, j) = j for i, j in 0 .. e + 1.
    for (int d = 0; d <= tooFar; d++) {
      p[d * rowLength] = tooFar - d;
      p[(d + tooFar) * rowLength + d] = d;
    }
  }

  /** Clears the band to zero and re-applies the boundary initialization. */
  public void reset() {
    Arrays.fill(p, 0);
    init();
  }

  /** Maps logical coordinates (i, j) to the flattened band position. */
  private int bandIndex(final int i, final int j) {
    return (j - i + editDistance + 1) * rowLength + j;
  }

  /**
   * Provides an item of the matrix.
   *
   * @param i row number.
   * @param j column number.
   * @return Item H[i][j]. Only the band of width {@code editDistance + 2} around the diagonal is
   *     really stored; the diagonal has been shifted to the upper border of the storage.
   */
  public int get(final int i, final int j) {
    return p[bandIndex(i, j)];
  }

  /**
   * Sets an item of the matrix. No checking of i and j is done; they must be correct.
   *
   * @param i row number.
   * @param j column number.
   * @param val value to store.
   */
  public void set(final int i, final int j, final int val) {
    p[bandIndex(i, j)] = val;
  }
}

See Jan Daciuk's s_fsa package. */ public class Speller { /** Maximum length of the word to be checked. */ public static final int MAX_WORD_LENGTH = 120; static final int FREQ_RANGES = 'Z' - 'A' + 1; static final int FIRST_RANGE_CODE = 'A'; // less frequent words // FIXME: this is an upper limit for replacement searches, we need // proper tree traversal instead of generation of all possible candidates static final int UPPER_SEARCH_LIMIT = 15; private static final int MIN_WORD_LENGTH = 4; private static final int MAX_RECURSION_LEVEL = 6; private final int editDistance; private int effectEditDistance; // effective edit distance private final HMatrix hMatrix; private char[] candidate; /* current replacement */ private int candLen; private int wordLen; /* length of word being processed */ private char[] wordProcessed; /* word being processed */ /** Replacement pattern with optional start/end anchor. */ private static final class Pattern { final char[] chars; final boolean startAnchor; final boolean endAnchor; Pattern(char[] chars, boolean startAnchor, boolean endAnchor) { this.chars = chars; this.startAnchor = startAnchor; this.endAnchor = endAnchor; } } private Map> replacementsAnyToOne = new HashMap<>(); private Map> replacementsAnyToTwo = new HashMap<>(); /** Keys may carry ^ / $ anchors; values are the replacement strings. */ private Map> replacementsTheRest = new HashMap<>(); private boolean containsSeparators = true; /** Internal reusable buffer for encoding words into byte arrays using {@link #encoder}. */ private ByteBuffer byteBuffer = ByteBuffer.allocate(MAX_WORD_LENGTH); /** Internal reusable buffer for encoding words into byte arrays using {@link #encoder}. */ private CharBuffer charBuffer = CharBuffer.allocate(MAX_WORD_LENGTH); /** Reusable match result. */ private final MatchResult matchResult = new MatchResult(); /** * Features of the compiled dictionary. 
/**
 * Creates a speller with the default maximum edit distance of 1.
 *
 * @param dictionary the dictionary (automaton plus metadata) to spell against
 */
public Speller(final Dictionary dictionary) {
  this(dictionary, 1);
}

/**
 * Creates a speller for the given dictionary and maximum edit distance.
 *
 * @param dictionary the dictionary (automaton plus metadata) to spell against
 * @param editDistance maximum edit distance for suggestion candidates
 * @throws IllegalArgumentException if the dictionary has no root node or no metadata
 */
public Speller(final Dictionary dictionary, final int editDistance) {
  this.editDistance = editDistance;
  // The H matrix is sized for the longest word this speller will ever process.
  this.hMatrix = new HMatrix(editDistance, MAX_WORD_LENGTH);

  this.dictionaryMetadata = dictionary.metadata;
  this.rootNode = dictionary.fsa.getRootNode();
  this.fsa = dictionary.fsa;
  this.matcher = new FSATraversal(fsa);
  this.finalStatesIterator = new ByteSequenceIterator(fsa, rootNode);

  if (rootNode == 0) {
    throw new IllegalArgumentException("Dictionary must have at least the root node.");
  }

  if (dictionaryMetadata == null) {
    throw new IllegalArgumentException("Dictionary metadata must not be null.");
  }

  encoder = dictionaryMetadata.getEncoder();
  decoder = dictionaryMetadata.getDecoder();

  // Multibyte separator will result in an exception here.
  dictionaryMetadata.getSeparatorAsChar();

  this.createReplacementsMaps();
}
key.length() - 1 : key.length(); return key.substring(start, end); } private void createReplacementsMaps() { for (Map.Entry> entry : dictionaryMetadata.getReplacementPairs().entrySet()) { String rawKey = entry.getKey(); boolean startAnchor = isStartAnchored(rawKey); boolean endAnchor = isEndAnchored(rawKey); String strippedKey = stripAnchors(rawKey); for (String s : entry.getValue()) { // replacements any to one: key is the 1-char replacement target if (s.length() == 1) { Pattern p = new Pattern(strippedKey.toCharArray(), startAnchor, endAnchor); if (!replacementsAnyToOne.containsKey(s.charAt(0))) { List list = new ArrayList<>(); list.add(p); replacementsAnyToOne.put(s.charAt(0), list); } else { replacementsAnyToOne.get(s.charAt(0)).add(p); } } // replacements any to two: key is the 2-char replacement target else if (s.length() == 2) { Pattern p = new Pattern(strippedKey.toCharArray(), startAnchor, endAnchor); if (!replacementsAnyToTwo.containsKey(s)) { List list = new ArrayList<>(); list.add(p); replacementsAnyToTwo.put(s, list); } else { replacementsAnyToTwo.get(s).add(p); } } else { // replacements with longer targets: key keeps anchors for getAllReplacements if (!replacementsTheRest.containsKey(rawKey)) { List list = new ArrayList<>(); list.add(s); replacementsTheRest.put(rawKey, list); } else { replacementsTheRest.get(rawKey).add(s); } } } } } private ByteBuffer charSequenceToBytes(final CharSequence word) throws UnmappableInputException { // Encode word characters into bytes in the same encoding as the FSA's. charBuffer = BufferUtils.clearAndEnsureCapacity(charBuffer, word.length()); for (int i = 0; i < word.length(); i++) { final char chr = word.charAt(i); charBuffer.put(chr); } charBuffer.flip(); return BufferUtils.charsToBytes(encoder, charBuffer, byteBuffer); } /** * Checks whether the word is misspelled, by performing a series of checks according to properties * of the dictionary. * *

If the flag fsa.dict.speller.ignore-punctuation is set, then all non-alphabetic * characters are considered to be correctly spelled. * *

If the flag fsa.dict.speller.ignore-numbers is set, then all words containing * decimal digits are considered to be correctly spelled. * *

If the flag fsa.dict.speller.ignore-camel-case is set, then all CamelCase words * are considered to be correctly spelled. * *

If the flag fsa.dict.speller.ignore-all-uppercase is set, then all alphabetic * words composed of only uppercase characters are considered to be correctly spelled. * *

public boolean isMisspelled(final String word) {
  // dictionaries usually do not contain punctuation
  String wordToCheck = word;
  if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) {
    wordToCheck =
        DictionaryLookup.applyReplacements(word, dictionaryMetadata.getInputConversionPairs());
  }
  // A word counts as "alphabetic" here unless it is a single non-alphabetic character;
  // multi-character words are not inspected character by character.
  boolean isAlphabetic = wordToCheck.length() != 1 || isAlphabetic(wordToCheck.charAt(0));
  // Each clause below short-circuits: the word is correctly spelled (returns false) as soon
  // as one of the "ignore" policies applies or a dictionary lookup succeeds.
  return wordToCheck.length() > 0
      // single punctuation characters may be globally accepted
      && (!dictionaryMetadata.isIgnoringPunctuation() || isAlphabetic)
      // words containing digits may be globally accepted
      && (!dictionaryMetadata.isIgnoringNumbers() || containsNoDigit(wordToCheck))
      // CamelCase words may be globally accepted
      && !(dictionaryMetadata.isIgnoringCamelCase() && isCamelCase(wordToCheck))
      // ALL-UPPERCASE words may be globally accepted
      && !(dictionaryMetadata.isIgnoringAllUppercase()
          && isAlphabetic
          && isAllUppercase(wordToCheck))
      // the word as given
      && !isInDictionary(wordToCheck)
      // with case conversion enabled, also try the lowercased form (for non-mixed-case words)
      // and, for ALL-UPPERCASE words, the Initial-uppercase form
      && (!dictionaryMetadata.isConvertingCase()
          || !(!isMixedCase(wordToCheck)
              && (isInDictionary(wordToCheck.toLowerCase(dictionaryMetadata.getLocale()))
                  || isAllUppercase(wordToCheck)
                      && isInDictionary(initialUppercase(wordToCheck)))));
}
public boolean isInDictionary(final CharSequence word) {
  try {
    byteBuffer = charSequenceToBytes(word);
  } catch (UnmappableInputException e) {
    // The word contains characters outside the dictionary's charset,
    // so it cannot possibly be stored in the automaton.
    return false;
  }

  // Try to find a partial match in the dictionary.
  final MatchResult match =
      matcher.match(matchResult, byteBuffer.array(), 0, byteBuffer.remaining(), rootNode);

  // Make sure the word doesn't contain a separator if there is an exact match.
  // NOTE(review): containsSeparators is an instance-level flag that starts true and is
  // re-evaluated only on exact matches while still true; once it drops to false it is not
  // recomputed for later words. Presumably input words never legitimately contain the
  // separator byte — verify against callers before relying on this.
  if (containsSeparators && match.kind == EXACT_MATCH) {
    containsSeparators = false;
    for (int i = 0; i < word.length(); i++) {
      if (word.charAt(i) == dictionaryMetadata.getSeparator()) {
        containsSeparators = true;
        break;
      }
    }
  }

  if (match.kind == EXACT_MATCH && !containsSeparators) {
    return true;
  }

  // Otherwise accept the word if it is a proper prefix of an entry whose next
  // transition is the separator (i.e. "word<SEP>..." exists in the automaton).
  return containsSeparators
      && match.kind == SEQUENCE_IS_A_PREFIX
      && byteBuffer.remaining() > 0
      && fsa.getArc(match.node, dictionaryMetadata.getSeparator()) != 0;
}
This algorithm is inspired by spell.cc in * s_fsa package by Jan Daciuk. * * @param original The original misspelled word. * @return The list of suggested pairs, as CandidateData with space-concatenated strings. */ public List replaceRunOnWordCandidates(final String original) { final List candidates = new ArrayList<>(); String wordToCheck = original; if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) { wordToCheck = DictionaryLookup.applyReplacements( original, dictionaryMetadata.getInputConversionPairs()); } if (!isInDictionary(wordToCheck) && dictionaryMetadata.isSupportingRunOnWords()) { Locale locale = dictionaryMetadata.getLocale(); for (int i = 1; i < wordToCheck.length(); i++) { // chop from left to right final String prefix = wordToCheck.substring(0, i); final String suffix = wordToCheck.substring(i); if (isInDictionary(suffix) // camel case words: e.g. GreatElephant || (!isNotCapitalizedWord(suffix) && isInDictionary(suffix.toLowerCase(locale)))) { if (isInDictionary(prefix)) { addReplacement(candidates, prefix + " " + suffix); } else if (Character.isUpperCase(prefix.charAt(0)) && isInDictionary(prefix.toLowerCase(locale))) { // a word that's uppercase just because used at sentence start addReplacement(candidates, prefix + " " + suffix); } } } } return candidates; } /** * Propose suggestions for misspelled run-on words. This algorithm is inspired by spell.cc in * s_fsa package by Jan Daciuk. * * @param original The original misspelled word. * @return The list of suggested pairs, as space-concatenated strings. 
*/ public List replaceRunOnWords(final String original) { final List candidateData = replaceRunOnWordCandidates(original); final List candidates = new ArrayList<>(); for (CandidateData candidate : candidateData) { candidates.add(candidate.word); } return candidates; } private void addReplacement(List candidates, String replacement) { if (dictionaryMetadata.getOutputConversionPairs().isEmpty()) { candidates.add(new CandidateData(replacement, 1)); } else { candidates.add( new CandidateData( DictionaryLookup.applyReplacements( replacement, dictionaryMetadata.getOutputConversionPairs()), 1)); } } /** * Find similar words even if the original word is a correct word that exists in the dictionary * * @param word The original word. * @return A list of suggested candidate replacements. */ public ArrayList findSimilarWordCandidates(String word) { return findReplacementCandidates(word, true); } public ArrayList findSimilarWords(String word) { final List result = findSimilarWordCandidates(word); final ArrayList resultSuggestions = new ArrayList<>(result.size()); for (CandidateData cd : result) { resultSuggestions.add(cd.getWord()); } return resultSuggestions; } /** * Find suggestions by using K. Oflazer's algorithm. See Jan Daciuk's s_fsa package, spell.cc for * further explanation. * * @param word The original misspelled word. * @return A list of suggested replacements. */ public ArrayList findReplacements(String word) { final List result = findReplacementCandidates(word); final ArrayList resultSuggestions = new ArrayList<>(result.size()); for (CandidateData cd : result) { resultSuggestions.add(cd.getWord()); } return resultSuggestions; } /** * Find and return suggestions by using K. Oflazer's algorithm. See Jan Daciuk's s_fsa package, * spell.cc for further explanation. This method is identical to {@link #findReplacements}, but * returns candidate terms with their edit distance scores. * * @param word The original misspelled word. 
* @return A list of suggested candidate replacements. */ public ArrayList findReplacementCandidates(String word) { return findReplacementCandidates(word, false); } private ArrayList findReplacementCandidates( String word, boolean evenIfWordInDictionary) { hMatrix.reset(); if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) { word = DictionaryLookup.applyReplacements(word, dictionaryMetadata.getInputConversionPairs()); } // candidate strings, including same additional data such as edit distance from the original // word. List candidates = new ArrayList<>(); if (word.length() > 0 && word.length() < MAX_WORD_LENGTH && (!isInDictionary(word) || evenIfWordInDictionary)) { List wordsToCheck = new ArrayList<>(); if (replacementsTheRest != null && word.length() > 1) { for (final String wordChecked : getAllReplacements(word, 0, 0)) { if (isInDictionary(wordChecked)) { candidates.add(new CandidateData(wordChecked, 0)); } else { String lowerWord = wordChecked.toLowerCase(dictionaryMetadata.getLocale()); String upperWord = wordChecked.toUpperCase(dictionaryMetadata.getLocale()); if (isInDictionary(lowerWord)) { // add the word as it is in the dictionary, not mixed-case versions of it candidates.add(new CandidateData(lowerWord, 0)); } if (isInDictionary(upperWord)) { candidates.add(new CandidateData(upperWord, 0)); } if (lowerWord.length() > 1) { String firstUpperWord = Character.toUpperCase(lowerWord.charAt(0)) + lowerWord.substring(1); if (isInDictionary(firstUpperWord)) { candidates.add(new CandidateData(firstUpperWord, 0)); } } } wordsToCheck.add(wordChecked); } } else { wordsToCheck.add(word); } // Even if a candidate was found with the replacement pairs (which are usual errors), // there might be more good candidates (see issue #94): int i = 1; for (final String wordChecked : wordsToCheck) { i++; if (i > UPPER_SEARCH_LIMIT) { // for performance reasons, do not search too deeply break; } wordProcessed = wordChecked.toCharArray(); wordLen = wordProcessed.length; if 
(wordLen < MIN_WORD_LENGTH && i > 2) { // three-letter replacements make little sense anyway break; } candidate = new char[MAX_WORD_LENGTH]; candLen = candidate.length; effectEditDistance = wordLen <= editDistance ? wordLen - 1 : editDistance; charBuffer = BufferUtils.clearAndEnsureCapacity(charBuffer, MAX_WORD_LENGTH); byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, MAX_WORD_LENGTH); final byte[] prevBytes = new byte[0]; findRepl(candidates, 0, fsa.getRootNode(), prevBytes, 0, 0, -1, null, '\0'); } } Collections.sort(candidates); // Apply replacements, prune duplicates while preserving the candidate order. final Set words = new HashSet<>(); final ArrayList result = new ArrayList<>(candidates.size()); for (final CandidateData cd : candidates) { String replaced = DictionaryLookup.applyReplacements( cd.getWord(), dictionaryMetadata.getOutputConversionPairs()); // Add only the first occurrence of a given word. if (words.add(replaced) && !replaced.equals(word)) { result.add(new CandidateData(replaced, cd.origDistance)); } } return result; } private void findRepl( List candidates, final int depth, final int node, final byte[] prevBytes, final int wordIndex, final int candIndex, final int minLookbackWordIndex, final String lastAnyToOneSource, final char lastAnyToOneTarget) { int dist = 0; for (int arc = fsa.getFirstArc(node); arc != 0; arc = fsa.getNextArc(arc)) { byteBuffer = BufferUtils.clearAndEnsureCapacity(byteBuffer, prevBytes.length + 1); byteBuffer.put(prevBytes); byteBuffer.put(fsa.getArcLabel(arc)); final int bufPos = byteBuffer.position(); byteBuffer.flip(); charBuffer.clear(); decoder.reset(); final CoderResult c = decoder.decode(byteBuffer, charBuffer, true); if (c.isMalformed()) { // incomplete multi-byte sequence: accumulate bytes and descend final byte[] prev = new byte[bufPos]; byteBuffer.position(0); byteBuffer.get(prev); if (!fsa.isArcTerminal(arc)) { findRepl( candidates, depth, fsa.getEndNode(arc), prev, wordIndex, candIndex, 
minLookbackWordIndex, lastAnyToOneSource, lastAnyToOneTarget); // note: depth is not incremented } byteBuffer.clear(); } else if (!c.isError()) { // unmappable characters are silently discarded decoder.flush(charBuffer); charBuffer.flip(); candidate[candIndex] = charBuffer.get(); charBuffer.clear(); byteBuffer.clear(); int lengthReplacement; // replacement "any to two" if ((lengthReplacement = matchAnyToTwo( wordIndex, candIndex, minLookbackWordIndex, lastAnyToOneSource, lastAnyToOneTarget)) > 0) { // the replacement takes place at the end of the candidate if (isEndOfCandidate(arc, wordIndex) && (dist = hMatrix.get(depth - 1, depth - 1)) <= effectEditDistance) { if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2)) > 0) { // there are extra letters in the word after the replacement dist = dist + Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 2)); } if (dist <= effectEditDistance) { candidates.add(new CandidateData(String.valueOf(candidate, 0, candIndex + 1), dist)); } } if (isArcNotTerminal(arc, candIndex)) { int x = hMatrix.get(depth, depth); hMatrix.set(depth, depth, hMatrix.get(depth - 1, depth - 1)); findRepl( candidates, Math.max(0, depth), fsa.getEndNode(arc), new byte[0], wordIndex + lengthReplacement - 1, candIndex + 1, minLookbackWordIndex, lastAnyToOneSource, lastAnyToOneTarget); hMatrix.set(depth, depth, x); } } // replacement "any to one" if ((lengthReplacement = matchAnyToOne(wordIndex, candIndex)) > 0) { // the replacement takes place at the end of the candidate if (isEndOfCandidate(arc, wordIndex) && (dist = hMatrix.get(depth, depth)) <= effectEditDistance) { if (Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1)) > 0) { // there are extra letters in the word after the replacement dist = dist + Math.abs(wordLen - 1 - (wordIndex + lengthReplacement - 1)); } if (dist <= effectEditDistance) { candidates.add(new CandidateData(String.valueOf(candidate, 0, candIndex + 1), dist)); } } if (isArcNotTerminal(arc, candIndex)) { String 
newAnyToOneSource = new String(wordProcessed, wordIndex, lengthReplacement); findRepl( candidates, depth, fsa.getEndNode(arc), new byte[0], wordIndex + lengthReplacement, candIndex + 1, wordIndex + lengthReplacement, newAnyToOneSource, candidate[candIndex]); } } // general if (cuted(depth, wordIndex, candIndex) <= effectEditDistance) { if ((isEndOfCandidate(arc, wordIndex)) && (dist = ed(wordLen - 1 - (wordIndex - depth), depth, wordLen - 1, candIndex)) <= effectEditDistance) { candidates.add(new CandidateData(String.valueOf(candidate, 0, candIndex + 1), dist)); } if (isArcNotTerminal(arc, candIndex)) { findRepl( candidates, depth + 1, fsa.getEndNode(arc), new byte[0], wordIndex + 1, candIndex + 1, minLookbackWordIndex, lastAnyToOneSource, lastAnyToOneTarget); } } } } } private boolean isArcNotTerminal(final int arc, final int candIndex) { return !fsa.isArcTerminal(arc) && !(containsSeparators && candidate[candIndex] == dictionaryMetadata.getSeparatorAsChar()); } private boolean isEndOfCandidate(final int arc, final int wordIndex) { return (fsa.isArcFinal(arc) || isBeforeSeparator(arc)) // candidate has proper length && (Math.abs(wordLen - 1 - (wordIndex)) <= effectEditDistance); } private boolean isBeforeSeparator(final int arc) { if (containsSeparators) { final int arc1 = fsa.getArc(fsa.getEndNode(arc), dictionaryMetadata.getSeparator()); return arc1 != 0 && !fsa.isArcTerminal(arc1); } return false; } /** * Calculates edit distance. * * @param i length of first word (here: misspelled) - 1; * @param j length of second word (here: candidate) - 1. * @param wordIndex (TODO: javadoc?) * @param candIndex (TODO: javadoc?) * @return Edit distance between the two words. Remarks: See Oflazer. 
*/ public int ed(final int i, final int j, final int wordIndex, final int candIndex) { int result; int a, b, c; if (areEqual(wordProcessed[wordIndex], candidate[candIndex])) { // last characters are the same result = hMatrix.get(i, j); } else if (wordIndex > 0 && candIndex > 0 && wordProcessed[wordIndex] == candidate[candIndex - 1] && wordProcessed[wordIndex - 1] == candidate[candIndex]) { // last two characters are transposed a = hMatrix.get(i - 1, j - 1); // transposition, e.g. ababab, ababba b = hMatrix.get(i + 1, j); // deletion, e.g. abab, aba c = hMatrix.get(i, j + 1); // insertion e.g. aba, abab result = 1 + min(a, b, c); } else { // otherwise a = hMatrix.get(i, j); // replacement, e.g. ababa, ababb b = hMatrix.get(i + 1, j); // deletion, e.g. ab, a c = hMatrix.get(i, j + 1); // insertion e.g. a, ab result = 1 + min(a, b, c); } hMatrix.set(i + 1, j + 1, result); return result; } // by Jaume Ortola private boolean areEqual(final char x, final char y) { if (x == y) { return true; } if (dictionaryMetadata.getEquivalentChars() != null) { List chars = dictionaryMetadata.getEquivalentChars().get(x); if (chars != null && chars.contains(y)) { return true; } } if (dictionaryMetadata.isIgnoringDiacritics()) { String xn = Normalizer.normalize(Character.toString(x), Form.NFD); String yn = Normalizer.normalize(Character.toString(y), Form.NFD); if (xn.charAt(0) == yn.charAt(0)) { // avoid case conversion, if possible return true; } if (dictionaryMetadata.isConvertingCase()) { // again case conversion only when needed -- we // do not need String.lowercase because we only check // single characters, so a cheaper method is enough if (Character.isLetter(xn.charAt(0))) { boolean testNeeded = Character.isLowerCase(xn.charAt(0)) != Character.isLowerCase(yn.charAt(0)); if (testNeeded) { return Character.toLowerCase(xn.charAt(0)) == Character.toLowerCase(yn.charAt(0)); } } } return xn.charAt(0) == yn.charAt(0); } return false; } /** * Calculates cut-off edit distance. 
 *
   * @param depth current length of candidates.
   * @param candIndex index of the compared character in the candidate (passed through to {@link #ed})
   * @param wordIndex index into the misspelled word (passed through to {@link #ed})
   * @return Cut-off edit distance. Remarks: See Oflazer.
   */
  public int cuted(final int depth, final int wordIndex, final int candIndex) {
    final int l = Math.max(0, depth - effectEditDistance); // min chars from word to consider - 1
    final int u = Math.min(wordLen - 1 - (wordIndex - depth), depth + effectEditDistance); // max chars from word to consider - 1
    int minEd = effectEditDistance + 1; // what is to be computed
    int wi = wordIndex + l - depth;
    int d;
    // minimum edit distance over the window [l, u] of the misspelled word
    for (int i = l; i <= u; i++, wi++) {
      if ((d = ed(i, depth, wi, candIndex)) < minEd) {
        minEd = d;
      }
    }
    return minEd;
  }

  // Match the last letter of the candidate against two or more letters of the word.
  private int matchAnyToOne(final int wordIndex, final int candIndex) {
    if (replacementsAnyToOne.containsKey(candidate[candIndex])) {
      for (final Pattern p : replacementsAnyToOne.get(candidate[candIndex])) {
        if (p.startAnchor && wordIndex != 0) continue;
        int i = 0;
        while (i < p.chars.length
            && (wordIndex + i) < wordLen
            && p.chars[i] == wordProcessed[wordIndex + i]) {
          i++;
        }
        if (i == p.chars.length) {
          if (p.endAnchor && wordIndex + i != wordLen) continue;
          return i;
        }
      }
    }
    return 0;
  }

  // Match the last two letters of the candidate against one or more letters of the word.
  private int matchAnyToTwo(
      final int wordIndex,
      final int candIndex,
      final int minLookbackWordIndex,
      final String lastAnyToOneSource,
      final char lastAnyToOneTarget) {
    if (candIndex > 0 && candIndex < candidate.length && wordIndex > 0) {
      char[] twoChar = {candidate[candIndex - 1], candidate[candIndex]};
      String sTwoChar = new String(twoChar);
      if (replacementsAnyToTwo.containsKey(sTwoChar)) {
        for (final Pattern p : replacementsAnyToTwo.get(sTwoChar)) {
          if (p.startAnchor && wordIndex - 1 != 0) continue;
          if (p.chars.length == 2
              && wordIndex < wordLen
              && candidate[candIndex - 1] == wordProcessed[wordIndex - 1]
              && candidate[candIndex] == wordProcessed[wordIndex]) {
            return 0; // unnecessary replacements
          }
          int i = 0;
          while (i < p.chars.length
              && (wordIndex - 1 + i) < wordLen
              && p.chars[i] == wordProcessed[wordIndex - 1 + i]) {
            i++;
          }
          if (i == p.chars.length) {
            if (p.endAnchor && wordIndex - 1 + i != wordLen) continue;
            // Reject if this match directly reverses a previous anyToOne match at an overlapping
            // position
            if (wordIndex - 1 < minLookbackWordIndex
                && lastAnyToOneSource != null
                && p.chars.length == 1
                && p.chars[0] == lastAnyToOneTarget
                && sTwoChar.equals(lastAnyToOneSource)) {
              continue;
            }
            return i;
          }
        }
      }
    }
    return 0;
  }

  // Minimum of three integers.
  private static int min(final int a, final int b, final int c) {
    return Math.min(a, Math.min(b, c));
  }

  /**
   * Copy-paste of Character.isAlphabetic() (needed as we require only 1.6)
   *
   * @param codePoint The input character.
   * @return True if the character is a Unicode alphabetic character.
   */
  static boolean isAlphabetic(final int codePoint) {
    return ((1 << Character.UPPERCASE_LETTER
                | 1 << Character.LOWERCASE_LETTER
                | 1 << Character.TITLECASE_LETTER
                | 1 << Character.MODIFIER_LETTER
                | 1 << Character.OTHER_LETTER
                | 1 << Character.LETTER_NUMBER)
            >> Character.getType(codePoint)
        & 1) != 0;
  }

  /**
   * Checks whether a string contains a digit. Used for ignoring words with numbers.
   *
   * @param s Word to be checked.
   * @return True if there is no digit inside the word.
   */
  static boolean containsNoDigit(final String s) {
    for (int k = 0; k < s.length(); k++) {
      if (Character.isDigit(s.charAt(k))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns true if str is made up of all-uppercase characters (ignoring characters
   * for which no upper-/lowercase distinction exists).
   */
  boolean isAllUppercase(final String str) {
    for (int i = 0; i < str.length(); i++) {
      char c = str.charAt(i);
      if (Character.isLetter(c) && Character.isLowerCase(c)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Returns true if str contains at least one letter that is not lowercase, i.e. it is
   * NOT made up of all-lowercase characters (ignoring characters
   * for which no upper-/lowercase distinction exists).
   */
  boolean isNotAllLowercase(final String str) {
    for (int i = 0; i < str.length(); i++) {
      char c = str.charAt(i);
      if (Character.isLetter(c) && !Character.isLowerCase(c)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Checks whether the word is capitalized (first letter uppercase, remaining letters lowercase).
   *
   * @param str input string
   * @return true if the word is NOT of the capitalized form.
   */
  boolean isNotCapitalizedWord(final String str) {
    if (isNotEmpty(str) && Character.isUpperCase(str.charAt(0))) {
      for (int i = 1; i < str.length(); i++) {
        char c = str.charAt(i);
        if (Character.isLetter(c) && !Character.isLowerCase(c)) {
          return true;
        }
      }
      return false;
    }
    return true;
  }

  /**
   * Helper method to replace calls to "".equals().
   *
   * @param str String to check
   * @return true if the string is neither null nor empty
   */
  static boolean isNotEmpty(final String str) {
    return str != null && str.length() != 0;
  }

  /**
   * @param str input str
   * @return Returns true if str is MixedCase.
   */
  boolean isMixedCase(final String str) {
    return !isAllUppercase(str) && isNotCapitalizedWord(str) && isNotAllLowercase(str);
  }

  /**
   * @param str The string to check.
   * @return Returns true if str is CamelCase. Note that German compounds with a dash (like
   *     "Waschmaschinen-Test") are also considered camel case by this method.
   */
  public boolean isCamelCase(final String str) {
    return isNotEmpty(str)
        && !isAllUppercase(str)
        && isNotCapitalizedWord(str)
        && Character.isUpperCase(str.charAt(0))
        && (!(str.length() > 1) || Character.isLowerCase(str.charAt(1)))
        && isNotAllLowercase(str);
  }

  /**
   * Used to determine whether the dictionary supports case conversions.
   *
   * @return boolean value that answers this question in a deep and meaningful way.
   * @since 1.9
   */
  public boolean convertsCase() {
    return dictionaryMetadata.isConvertingCase();
  }

  /**
   * @param str The string to find the replacements for.
   * @param fromIndex The index from which replacements are found.
   * @param level The recursion level. The search stops if level is > MAX_RECURSION_LEVEL.
* @return A list of all possible replacements of a {#link str} given string */ public List getAllReplacements(final String str, final int fromIndex, final int level) { List replaced = new ArrayList<>(); if (level > MAX_RECURSION_LEVEL) { // Stop searching at some point replaced.add(str); return replaced; } StringBuilder sb = new StringBuilder(); sb.append(str); int index = MAX_WORD_LENGTH; String key = ""; int keyLength = 0; boolean found = false; // find first possible replacement after fromIndex position String strippedKeyForSelected = ""; for (final String auxKey : replacementsTheRest.keySet()) { boolean startAnchor = isStartAnchored(auxKey); boolean endAnchor = isEndAnchored(auxKey); String stripped = (startAnchor || endAnchor) ? stripAnchors(auxKey) : auxKey; int auxIndex; if (startAnchor && fromIndex > 0) { continue; // ^ anchor only valid from the beginning } else if (startAnchor) { auxIndex = sb.indexOf(stripped, 0) == 0 ? 0 : -1; } else if (endAnchor) { int expectedIndex = sb.length() - stripped.length(); auxIndex = (expectedIndex >= fromIndex && sb.indexOf(stripped, expectedIndex) == expectedIndex) ? expectedIndex : -1; } else { auxIndex = sb.indexOf(auxKey, fromIndex); } if (auxIndex > -1 && (auxIndex < index || (auxIndex == index && !(stripped.length() < keyLength)))) { // select the longest possible key index = auxIndex; key = auxKey; keyLength = stripped.length(); strippedKeyForSelected = stripped; } } if (index < MAX_WORD_LENGTH) { for (final String rep : replacementsTheRest.get(key)) { // start a branch without replacement (only once per key) if (!found) { replaced.addAll( getAllReplacements(str, index + strippedKeyForSelected.length(), level + 1)); found = true; } // avoid unnecessary replacements (ex. 
don't replace L by L·L when L·L already present) int ind = sb.indexOf(rep, fromIndex - rep.length() + 1); if (rep.length() > strippedKeyForSelected.length() && ind > -1 && (ind == index || ind == index - rep.length() + 1)) { continue; } // start a branch with replacement sb.replace(index, index + strippedKeyForSelected.length(), rep); replaced.addAll(getAllReplacements(sb.toString(), index + rep.length(), level + 1)); sb.setLength(0); sb.append(str); } } if (!found) { replaced.add(sb.toString()); } return replaced; } /** * Sets up the word and candidate. Used only to test the edit distance in JUnit tests. * * @param word the first word * @param candidate the second word used for edit distance calculation */ void setWordAndCandidate(final String word, final String candidate) { wordProcessed = word.toCharArray(); wordLen = wordProcessed.length; this.candidate = candidate.toCharArray(); candLen = this.candidate.length; effectEditDistance = wordLen <= editDistance ? wordLen - 1 : editDistance; } public final int getWordLen() { return wordLen; } public final int getCandLen() { return candLen; } public final int getEffectiveED() { return effectEditDistance; } /** * Used to sort candidates according to edit distance, and possibly according to their frequency * in the future. */ public final class CandidateData implements Comparable { private final String word; private final int origDistance; private final int distance; CandidateData(final String word, final int distance) { this.word = word; this.origDistance = distance; this.distance = distance * FREQ_RANGES + FREQ_RANGES - getFrequency(word) - 1; } public final String getWord() { return word; } public final int getDistance() { return distance; } @Override public int compareTo(final CandidateData cd) { // Assume no overflow. 
return Integer.compare(this.distance, cd.getDistance());
    }

    @Override
    public String toString() {
      return word + '/' + distance;
    }
  }
}


================================================
FILE: morfologik-speller/src/test/java/morfologik/speller/HMatrixTest.java
================================================
package morfologik.speller;

import static org.junit.jupiter.api.Assertions.*;

import org.junit.jupiter.api.Test;

public class HMatrixTest {
  private static final int MAX_WORD_LENGTH = 120;

  @Test
  public void stressTestInit() {
    for (int i = 0; i < 10; i++) {
      // test if we don't get beyond array limits etc.
      HMatrix H = new HMatrix(i, MAX_WORD_LENGTH);
      assertEquals(0, H.get(1, 1));
    }
  }
}


================================================
FILE: morfologik-speller/src/test/java/morfologik/speller/SpellerTest.java
================================================
package morfologik.speller;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import morfologik.stemming.Dictionary;
import org.assertj.core.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;

public class SpellerTest {
  private static Dictionary dictionary;

  @BeforeAll
  public static void setup() throws Exception {
    final URL url = SpellerTest.class.getResource("slownik.dict");
    dictionary = Dictionary.read(url);
  }

  /*
  @Test
  public void testAbka() throws Exception {
    final Speller spell = new Speller(dictionary, 2);
    System.out.println("Replacements:");
    for (String s : spell.findReplacements("abka")) {
      System.out.println(s);
    }
  }
  */

  @Test
  public void testRunonWords() throws IOException {
    final Speller spell = new Speller(dictionary);
    Assertions.assertThat(spell.replaceRunOnWords("abaka")).isEmpty();
    Assertions.assertThat(spell.replaceRunOnWords("abakaabace")).contains("abaka abace");
Assertions.assertThat(spell.replaceRunOnWords("Abakaabace")).contains("Abaka abace");
    Assertions.assertThat(spell.replaceRunOnWords("AbakaAbace")).contains("Abaka Abace");
    Assertions.assertThat(spell.replaceRunOnWords("abakaAbace")).contains("abaka Abace");

    // Test on an morphological dictionary - should work as well
    final URL url1 = getClass().getResource("test-infix.dict");
    final Speller spell1 = new Speller(Dictionary.read(url1));
    assertTrue(spell1.replaceRunOnWords("Rzekunia").isEmpty());
    assertTrue(spell1.replaceRunOnWords("RzekuniaRzeczypospolitej").contains("Rzekunia Rzeczypospolitej"));
    assertTrue(spell1.replaceRunOnWords("RzekuniaRze").isEmpty()); // Rze is not found but is a prefix

    final URL url2 = getClass().getResource("single-char-word.dict");
    final Speller spell2 = new Speller(Dictionary.read(url2));
    assertTrue(spell2.replaceRunOnWords("alot").contains("a lot"));
    assertTrue(spell2.replaceRunOnWords("Alot").contains("A lot"));
    assertTrue(spell2.replaceRunOnWords("ALot").contains("A Lot"));
    assertTrue(spell2.replaceRunOnWords("LotAmusement").contains("Lot Amusement"));
    // TODO?
    assertTrue(spell2.replaceRunOnWords("LOTAMUSEMENT").contains("LOT AMUSEMENT"));
    assertTrue(spell2.replaceRunOnWords("aalot").contains("aa lot"));
    assertTrue(spell2.replaceRunOnWords("aamusement").contains("a amusement"));
    assertTrue(spell2.replaceRunOnWords("clot").isEmpty());
    assertTrue(spell2.replaceRunOnWords("foobar").isEmpty());
  }

  @Test
  public void testIsInDictionary() throws IOException {
    // Test on an morphological dictionary, including separators
    final URL url1 = getClass().getResource("test-infix.dict");
    final Speller spell1 = new Speller(Dictionary.read(url1));
    assertTrue(spell1.isInDictionary("Rzekunia"));
    assertTrue(!spell1.isInDictionary("Rzekunia+"));
    assertTrue(!spell1.isInDictionary("Rzekunia+aaa"));

    // test UTF-8 dictionary
    final URL url = getClass().getResource("test-utf-spell.dict");
    final Speller spell = new Speller(Dictionary.read(url));
    assertTrue(spell.isInDictionary("jaźń"));
    assertTrue(spell.isInDictionary("zażółć"));
    assertTrue(spell.isInDictionary("żółwiową"));
    assertTrue(spell.isInDictionary("ćwikła"));
    assertTrue(spell.isInDictionary("Żebrowski"));
    assertTrue(spell.isInDictionary("Święto"));
    assertTrue(spell.isInDictionary("Świerczewski"));
    assertTrue(spell.isInDictionary("abc"));
  }

  @Test
  public void testFindReplacements() throws IOException {
    final Speller spell = new Speller(dictionary, 1);
    assertTrue(spell.findReplacements("abka").contains("abak"));
    // check if we get only dictionary words...
    List reps = spell.findReplacements("bak");
    for (final String word : reps) {
      assertTrue(spell.isInDictionary(word));
    }
    assertTrue(spell.findReplacements("abka~~").isEmpty()); // 2 characters more -> edit distance too large
    assertTrue(!spell.findReplacements("Rezkunia").contains("Rzekunia"));

    final URL url1 = getClass().getResource("test-infix.dict");
    final Speller spell1 = new Speller(Dictionary.read(url1));
    assertTrue(spell1.findReplacements("Rezkunia").contains("Rzekunia"));
    // diacritics
    assertTrue(spell1.findReplacements("Rzękunia").contains("Rzekunia"));
    // we should get no candidates for correct words
    assertTrue(spell1.isInDictionary("Rzekunia"));
    assertTrue(spell1.findReplacements("Rzekunia").isEmpty());
    // and no for things that are too different from the dictionary
    assertTrue(spell1.findReplacements("Strefakibica").isEmpty());
    // nothing for nothing
    assertTrue(spell1.findReplacements("").isEmpty());
    // nothing for weird characters
    assertTrue(spell1.findReplacements("\u0000").isEmpty());
    // nothing for other characters
    assertTrue(spell1.findReplacements("«…»").isEmpty());
    // nothing for separator
    assertTrue(spell1.findReplacements("+").isEmpty());
  }

  @Test
  public void testFrequencyNonUTFDictionary() throws IOException {
    final URL url1 = getClass().getResource("test_freq_iso.dict");
    final Speller spell = new Speller(Dictionary.read(url1));
    assertTrue(spell.isInDictionary("a"));
    assertTrue(!spell.isInDictionary("aõh")); // non-encodable in UTF-8
  }

  @Test
  public void testFindReplacementsInUTF() throws IOException {
    final URL url = getClass().getResource("test-utf-spell.dict");
    final Speller spell = new Speller(Dictionary.read(url));
    assertTrue(spell.findReplacements("gęslą").contains("gęślą"));
    assertTrue(spell.findReplacements("ćwikla").contains("ćwikła"));
    assertTrue(spell.findReplacements("Swierczewski").contains("Świerczewski"));
    assertTrue(spell.findReplacements("zółwiową").contains("żółwiową"));
assertTrue(spell.findReplacements("Żebrowsk").contains("Żebrowski"));
    assertTrue(spell.findReplacements("święto").contains("Święto"));
    // note: no diacritics here, but we still get matches!
    assertTrue(spell.findReplacements("gesla").contains("gęślą"));
    assertTrue(spell.findReplacements("swieto").contains("Święto"));
    assertTrue(spell.findReplacements("zolwiowa").contains("żółwiową"));
    // using equivalent characters 'x' = 'ź'
    assertTrue(spell.findReplacements("jexn").contains("jaźń"));
    // 'u' = 'ó', so the edit distance is still small...
    assertTrue(spell.findReplacements("zażulv").contains("zażółć"));
    // 'rz' = 'ż', so the edit distance is still small, but with string replacements...
    assertTrue(spell.findReplacements("zarzulv").contains("zażółć"));
    assertTrue(spell.findReplacements("Rzebrowski").contains("Żebrowski"));
    assertTrue(spell.findReplacements("rzółw").contains("żółw"));
    assertTrue(spell.findReplacements("Świento").contains("Święto"));
    // avoid mixed-case words as suggestions when using replacements ('rz' = 'ż')
    assertTrue(spell.findReplacements("zArzółć").get(0).equals("zażółć"));
  }

  @Test
  public void testFindReplacementsUsingFrequency() throws IOException {
    final URL url = getClass().getResource("dict-with-freq.dict");
    final Speller spell = new Speller(Dictionary.read(url));
    // check if we get only dictionary words...
    List reps = spell.findReplacements("jist");
    for (final String word : reps) {
      assertTrue(spell.isInDictionary(word));
    }
    // get replacements ordered by frequency
    assertTrue(reps.get(0).equals("just"));
    assertTrue(reps.get(1).equals("list"));
    assertTrue(reps.get(2).equals("fist"));
    assertTrue(reps.get(3).equals("mist"));
    assertTrue(reps.get(4).equals("jest"));
    assertTrue(reps.get(5).equals("dist"));
    assertTrue(reps.get(6).equals("gist"));
  }

  @Test
  public void testFindSimilarWords() throws IOException {
    final URL url = getClass().getResource("dict-with-freq.dict");
    final Speller spell = new Speller(Dictionary.read(url));
    List reps = spell.findSimilarWords("fist");
    assertTrue(reps.toString().equals("[list, mist, dist, gist, wist, hist]"));
    reps = spell.findSimilarWords("mist");
    assertTrue(reps.toString().equals("[list, fist, dist, gist, wist, hist]"));
    reps = spell.findSimilarWords("Fist");
    assertTrue(reps.toString().equals("[fist, list, mist, dist, gist, wist, hist]"));
    reps = spell.findSimilarWords("licit");
    assertTrue(reps.toString().equals("[list, fist, mist, dist, gist, wist, hist]"));
  }

  @Test
  public void testConcurrentReplacements() throws IOException {
    final URL url = getClass().getResource("dict-with-freq.dict");
    final Speller spell = new Speller(Dictionary.read(url));
    // only the longest key is selected in replacement pairs
    List reps = spell.getAllReplacements("teached", 0, 0);
    assertTrue(reps.contains("teached"));
    assertTrue(reps.contains("taught"));
    assertTrue(!reps.contains("tgheached"));
  }

  @Test
  public void testIsMisspelled() throws IOException {
    final URL url = getClass().getResource("test-utf-spell.dict");
    final Speller spell = new Speller(Dictionary.read(url));
    assertTrue(!spell.isMisspelled("Paragraf22")); // numbers are ignored
    assertTrue(!spell.isMisspelled("!")); // punctuation is ignored
    assertTrue(spell.isMisspelled("dziekie")); // check that a typo is detected
    assertTrue(!spell.isMisspelled("SłowozGarbem")); // camel-case words are ignored
assertTrue(!spell.isMisspelled("Ćwikła")); // i małe litery assertTrue(!spell.isMisspelled("TOJESTTEST")); // i wielkie litery final Speller oldStyleSpell = new Speller(dictionary, 1); assertTrue(oldStyleSpell.isMisspelled("Paragraf22")); // nie ignorujemy liczby assertTrue(oldStyleSpell.isMisspelled("!")); // nie ignorujemy znaków przestankowych // assertTrue(oldStyleSpell.isMisspelled("SłowozGarbem")); //ignorujemy słowa w stylu wielbłąda assertTrue(oldStyleSpell.isMisspelled("Abaka")); // i małe litery final URL url1 = getClass().getResource("test-infix.dict"); final Speller spell1 = new Speller(Dictionary.read(url1)); assertTrue(!spell1.isMisspelled("Rzekunia")); assertTrue(spell1.isAllUppercase("RZEKUNIA")); assertTrue(spell1.isMisspelled("RZEKUNIAA")); // finds a typo here assertTrue(!spell1.isMisspelled("RZEKUNIA")); // but not here } @Test public void testCamelCase() { final Speller spell = new Speller(dictionary, 1); assertTrue(spell.isCamelCase("CamelCase")); assertTrue(!spell.isCamelCase("Camel")); assertTrue(!spell.isCamelCase("CAMEL")); assertTrue(!spell.isCamelCase("camel")); assertTrue(!spell.isCamelCase("cAmel")); assertTrue(!spell.isCamelCase("CAmel")); assertTrue(!spell.isCamelCase("")); assertTrue(!spell.isCamelCase(null)); } @Test public void testCapitalizedWord() { final Speller spell = new Speller(dictionary, 1); assertTrue(spell.isNotCapitalizedWord("CamelCase")); assertTrue(!spell.isNotCapitalizedWord("Camel")); assertTrue(spell.isNotCapitalizedWord("CAMEL")); assertTrue(spell.isNotCapitalizedWord("camel")); assertTrue(spell.isNotCapitalizedWord("cAmel")); assertTrue(spell.isNotCapitalizedWord("CAmel")); assertTrue(spell.isNotCapitalizedWord("")); } @Test public void testGetAllReplacements() throws IOException { final URL url = getClass().getResource("test-utf-spell.dict"); final Speller spell = new Speller(Dictionary.read(url)); assertTrue(spell.isMisspelled("rzarzerzarzu")); assertEquals( "[rzarzerzarzu]", 
Arrays.toString(spell.getAllReplacements("rzarzerzarzu", 0, 0).toArray())); } @Test public void testEditDistanceCalculation() throws IOException { final Speller spell = new Speller(dictionary, 5); // test examples from Oflazer's paper assertTrue(getEditDistance(spell, "recoginze", "recognize") == 1); assertTrue(getEditDistance(spell, "sailn", "failing") == 3); assertTrue(getEditDistance(spell, "abc", "abcd") == 1); assertTrue(getEditDistance(spell, "abc", "abcde") == 2); // test words from fsa_spell output assertTrue(getEditDistance(spell, "abka", "abaka") == 1); assertTrue(getEditDistance(spell, "abka", "abakan") == 2); assertTrue(getEditDistance(spell, "abka", "abaką") == 2); assertTrue(getEditDistance(spell, "abka", "abaki") == 2); } @Test public void testCutOffEditDistance() throws IOException { final Speller spell2 = new Speller(dictionary, 2); // note: threshold = 2 // test cut edit distance - reprter / repo from Oflazer assertTrue(getCutOffDistance(spell2, "repo", "reprter") == 1); assertTrue(getCutOffDistance(spell2, "reporter", "reporter") == 0); } @Test public void testReplacementsAndDistance2() throws Exception { /*File infoFile = new File("/tmp/morfologik.info"); FileWriter fw1 = new FileWriter(infoFile); fw1.write("fsa.dict.separator=+\n"); fw1.write("fsa.dict.encoding=utf-8\n"); fw1.write("fsa.dict.speller.replacement-pairs=s ss,t d,R Rh,y ij,ę em,em ę\n"); fw1.close(); File inputFile = new File("/tmp/morfologik.txt"); FileWriter fw2 = new FileWriter(inputFile); fw2.write("Mitmuss\n"); fw2.write("Rhythmus\n"); fw2.write("Wald\n"); fw2.write("Band\n"); fw2.write("ijo\n"); fw2.write("ijond\n"); fw2.write("youd\n"); fw2.write("ijoussud\n"); fw2.write("ijoussuud\n"); fw2.write("ijussuud\n"); fw2.write("ijousod\n"); fw2.write("ij\n"); fw2.write("ijo\n"); fw2.write("Ciarkę\n"); fw2.write("Czarkę\n"); fw2.write("Clarke\n"); fw2.write("Clarkiem\n"); fw2.write("Clarkom\n"); fw2.close(); File dictFile = new File("/tmp/morfologik.dict"); String[] 
buildToolOptions = {"-i", inputFile.getAbsolutePath(), "-o", dictFile.getAbsolutePath()}; FSABuildTool.main(buildToolOptions); Dictionary dictionary = Dictionary.read(dictFile); Speller speller = new Speller(dictionary, 3);*/ final URL url = getClass().getResource("reps_dist2.dict"); final Speller speller = new Speller(Dictionary.read(url), 3); List reps = speller.findReplacements("Rytmus"); assertTrue(reps.get(0).equals("Rhythmus")); assertTrue(reps.get(1).equals("Mitmuss")); reps = speller.findReplacements("Walt"); assertTrue(reps.get(0).equals("Wald")); assertTrue(reps.get(1).equals("Band")); reps = speller.findReplacements("yout"); assertTrue(reps.get(0).equals("youd")); assertTrue(reps.get(1).equals("ijond")); assertTrue(reps.get(2).equals("ijo")); reps = speller.findReplacements("yousut"); assertTrue(reps.get(0).equals("ijoussud")); assertTrue(reps.get(1).equals("ijousod")); assertTrue(reps.get(2).equals("ijoussuud")); assertTrue(reps.get(3).equals("youd")); reps = speller.findReplacements("yo"); assertTrue(reps.get(0).equals("ijo")); assertTrue(reps.get(1).equals("ij")); reps = speller.findReplacements("Clarkem"); assertTrue(reps.get(0).equals("Ciarkę")); assertTrue(reps.get(1).equals("Clarke")); assertTrue(reps.get(2).equals("Clarkiem")); assertTrue(reps.get(3).equals("Clarkom")); assertTrue(reps.get(4).equals("Czarkę")); } @Test public void testFindReplacementsConsistentAcrossRepeatedCalls() throws IOException { // HMatrix must be reset at the start of each findReplacementCandidates call. // Without the reset, stale edit-distance values left by a previous traversal // corrupt results: a reused Speller returns different candidates than a // freshly constructed one. 
final List expected = new Speller(dictionary, 3).findReplacements("bak"); final Speller reused = new Speller(dictionary, 3); reused.findReplacements("abka"); // dirties the hMatrix final List actual = reused.findReplacements("bak"); assertEquals(expected, actual); } @Test public void testIssue38AnchoredReplacementPairs() throws Exception { // GH-38: support for ^ (start), $ (end) anchors and _ (space) in replacement-pairs. // editDistance=0 ensures candidates are only found via replacement pairs, not by // coincidental edit distance (e.g. "alot"/"a lot" differ by just 1). final URL url = getClass().getResource("issue38.dict"); final Speller speller = new Speller(Dictionary.read(url), 0); // ^Ij IJ: start-anchored 2-char replacement; "Ijsland" -> "IJsland" assertTrue(speller.findReplacements("Ijsland").contains("IJsland")); // ^alot a_lot: start-anchored replacement with _ as space; "alot" -> "a lot" assertTrue(speller.findReplacements("alot").contains("a lot")); // ^påny$ på_ny: both anchors + _ as space; whole-word replacement "påny" -> "på ny" assertTrue(speller.findReplacements("påny").contains("på ny")); } @Test public void testIssue94() throws Exception { final URL url = getClass().getResource("issue94.dict"); final Speller speller = new Speller(Dictionary.read(url)); List reps = speller.findReplacements("schänken"); assertTrue(reps.get(0).equals("Schänken")); assertTrue(reps.get(1).equals("schenken")); } @Test public void testReciprocalReplacementPairsDoNotProduceZeroDistance() throws IOException { // Searching for "pissara" in a dictionary containing "pissarra", "passara", "passarà". // With reciprocal replacement pairs ss↔s, the bug causes matchAnyToOne (ss→s) followed by // matchAnyToTwo (s→ss) to double-consume word[3]='s', corrupting the HMatrix and making // "passara"/"passarà" appear as distance=0 candidates instead of distance=1. 
final URL url = getClass().getResource("pissara-test.dict"); final Speller speller = new Speller(Dictionary.read(url), 2); List candidates = speller.findReplacementCandidates("pissara"); // "pissarra" (one extra 'r') and "passara" (i→a, ss→s) are both valid distance-1 candidates List words = new ArrayList<>(); for (Speller.CandidateData cd : candidates) { words.add(cd.getWord()); } assertTrue(words.contains("pissarra"), "pissarra should be a suggestion for pissara"); assertTrue(words.contains("passara"), "passara should be a suggestion for pissara"); assertTrue(words.contains("passarà"), "passara should be a suggestion for pissara"); // No candidate should have origDistance=0: that would indicate the double-consumption bug. // With FREQ_RANGES=26 and freq=0: origDistance=0 → distance=25, origDistance=1 → distance=51. for (Speller.CandidateData cd : candidates) { int origDistance = cd.getDistance() / Speller.FREQ_RANGES; assertTrue( origDistance > 0, "Candidate '" + cd.getWord() + "' has unexpected origDistance=0"); } } private int getCutOffDistance(final Speller spell, final String word, final String candidate) { // assuming there is no pair-replacement spell.setWordAndCandidate(word, candidate); final int[] ced = new int[spell.getCandLen() - spell.getWordLen()]; for (int i = 0; i < spell.getCandLen() - spell.getWordLen(); i++) { ced[i] = spell.cuted(spell.getWordLen() + i, spell.getWordLen() + i, spell.getWordLen() + i); } Arrays.sort(ced); // and the min value... 
if (ced.length > 0) { return ced[0]; } return 0; } private int getEditDistance(final Speller spell, final String word, final String candidate) { // assuming there is no pair-replacement spell.setWordAndCandidate(word, candidate); final int maxDistance = spell.getEffectiveED(); final int candidateLen = spell.getCandLen(); final int wordLen = spell.getWordLen(); int ed = 0; for (int i = 0; i < candidateLen; i++) { if (spell.cuted(i, i, i) <= maxDistance) { if (Math.abs(wordLen - 1 - i) <= maxDistance) { ed = spell.ed(wordLen - 1, i, wordLen - 1, i); } } } return ed; } } ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.info ================================================ # # Dictionary properties. # fsa.dict.separator=+ fsa.dict.encoding=iso-8859-2 fsa.dict.encoder=suffix fsa.dict.frequency-included=true fsa.dict.speller.locale=en_US fsa.dict.speller.ignore-diacritics=true fsa.dict.speller.replacement-pairs=ninties 1990s, teached taught, t tgh, rised rose, a ei, ei a, a ey, ey a, ai ie, ie ai, are air, are ear, are eir, air are, air ere, ere air, ere ear, ere eir, ear are, ear air, ear ere, eir are, eir ere, ch te, te ch, ch ti, ti ch, ch tu, tu ch, ch s, s ch, ch k, k ch, f ph, ph f, gh f, f gh, i igh, igh i, i uy, uy i, i ee, ee i, j di, di j, j gg, gg j, j ge, ge j, s ti, ti s, s ci, ci s, k cc, cc k, k qu, qu k, kw qu, o eau, eau o, o ew, ew o, oo ew, ew oo, ew ui, ui ew, oo ui, ui oo, ew u, u ew, oo u, u oo, u oe, oe u, u ieu, ieu u, ue ew, ew ue, uff ough, oo ieu, ieu oo, ier ear, ear ier, ear air, air ear, w qu, qu w, z ss, ss z, shun tion, shun sion, shun cion ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/dict-with-freq.txt ================================================ ageist+C deist+G didst+A digest+J direst+E dist+G divest+I fist+J gist+G grist+I heist+I hist+A jest+H jilt+D joist+F just+P licit+F list+O mist+J 
weest+A wist+C ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/issue38.info ================================================ fsa.dict.separator=+ fsa.dict.encoding=utf-8 fsa.dict.encoder=suffix fsa.dict.speller.replacement-pairs=^Ij IJ,^alot a_lot,^påny$ på_ny ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/issue38.input ================================================ IJsland+IJsland a lot+a lot på ny+på ny ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/issue94.info ================================================ fsa.dict.speller.replacement-pairs=ä e fsa.dict.encoder=SUFFIX fsa.dict.separator=+ fsa.dict.encoding=utf-8 fsa.dict.speller.ignore-diacritics=false ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/pissara-test.info ================================================ fsa.dict.separator=+ fsa.dict.encoding=utf-8 fsa.dict.encoder=NONE fsa.dict.speller.replacement-pairs=s ss,ss s ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/pissara-test.txt ================================================ passara passarà pissarra ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/reps_dist2.info ================================================ fsa.dict.separator=+ fsa.dict.encoding=utf-8 fsa.dict.speller.replacement-pairs=s ss,t d,R Rh,y ij,ę em,em ę fsa.dict.encoder=suffix ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/reps_dist2.txt ================================================ Mitmuss Rhythmus Wald Band ================================================ FILE: 
morfologik-speller/src/test/resources/morfologik/speller/single-char-word.info ================================================ # # Dictionary properties. # fsa.dict.separator=+ fsa.dict.encoding=Cp1250 fsa.dict.encoder=suffix fsa.dict.speller.ignore-diacritics=false fsa.dict.speller.ignore-numbers=false fsa.dict.speller.convert-case=false fsa.dict.speller.ignore-punctuation=false ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/slownik.info ================================================ # # Dictionary properties. # fsa.dict.separator=+ fsa.dict.encoding=Cp1250 fsa.dict.encoder=suffix fsa.dict.speller.ignore-diacritics=false fsa.dict.speller.ignore-numbers=false fsa.dict.speller.convert-case=false fsa.dict.speller.ignore-punctuation=false ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/test-infix.info ================================================ # # Dictionary properties. # fsa.dict.separator=+ fsa.dict.encoding=iso-8859-2 fsa.dict.encoder=infix fsa.dict.speller.ignore-all-uppercase=false ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/test-utf-spell.info ================================================ # # Dictionary properties. # UTF-8 encoding or native2ascii has to be used for non-ASCII data. # fsa.dict.separator=+ fsa.dict.encoding=utf-8 fsa.dict.encoder=suffix fsa.dict.speller.locale=pl_PL fsa.dict.speller.ignore-diacritics=true fsa.dict.speller.equivalent-chars=x ź, l ł, u ó, ó u fsa.dict.speller.replacement-pairs=rz ż, ż rz, ch h, h ch, ę en, en ę ================================================ FILE: morfologik-speller/src/test/resources/morfologik/speller/test_freq_iso.info ================================================ # # Dictionary properties. 
# fsa.dict.separator=+ fsa.dict.encoding=iso-8859-2 fsa.dict.encoder=suffix fsa.dict.frequency-included=true fsa.dict.speller.locale=pl_PL fsa.dict.speller.ignore-diacritics=true fsa.dict.speller.equivalent-chars=x ź, l ł, u ó, ó u fsa.dict.speller.replacement-pairs=ź zi, ł eu, ć ci, ć dż, ć dź, ć dz, c dz, ch h, ci ć, cz czy, dź ć, dź dzi, dż ć, dz ć, dzi dź, edzil ędził, ę em, ę en, ei eja, eja ei, em ę, en ę, eu ł, h ch, he chę, śi ś, ii ija, ija ii, iosc ość, ise się, loz łos, ni ń, ńi ń, ń ni, ą oł, oł ą, oi oja, oja oi, ą om, om ą, ą on, on ą, ru kró, ż rz, rz ż, rz sz, scia ścią, ś si, si ś, sić ść, s sną, sz ż, sz rz, tro rot, u y, wu wy, yi yja, yja yi, zal rzał, zekac rzekać, zi ź, zl azł, z żn, z rz, chłopcowi chłopcu, bratowi bratu, aleji alei, lubieć lubić, nei nie, źmie zmie, piatek piątek, pokuj pokój, poszłem poszedłem, prosze proszę, rząda żąda, sa są, sei się, standart standard, trzcionk czcionk, szłem szedłem, pry przy ================================================ FILE: morfologik-stemming/pom.xml ================================================ 4.0.0 org.carrot2 morfologik-parent 2.2.0-SNAPSHOT ../pom.xml morfologik-stemming bundle Morfologik Stemming APIs Morfologik Stemming APIs. ../etc/forbidden-apis/signatures.txt org.carrot2.morfologik.stemming org.carrot2 morfologik-fsa ${project.version} org.apache.felix maven-bundle-plugin morfologik.stemming * ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/ArrayViewList.java ================================================ package morfologik.stemming; import java.util.*; /** A view over a range of an array. */ @SuppressWarnings("serial") final class ArrayViewList extends AbstractList implements RandomAccess, java.io.Serializable { /** Backing array. 
*/ private E[] a; private int start; private int length; /* * */ ArrayViewList(E[] array, int start, int length) { if (array == null) throw new IllegalArgumentException(); wrap(array, start, length); } /* * */ public int size() { return length; } /* * */ public E get(int index) { return a[start + index]; } /* * */ public E set(int index, E element) { throw new UnsupportedOperationException(); } /* * */ public void add(int index, E element) { throw new UnsupportedOperationException(); } /* * */ public E remove(int index) { throw new UnsupportedOperationException(); } /* * */ public boolean addAll(int index, Collection c) { throw new UnsupportedOperationException(); } /* * */ public int indexOf(Object o) { if (o == null) { for (int i = start; i < start + length; i++) if (a[i] == null) return i - start; } else { for (int i = start; i < start + length; i++) if (o.equals(a[i])) return i - start; } return -1; } public ListIterator listIterator() { return listIterator(0); } /* * */ public ListIterator listIterator(final int index) { return Arrays.asList(a).subList(start, start + length).listIterator(index); } /* * */ public boolean contains(Object o) { return indexOf(o) != -1; } /* * */ void wrap(E[] array, int start, int length) { this.a = array; this.start = start; this.length = length; } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/BufferUtils.java ================================================ package morfologik.stemming; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; import java.util.Arrays; public final class BufferUtils { /** No instances. 
*/ private BufferUtils() { // empty } /** * Ensure the buffer's capacity is large enough to hold a given number of elements. If the input * buffer is not large enough, a new buffer is allocated and returned. * * @param elements The required number of elements to be appended to the buffer. * @param buffer The buffer to check or null if a new buffer should be allocated. * @return Returns the same buffer or a new buffer with the given capacity. */ public static ByteBuffer clearAndEnsureCapacity(ByteBuffer buffer, int elements) { if (buffer == null || buffer.capacity() < elements) { buffer = ByteBuffer.allocate(elements); } else { buffer.clear(); } return buffer; } /** * Ensure the buffer's capacity is large enough to hold a given number of elements. If the input * buffer is not large enough, a new buffer is allocated and returned. * * @param elements The required number of elements to be appended to the buffer. * @param buffer The buffer to check or null if a new buffer should be allocated. * @return Returns the same buffer or a new buffer with the given capacity. */ public static CharBuffer clearAndEnsureCapacity(CharBuffer buffer, int elements) { if (buffer == null || buffer.capacity() < elements) { buffer = CharBuffer.allocate(elements); } else { buffer.clear(); } return buffer; } /** * @param buffer The buffer to convert to a string. * @param charset The charset to use when converting bytes to characters. * @return A string representation of buffer's content. */ public static String toString(ByteBuffer buffer, Charset charset) { buffer = buffer.slice(); byte[] buf = new byte[buffer.remaining()]; buffer.get(buf); return new String(buf, charset); } public static String toString(CharBuffer buffer) { buffer = buffer.slice(); char[] buf = new char[buffer.remaining()]; buffer.get(buf); return new String(buf); } /** * @param buffer The buffer to read from. * @return Returns the remaining bytes from the buffer copied to an array. 
*/ public static byte[] toArray(ByteBuffer buffer) { byte[] dst = new byte[buffer.remaining()]; buffer.mark(); buffer.get(dst); buffer.reset(); return dst; } /** Compute the length of the shared prefix between two byte sequences. */ static int sharedPrefixLength(ByteBuffer a, int aStart, ByteBuffer b, int bStart) { int i = 0; final int max = Math.min(a.remaining() - aStart, b.remaining() - bStart); aStart += a.position(); bStart += b.position(); while (i < max && a.get(aStart++) == b.get(bStart++)) { i++; } return i; } /** Compute the length of the shared prefix between two byte sequences. */ static int sharedPrefixLength(ByteBuffer a, ByteBuffer b) { return sharedPrefixLength(a, 0, b, 0); } /** * Convert byte buffer's content into characters. The input buffer's bytes are not consumed (mark * is set and reset). */ public static CharBuffer bytesToChars( CharsetDecoder decoder, ByteBuffer bytes, CharBuffer chars) { assert decoder.malformedInputAction() == CodingErrorAction.REPORT; chars = clearAndEnsureCapacity(chars, (int) (bytes.remaining() * decoder.maxCharsPerByte())); bytes.mark(); decoder.reset(); CoderResult cr = decoder.decode(bytes, chars, true); if (cr.isError()) { bytes.reset(); try { cr.throwException(); } catch (CharacterCodingException e) { throw new RuntimeException( "Input cannot be mapped to bytes using encoding " + decoder.charset().name() + ": " + Arrays.toString(toArray(bytes)), e); } } assert cr.isUnderflow(); // This should be guaranteed by ensuring max. capacity. cr = decoder.flush(chars); assert cr.isUnderflow(); chars.flip(); bytes.reset(); return chars; } /** Convert chars into bytes. 
*/ public static ByteBuffer charsToBytes(CharsetEncoder encoder, CharBuffer chars, ByteBuffer bytes) throws UnmappableInputException { assert encoder.malformedInputAction() == CodingErrorAction.REPORT; bytes = clearAndEnsureCapacity(bytes, (int) (chars.remaining() * encoder.maxBytesPerChar())); chars.mark(); encoder.reset(); CoderResult cr = encoder.encode(chars, bytes, true); if (cr.isError()) { chars.reset(); try { cr.throwException(); } catch (CharacterCodingException e) { throw new UnmappableInputException( "Input cannot be mapped to characters using encoding " + encoder.charset().name() + ": " + Arrays.toString(toArray(bytes)), e); } } assert cr.isUnderflow(); // This should be guaranteed by ensuring max. capacity. cr = encoder.flush(bytes); assert cr.isUnderflow(); bytes.flip(); chars.reset(); return bytes; } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/Dictionary.java ================================================ package morfologik.stemming; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import morfologik.fsa.FSA; /** * A dictionary combines {@link FSA} automaton and {@link DictionaryMetadata} describing the way * terms are encoded in the automaton. * *

 * <p>A dictionary consists of two files:
 *
 * <ul>
 *   <li>an actual compressed FSA file,</li>
 *   <li>{@link DictionaryMetadata}, describing the way terms are encoded.</li>
 * </ul>
*/ public final class Dictionary { /** {@link FSA} automaton with the compiled dictionary data. */ public final FSA fsa; /** Metadata associated with the dictionary. */ public final DictionaryMetadata metadata; /** * It is strongly recommended to use static methods in this class for reading dictionaries. * * @param fsa An instantiated {@link FSA} instance. * @param metadata A map of attributes describing the compression format and other settings not * contained in the FSA automaton. For an explanation of available attributes and their * possible values, see {@link DictionaryMetadata}. */ public Dictionary(FSA fsa, DictionaryMetadata metadata) { this.fsa = fsa; this.metadata = metadata; } /** * Attempts to load a dictionary using the path to the FSA file and the expected metadata * extension. * * @param location The location of the dictionary file (*.dict). * @return An instantiated dictionary. * @throws IOException if an I/O error occurs. */ public static Dictionary read(Path location) throws IOException { final Path metadata = DictionaryMetadata.getExpectedMetadataLocation(location); try (InputStream fsaStream = Files.newInputStream(location); InputStream metadataStream = Files.newInputStream(metadata)) { return read(fsaStream, metadataStream); } } /** * Attempts to load a dictionary using the URL to the FSA file and the expected metadata * extension. * * @param dictURL The URL pointing to the dictionary file (*.dict). * @return An instantiated dictionary. * @throws IOException if an I/O error occurs. 
*/ public static Dictionary read(URL dictURL) throws IOException { final URL expectedMetadataURL; try { String external = dictURL.toExternalForm(); expectedMetadataURL = new URL(DictionaryMetadata.getExpectedMetadataFileName(external)); } catch (MalformedURLException e) { throw new IOException("Couldn't construct relative feature map URL for: " + dictURL, e); } try (InputStream fsaStream = dictURL.openStream(); InputStream metadataStream = expectedMetadataURL.openStream()) { return read(fsaStream, metadataStream); } } /** * Attempts to load a dictionary from opened streams of FSA dictionary data and associated * metadata. Input streams are not closed automatically. * * @param fsaStream The stream with FSA data * @param metadataStream The stream with metadata * @return Returns an instantiated {@link Dictionary}. * @throws IOException if an I/O error occurs. */ public static Dictionary read(InputStream fsaStream, InputStream metadataStream) throws IOException { return new Dictionary(FSA.read(fsaStream), DictionaryMetadata.read(metadataStream)); } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/DictionaryAttribute.java ================================================ package morfologik.stemming; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; /** Attributes applying to {@link Dictionary} and {@link DictionaryMetadata}. */ public enum DictionaryAttribute { /** Logical fields separator inside the FSA. 
*/ SEPARATOR("fsa.dict.separator") { @Override public Character fromString(String separator) { if (separator == null || separator.length() != 1) { throw new IllegalArgumentException( "Attribute " + propertyName + " must be a single character."); } char charValue = separator.charAt(0); if (Character.isHighSurrogate(charValue) || Character.isLowSurrogate(charValue)) { throw new IllegalArgumentException( "Field separator character cannot be part of a surrogate pair: " + separator); } return charValue; } }, /** Character to byte encoding used for strings inside the FSA. */ ENCODING("fsa.dict.encoding") { @Override public Charset fromString(String charsetName) { return Charset.forName(charsetName); } }, /** If the FSA dictionary includes frequency data. */ FREQUENCY_INCLUDED("fsa.dict.frequency-included") { @Override public Boolean fromString(String value) { return booleanValue(value); } }, /** If the spelling dictionary is supposed to ignore words containing digits */ IGNORE_NUMBERS("fsa.dict.speller.ignore-numbers") { @Override public Boolean fromString(String value) { return booleanValue(value); } }, /** If the spelling dictionary is supposed to ignore punctuation. */ IGNORE_PUNCTUATION("fsa.dict.speller.ignore-punctuation") { @Override public Boolean fromString(String value) { return booleanValue(value); } }, /** If the spelling dictionary is supposed to ignore CamelCase words. */ IGNORE_CAMEL_CASE("fsa.dict.speller.ignore-camel-case") { @Override public Boolean fromString(String value) { return booleanValue(value); } }, /** If the spelling dictionary is supposed to ignore ALL UPPERCASE words. */ IGNORE_ALL_UPPERCASE("fsa.dict.speller.ignore-all-uppercase") { @Override public Boolean fromString(String value) { return booleanValue(value); } }, /** * If the spelling dictionary is supposed to ignore diacritics, so that 'a' would be treated as * equivalent to 'ą'. 
*/ IGNORE_DIACRITICS("fsa.dict.speller.ignore-diacritics") { @Override public Boolean fromString(String value) { return booleanValue(value); } }, /** if the spelling dictionary is supposed to treat upper and lower case as equivalent. */ CONVERT_CASE("fsa.dict.speller.convert-case") { @Override public Boolean fromString(String value) { return booleanValue(value); } }, /** If the spelling dictionary is supposed to split runOnWords. */ RUN_ON_WORDS("fsa.dict.speller.runon-words") { @Override public Boolean fromString(String value) { return booleanValue(value); } }, /** Locale associated with the dictionary. */ LOCALE("fsa.dict.speller.locale") { @Override public Locale fromString(String value) { return new Locale(value); } }, /** Locale associated with the dictionary. */ ENCODER("fsa.dict.encoder") { @Override public EncoderType fromString(String value) { try { return EncoderType.valueOf(value.trim().toUpperCase(Locale.ROOT)); } catch (IllegalArgumentException e) { throw new IllegalArgumentException( "Invalid encoder name '" + value.trim() + "', only these coders are valid: " + Arrays.toString(EncoderType.values())); } } }, /** * Input conversion pairs to replace non-standard characters before search in a speller * dictionary. For example, common ligatures can be replaced here. 
*/ INPUT_CONVERSION("fsa.dict.input-conversion") { @Override public LinkedHashMap fromString(String value) throws IllegalArgumentException { LinkedHashMap conversionPairs = new LinkedHashMap<>(); final String[] replacements = value.split(",\\s*"); for (final String stringPair : replacements) { final String[] twoStrings = stringPair.trim().split(" "); if (twoStrings.length == 2) { if (!conversionPairs.containsKey(twoStrings[0])) { conversionPairs.put(twoStrings[0], twoStrings[1]); } else { throw new IllegalArgumentException( "Input conversion cannot specify different values for the same input string: " + twoStrings[0]); } } else { throw new IllegalArgumentException( "Attribute " + propertyName + " is not in the proper format: " + value); } } return conversionPairs; } }, /** * Output conversion pairs to replace non-standard characters before search in a speller * dictionary. For example, standard characters can be replaced here into ligatures. * *

Useful for dictionaries that do have certain standards imposed. */ OUTPUT_CONVERSION("fsa.dict.output-conversion") { @Override public LinkedHashMap fromString(String value) throws IllegalArgumentException { LinkedHashMap conversionPairs = new LinkedHashMap(); final String[] replacements = value.split(",\\s*"); for (final String stringPair : replacements) { final String[] twoStrings = stringPair.trim().split(" "); if (twoStrings.length == 2) { if (!conversionPairs.containsKey(twoStrings[0])) { conversionPairs.put(twoStrings[0], twoStrings[1]); } else { throw new IllegalArgumentException( "Input conversion cannot specify different values for the same input string: " + twoStrings[0]); } } else { throw new IllegalArgumentException( "Attribute " + propertyName + " is not in the proper format: " + value); } } return conversionPairs; } }, /** * Replacement pairs for non-obvious candidate search in a speller dictionary. For example, Polish * rz is phonetically equivalent to ż, and this may be specified here to * allow looking for replacements of rz with ż and vice versa. 
*/ REPLACEMENT_PAIRS("fsa.dict.speller.replacement-pairs") { @Override public LinkedHashMap> fromString(String value) throws IllegalArgumentException { LinkedHashMap> replacementPairs = new LinkedHashMap<>(); final String[] replacements = value.split(",\\s*"); for (final String stringPair : replacements) { final String[] twoStrings = stringPair.trim().split(" "); if (twoStrings.length == 2) { // _ represents a space (hunspell REP convention) String key = twoStrings[0].replace('_', ' '); String val = twoStrings[1].replace('_', ' '); if (!replacementPairs.containsKey(key)) { List strList = new ArrayList(); strList.add(val); replacementPairs.put(key, strList); } else { replacementPairs.get(key).add(val); } } else { throw new IllegalArgumentException( "Attribute " + propertyName + " is not in the proper format: " + value); } } return replacementPairs; } }, /** * Equivalent characters (treated similarly as equivalent chars with and without diacritics). For * example, Polish ł can be specified as equivalent to l. * *

This implements a feature similar to hunspell MAP in the affix file. */ EQUIVALENT_CHARS("fsa.dict.speller.equivalent-chars") { @Override public LinkedHashMap> fromString(String value) throws IllegalArgumentException { LinkedHashMap> equivalentCharacters = new LinkedHashMap<>(); final String[] eqChars = value.split(",\\s*"); for (final String characterPair : eqChars) { final String[] twoChars = characterPair.trim().split(" "); if (twoChars.length == 2 && twoChars[0].length() == 1 && twoChars[1].length() == 1) { char fromChar = twoChars[0].charAt(0); char toChar = twoChars[1].charAt(0); if (!equivalentCharacters.containsKey(fromChar)) { List chList = new ArrayList(); equivalentCharacters.put(fromChar, chList); } equivalentCharacters.get(fromChar).add(toChar); } else { throw new IllegalArgumentException( "Attribute " + propertyName + " is not in the proper format: " + value); } } return equivalentCharacters; } }, /** Dictionary license attribute. */ LICENSE("fsa.dict.license"), /** Dictionary author. */ AUTHOR("fsa.dict.author"), /** Dictionary creation date. */ CREATION_DATE("fsa.dict.created"); /** Property name for this attribute. */ public final String propertyName; /** * Converts a string to the given attribute's value. * * @param value The value to convert to an attribute value. * @return Returns the attribute's value converted from a string. * @throws IllegalArgumentException If the input string cannot be converted to the attribute's * value. */ public Object fromString(String value) throws IllegalArgumentException { return value; } /** * @param propertyName The property of a {@link DictionaryAttribute}. * @return Return a {@link DictionaryAttribute} associated with a given {@link #propertyName}. 
*/ public static DictionaryAttribute fromPropertyName(String propertyName) { DictionaryAttribute value = attrsByPropertyName.get(propertyName); if (value == null) { throw new IllegalArgumentException("No attribute for property: " + propertyName); } return value; } private static final Map attrsByPropertyName; static { attrsByPropertyName = new HashMap(); for (DictionaryAttribute attr : DictionaryAttribute.values()) { if (attrsByPropertyName.put(attr.propertyName, attr) != null) { throw new RuntimeException("Duplicate property key for: " + attr); } } } /** Private enum instance constructor. */ private DictionaryAttribute(String propertyName) { this.propertyName = propertyName; } private static Boolean booleanValue(String value) { value = value.toLowerCase(Locale.ROOT); if ("true".equals(value) || "yes".equals(value) || "on".equals(value)) { return Boolean.TRUE; } if ("false".equals(value) || "no".equals(value) || "off".equals(value)) { return Boolean.FALSE; } throw new IllegalArgumentException("Not a boolean value: " + value); } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/DictionaryIterator.java ================================================ package morfologik.stemming; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.CharsetDecoder; import java.util.Iterator; /** * An iterator over {@link WordData} entries of a {@link Dictionary}. The stems can be decoded from * compressed format or the compressed form can be preserved. 
*/ public final class DictionaryIterator implements Iterator { private final CharsetDecoder decoder; private final Iterator entriesIter; private final WordData entry; private final byte separator; private final boolean decodeStems; private ByteBuffer inflectedBuffer = ByteBuffer.allocate(0); private CharBuffer inflectedCharBuffer = CharBuffer.allocate(0); private ByteBuffer temp = ByteBuffer.allocate(0); private final ISequenceEncoder sequenceEncoder; public DictionaryIterator(Dictionary dictionary, CharsetDecoder decoder, boolean decodeStems) { this.entriesIter = dictionary.fsa.iterator(); this.separator = dictionary.metadata.getSeparator(); this.sequenceEncoder = dictionary.metadata.getSequenceEncoderType().get(); this.decoder = decoder; this.entry = new WordData(decoder); this.decodeStems = decodeStems; } public boolean hasNext() { return entriesIter.hasNext(); } public WordData next() { final ByteBuffer entryBuffer = entriesIter.next(); /* * Entries are typically: inflectedcodedBasetag so try to find this split. */ byte[] ba = entryBuffer.array(); int bbSize = entryBuffer.remaining(); int sepPos; for (sepPos = 0; sepPos < bbSize; sepPos++) { if (ba[sepPos] == separator) { break; } } if (sepPos == bbSize) { throw new RuntimeException("Invalid dictionary " + "entry format (missing separator)."); } inflectedBuffer = BufferUtils.clearAndEnsureCapacity(inflectedBuffer, sepPos); inflectedBuffer.put(ba, 0, sepPos); inflectedBuffer.flip(); inflectedCharBuffer = BufferUtils.bytesToChars(decoder, inflectedBuffer, inflectedCharBuffer); entry.update(inflectedBuffer, inflectedCharBuffer); temp = BufferUtils.clearAndEnsureCapacity(temp, bbSize - sepPos); sepPos++; temp.put(ba, sepPos, bbSize - sepPos); temp.flip(); ba = temp.array(); bbSize = temp.remaining(); /* * Find the next separator byte's position splitting word form and tag. */ assert sequenceEncoder.prefixBytes() <= bbSize : sequenceEncoder.getClass() + " >? 
" + bbSize; sepPos = sequenceEncoder.prefixBytes(); for (; sepPos < bbSize; sepPos++) { if (ba[sepPos] == separator) break; } /* * Decode the stem into stem buffer. */ if (decodeStems) { entry.stemBuffer = sequenceEncoder.decode(entry.stemBuffer, inflectedBuffer, ByteBuffer.wrap(ba, 0, sepPos)); } else { entry.stemBuffer = BufferUtils.clearAndEnsureCapacity(entry.stemBuffer, sepPos); entry.stemBuffer.put(ba, 0, sepPos); entry.stemBuffer.flip(); } // Skip separator character, if present. if (sepPos + 1 <= bbSize) { sepPos++; } /* * Decode the tag data. */ entry.tagBuffer = BufferUtils.clearAndEnsureCapacity(entry.tagBuffer, bbSize - sepPos); entry.tagBuffer.put(ba, sepPos, bbSize - sepPos); entry.tagBuffer.flip(); return entry; } public void remove() { throw new UnsupportedOperationException(); } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/DictionaryLookup.java ================================================ package morfologik.stemming; import static morfologik.fsa.MatchResult.SEQUENCE_IS_A_PREFIX; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.util.Arrays; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import morfologik.fsa.ByteSequenceIterator; import morfologik.fsa.FSA; import morfologik.fsa.FSATraversal; import morfologik.fsa.MatchResult; /** * This class implements a dictionary lookup of an inflected word over a dictionary previously * compiled using the dict_compile tool. */ public final class DictionaryLookup implements IStemmer, Iterable { /** An FSA used for lookups. */ private final FSATraversal matcher; /** An iterator for walking along the final states of {@link #fsa}. */ private final ByteSequenceIterator finalStatesIterator; /** FSA's root node. */ private final int rootNode; /** Expand buffers and arrays by this constant. 
*/ private static final int EXPAND_SIZE = 10; /** Private internal array of reusable word data objects. */ private WordData[] forms = new WordData[0]; /** A "view" over an array implementing */ private final ArrayViewList formsList = new ArrayViewList(forms, 0, forms.length); /** * Features of the compiled dictionary. * * @see DictionaryMetadata */ private final DictionaryMetadata dictionaryMetadata; /** Charset encoder for the FSA. */ private final CharsetEncoder encoder; /** Charset decoder for the FSA. */ private final CharsetDecoder decoder; /** The FSA we are using. */ private final FSA fsa; /** * @see #getSeparatorChar() */ private final char separatorChar; /** Internal reusable buffer for encoding words into byte arrays using {@link #encoder}. */ private ByteBuffer byteBuffer = ByteBuffer.allocate(0); /** Internal reusable buffer for encoding words into byte arrays using {@link #encoder}. */ private CharBuffer charBuffer = CharBuffer.allocate(0); /** Reusable match result. */ private final MatchResult matchResult = new MatchResult(); /** The {@link Dictionary} this lookup is using. */ private final Dictionary dictionary; private final ISequenceEncoder sequenceEncoder; /** * Creates a new object of this class using the given FSA for word lookups and encoding for * converting characters to bytes. * * @param dictionary The dictionary to use for lookups. * @throws IllegalArgumentException if FSA's root node cannot be acquired (dictionary is empty). 
*/ public DictionaryLookup(Dictionary dictionary) throws IllegalArgumentException { this.dictionary = dictionary; this.dictionaryMetadata = dictionary.metadata; this.sequenceEncoder = dictionary.metadata.getSequenceEncoderType().get(); this.rootNode = dictionary.fsa.getRootNode(); this.fsa = dictionary.fsa; this.matcher = new FSATraversal(fsa); this.finalStatesIterator = new ByteSequenceIterator(fsa, fsa.getRootNode()); if (dictionaryMetadata == null) { throw new IllegalArgumentException("Dictionary metadata must not be null."); } decoder = dictionary.metadata.getDecoder(); encoder = dictionary.metadata.getEncoder(); separatorChar = dictionary.metadata.getSeparatorAsChar(); } /** * Searches the automaton for a symbol sequence equal to word, followed by a * separator. The result is a stem (decompressed accordingly to the dictionary's specification) * and an optional tag data. */ @Override public List lookup(CharSequence word) { final byte separator = dictionaryMetadata.getSeparator(); final int prefixBytes = sequenceEncoder.prefixBytes(); if (!dictionaryMetadata.getInputConversionPairs().isEmpty()) { word = applyReplacements(word, dictionaryMetadata.getInputConversionPairs()); } // Reset the output list to zero length. formsList.wrap(forms, 0, 0); // Encode word characters into bytes in the same encoding as the FSA's. charBuffer = BufferUtils.clearAndEnsureCapacity(charBuffer, word.length()); for (int i = 0; i < word.length(); i++) { char chr = word.charAt(i); if (chr == separatorChar) { // No valid input can contain the separator. return formsList; } charBuffer.put(chr); } charBuffer.flip(); try { byteBuffer = BufferUtils.charsToBytes(encoder, charBuffer, byteBuffer); } catch (UnmappableInputException e) { // This should be a rare occurrence, but if it happens it means there is no way // the dictionary can contain the input word. return formsList; } // Try to find a partial match in the dictionary. 
final MatchResult match = matcher.match(matchResult, byteBuffer.array(), 0, byteBuffer.remaining(), rootNode); if (match.kind == SEQUENCE_IS_A_PREFIX) { /* * The entire sequence exists in the dictionary. A separator should * be the next symbol. */ final int arc = fsa.getArc(match.node, separator); /* * The situation when the arc points to a final node should NEVER * happen. After all, we want the word to have SOME base form. */ if (arc != 0 && !fsa.isArcFinal(arc)) { // There is such a word in the dictionary. Return its base forms. int formsCount = 0; finalStatesIterator.restartFrom(fsa.getEndNode(arc)); while (finalStatesIterator.hasNext()) { final ByteBuffer bb = finalStatesIterator.next(); final byte[] ba = bb.array(); final int bbSize = bb.remaining(); if (formsCount >= forms.length) { forms = Arrays.copyOf(forms, forms.length + EXPAND_SIZE); for (int k = 0; k < forms.length; k++) { if (forms[k] == null) forms[k] = new WordData(decoder); } } /* * Now, expand the prefix/ suffix 'compression' and store * the base form. */ final WordData wordData = forms[formsCount++]; if (dictionaryMetadata.getOutputConversionPairs().isEmpty()) { wordData.update(byteBuffer, word); } else { wordData.update( byteBuffer, applyReplacements(word, dictionaryMetadata.getOutputConversionPairs())); } /* * Find the separator byte's position splitting the inflection instructions * from the tag. */ assert prefixBytes <= bbSize : sequenceEncoder.getClass() + " >? " + bbSize; int sepPos; for (sepPos = prefixBytes; sepPos < bbSize; sepPos++) { if (ba[sepPos] == separator) { break; } } /* * Decode the stem into stem buffer. */ wordData.stemBuffer = sequenceEncoder.decode( wordData.stemBuffer, byteBuffer, ByteBuffer.wrap(ba, 0, sepPos)); // Skip separator character. sepPos++; /* * Decode the tag data. 
*/ final int tagSize = bbSize - sepPos; if (tagSize > 0) { wordData.tagBuffer = BufferUtils.clearAndEnsureCapacity(wordData.tagBuffer, tagSize); wordData.tagBuffer.put(ba, sepPos, tagSize); wordData.tagBuffer.flip(); } } formsList.wrap(forms, 0, formsCount); } } else { /* * this case is somewhat confusing: we should have hit the separator * first... I don't really know how to deal with it at the time * being. */ } return formsList; } /** * Apply partial string replacements from a given map. * *

Useful if the word needs to be normalized somehow (i.e., ligatures, apostrophes and such). * * @param word The word to apply replacements to. * @param replacements A map of replacements (from->to). * @return new string with all replacements applied. */ public static String applyReplacements( CharSequence word, LinkedHashMap replacements) { // quite horrible from performance point of view; this should really be a transducer. StringBuilder sb = new StringBuilder(word); for (final Map.Entry e : replacements.entrySet()) { String key = e.getKey(); int index = sb.indexOf(e.getKey()); while (index != -1) { sb.replace(index, index + key.length(), e.getValue()); index = sb.indexOf(key, index + key.length()); } } return sb.toString(); } /** * Return an iterator over all {@link WordData} entries available in the embedded {@link * Dictionary}. */ @Override public Iterator iterator() { return new DictionaryIterator(dictionary, decoder, true); } /** * @return Return the {@link Dictionary} used by this object. */ public Dictionary getDictionary() { return dictionary; } /** * @return Returns the logical separator character splitting inflected form, lemma correction * token and a tag. Note that this character is a best-effort conversion from a byte in {@link * DictionaryMetadata#separator} and may not be valid in the target encoding (although this is * highly unlikely). 
*/ public char getSeparatorChar() { return separatorChar; } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadata.java ================================================ package morfologik.stemming; import static morfologik.stemming.DictionaryAttribute.*; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Writer; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.CharacterCodingException; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CodingErrorAction; import java.nio.charset.UnsupportedCharsetException; import java.nio.file.Path; import java.util.Collections; import java.util.EnumMap; import java.util.EnumSet; import java.util.Enumeration; import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; /** Description of attributes, their types and default values. */ public final class DictionaryMetadata { /** Default attribute values. */ private static Map DEFAULT_ATTRIBUTES = new DictionaryMetadataBuilder() .frequencyIncluded(false) .ignorePunctuation() .ignoreNumbers() .ignoreCamelCase() .ignoreAllUppercase() .ignoreDiacritics() .convertCase() .supportRunOnWords() .toMap(); /** Required attributes. */ private static EnumSet REQUIRED_ATTRIBUTES = EnumSet.of(SEPARATOR, ENCODER, ENCODING); /** * A separator character between fields (stem, lemma, form). The character must be within byte * range (FSA uses bytes internally). */ private byte separator; private char separatorChar; /** Encoding used for converting bytes to characters and vice versa. */ private String encoding; private Charset charset; private Locale locale = Locale.getDefault(); /** Replacement pairs for non-obvious candidate search in a speller dictionary. 
*/ private LinkedHashMap> replacementPairs = new LinkedHashMap<>(); /** Conversion pairs for input conversion, for example to replace ligatures. */ private LinkedHashMap inputConversion = new LinkedHashMap<>(); /** Conversion pairs for output conversion, for example to replace ligatures. */ private LinkedHashMap outputConversion = new LinkedHashMap<>(); /** * Equivalent characters (treated similarly as equivalent chars with and without diacritics). For * example, Polish ł can be specified as equivalent to l. * *

This implements a feature similar to hunspell MAP in the affix file. */ private LinkedHashMap> equivalentChars = new LinkedHashMap<>(); /** All attributes. */ private final EnumMap attributes; /** All "enabled" boolean attributes. */ private final EnumMap boolAttributes; /** Sequence encoder. */ private EncoderType encoderType; /** Expected metadata file extension. */ public static final String METADATA_FILE_EXTENSION = "info"; /** * @return Return all metadata attributes. */ public Map getAttributes() { return Collections.unmodifiableMap(attributes); } // Cached attrs. public String getEncoding() { return encoding; } public byte getSeparator() { return separator; } public Locale getLocale() { return locale; } public LinkedHashMap getInputConversionPairs() { return inputConversion; } public LinkedHashMap getOutputConversionPairs() { return outputConversion; } public LinkedHashMap> getReplacementPairs() { return replacementPairs; } public LinkedHashMap> getEquivalentChars() { return equivalentChars; } // Dynamically fetched. public boolean isFrequencyIncluded() { return boolAttributes.get(FREQUENCY_INCLUDED); } public boolean isIgnoringPunctuation() { return boolAttributes.get(IGNORE_PUNCTUATION); } public boolean isIgnoringNumbers() { return boolAttributes.get(IGNORE_NUMBERS); } public boolean isIgnoringCamelCase() { return boolAttributes.get(IGNORE_CAMEL_CASE); } public boolean isIgnoringAllUppercase() { return boolAttributes.get(IGNORE_ALL_UPPERCASE); } public boolean isIgnoringDiacritics() { return boolAttributes.get(IGNORE_DIACRITICS); } public boolean isConvertingCase() { return boolAttributes.get(CONVERT_CASE); } public boolean isSupportingRunOnWords() { return boolAttributes.get(RUN_ON_WORDS); } /** * Create an instance from an attribute map. * * @param attrs A set of {@link DictionaryAttribute} keys and their associated values. 
* @see DictionaryMetadataBuilder */ public DictionaryMetadata(Map attrs) { this.boolAttributes = new EnumMap(DictionaryAttribute.class); this.attributes = new EnumMap(DictionaryAttribute.class); this.attributes.putAll(attrs); EnumMap attributeMap = new EnumMap(DEFAULT_ATTRIBUTES); attributeMap.putAll(attrs); // Convert some attrs from the map to local fields for performance reasons. EnumSet requiredAttributes = EnumSet.copyOf(REQUIRED_ATTRIBUTES); for (Map.Entry e : attributeMap.entrySet()) { requiredAttributes.remove(e.getKey()); // Run validation and conversion on all of them. Object value = e.getKey().fromString(e.getValue()); switch (e.getKey()) { case ENCODING: this.encoding = e.getValue(); if (!Charset.isSupported(encoding)) { throw new IllegalArgumentException("Encoding not supported on this JVM: " + encoding); } this.charset = (Charset) value; break; case SEPARATOR: this.separatorChar = (Character) value; break; case LOCALE: this.locale = (Locale) value; break; case ENCODER: this.encoderType = (EncoderType) value; break; case INPUT_CONVERSION: { @SuppressWarnings("unchecked") LinkedHashMap gvalue = (LinkedHashMap) value; this.inputConversion = gvalue; } break; case OUTPUT_CONVERSION: { @SuppressWarnings("unchecked") LinkedHashMap gvalue = (LinkedHashMap) value; this.outputConversion = gvalue; } break; case REPLACEMENT_PAIRS: { @SuppressWarnings("unchecked") LinkedHashMap> gvalue = (LinkedHashMap>) value; this.replacementPairs = gvalue; } break; case EQUIVALENT_CHARS: { @SuppressWarnings("unchecked") LinkedHashMap> gvalue = (LinkedHashMap>) value; this.equivalentChars = gvalue; } break; case IGNORE_PUNCTUATION: case IGNORE_NUMBERS: case IGNORE_CAMEL_CASE: case IGNORE_ALL_UPPERCASE: case IGNORE_DIACRITICS: case CONVERT_CASE: case RUN_ON_WORDS: case FREQUENCY_INCLUDED: this.boolAttributes.put(e.getKey(), (Boolean) value); break; case AUTHOR: case LICENSE: case CREATION_DATE: // Just run validation. 
e.getKey().fromString(e.getValue()); break; default: throw new RuntimeException( "Unexpected code path (attribute should be handled but is not): " + e.getKey()); } } if (!requiredAttributes.isEmpty()) { throw new IllegalArgumentException( "At least one the required attributes was not provided: " + requiredAttributes.toString()); } // Sanity check. CharsetEncoder encoder = getEncoder(); try { ByteBuffer encoded = encoder.encode(CharBuffer.wrap(new char[] {separatorChar})); if (encoded.remaining() > 1) { throw new IllegalArgumentException( "Separator character is not a single byte in encoding " + encoding + ": " + separatorChar); } this.separator = encoded.get(); } catch (CharacterCodingException e) { throw new IllegalArgumentException( "Separator character cannot be converted to a byte in " + encoding + ": " + separatorChar, e); } } /** * @return Returns a new {@link CharsetDecoder} for the {@link #encoding}. */ public CharsetDecoder getDecoder() { try { return charset .newDecoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); } catch (UnsupportedCharsetException e) { throw new RuntimeException("FSA's encoding charset is not supported: " + encoding); } } /** * @return Returns a new {@link CharsetEncoder} for the {@link #encoding}. */ public CharsetEncoder getEncoder() { try { return charset .newEncoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); } catch (UnsupportedCharsetException e) { throw new RuntimeException("FSA's encoding charset is not supported: " + encoding); } } /** * @return Return sequence encoder type. */ public EncoderType getSequenceEncoderType() { return encoderType; } /** * @return Returns the {@link #separator} byte converted to a single char. * @throws RuntimeException if this conversion is for some reason impossible (the byte is a * surrogate pair, FSA's {@link #encoding} is not available). 
*/ public char getSeparatorAsChar() { return separatorChar; } /** * @return A shortcut returning {@link DictionaryMetadataBuilder}. */ public static DictionaryMetadataBuilder builder() { return new DictionaryMetadataBuilder(); } /** * Returns the expected name of the metadata file, based on the name of the dictionary file. The * expected name is resolved by truncating any file extension of name and appending * {@link DictionaryMetadata#METADATA_FILE_EXTENSION}. * * @param dictionaryFile The name of the dictionary (*.dict) file. * @return Returns the expected name of the metadata file. */ public static String getExpectedMetadataFileName(String dictionaryFile) { final int dotIndex = dictionaryFile.lastIndexOf('.'); final String featuresName; if (dotIndex >= 0) { featuresName = dictionaryFile.substring(0, dotIndex) + "." + METADATA_FILE_EXTENSION; } else { featuresName = dictionaryFile + "." + METADATA_FILE_EXTENSION; } return featuresName; } /** * @param dictionary The location of the dictionary file. * @return Returns the expected location of a metadata file. */ public static Path getExpectedMetadataLocation(Path dictionary) { return dictionary.resolveSibling( getExpectedMetadataFileName(dictionary.getFileName().toString())); } /** * Read dictionary metadata from a property file (stream). * * @param metadataStream The stream with metadata. * @return Returns {@link DictionaryMetadata} read from a the stream (property file). * @throws IOException Thrown if an I/O exception occurs. */ public static DictionaryMetadata read(InputStream metadataStream) throws IOException { Map map = new HashMap(); final Properties properties = new Properties(); properties.load(new InputStreamReader(metadataStream, "UTF-8")); // Handle back-compatibility for encoder specification. 
if (!properties.containsKey(DictionaryAttribute.ENCODER.propertyName)) { boolean hasDeprecated = properties.containsKey("fsa.dict.uses-suffixes") || properties.containsKey("fsa.dict.uses-infixes") || properties.containsKey("fsa.dict.uses-prefixes"); boolean usesSuffixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-suffixes", "true")); boolean usesPrefixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-prefixes", "false")); boolean usesInfixes = Boolean.valueOf(properties.getProperty("fsa.dict.uses-infixes", "false")); final EncoderType encoder; if (usesInfixes) { encoder = EncoderType.INFIX; } else if (usesPrefixes) { encoder = EncoderType.PREFIX; } else if (usesSuffixes) { encoder = EncoderType.SUFFIX; } else { encoder = EncoderType.NONE; } if (!hasDeprecated) { throw new IOException( "Use an explicit " + DictionaryAttribute.ENCODER.propertyName + "=" + encoder.name() + " metadata key: "); } throw new IOException( "Deprecated encoder keys in metadata. Use " + DictionaryAttribute.ENCODER.propertyName + "=" + encoder.name()); } for (Enumeration e = properties.propertyNames(); e.hasMoreElements(); ) { String key = (String) e.nextElement(); map.put(DictionaryAttribute.fromPropertyName(key), properties.getProperty(key)); } return new DictionaryMetadata(map); } /** * Write dictionary attributes (metadata). * * @param writer The writer to write to. * @throws IOException Thrown when an I/O error occurs. 
*/ public void write(Writer writer) throws IOException { final Properties properties = new Properties(); for (Map.Entry e : getAttributes().entrySet()) { properties.setProperty(e.getKey().propertyName, e.getValue()); } properties.store(writer, "# " + getClass().getName()); } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/DictionaryMetadataBuilder.java ================================================ package morfologik.stemming; import java.nio.charset.Charset; import java.util.EnumMap; import java.util.List; import java.util.Locale; import java.util.Map; /** Helper class to build {@link DictionaryMetadata} instances. */ public final class DictionaryMetadataBuilder { private final EnumMap attrs = new EnumMap<>(DictionaryAttribute.class); public DictionaryMetadataBuilder separator(char c) { this.attrs.put(DictionaryAttribute.SEPARATOR, Character.toString(c)); return this; } public DictionaryMetadataBuilder encoding(Charset charset) { return encoding(charset.name()); } public DictionaryMetadataBuilder encoding(String charsetName) { this.attrs.put(DictionaryAttribute.ENCODING, charsetName); return this; } public DictionaryMetadataBuilder frequencyIncluded() { return frequencyIncluded(true); } public DictionaryMetadataBuilder frequencyIncluded(boolean v) { this.attrs.put(DictionaryAttribute.FREQUENCY_INCLUDED, Boolean.valueOf(v).toString()); return this; } public DictionaryMetadataBuilder ignorePunctuation() { return ignorePunctuation(true); } public DictionaryMetadataBuilder ignorePunctuation(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_PUNCTUATION, Boolean.valueOf(v).toString()); return this; } public DictionaryMetadataBuilder ignoreNumbers() { return ignoreNumbers(true); } public DictionaryMetadataBuilder ignoreNumbers(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_NUMBERS, Boolean.valueOf(v).toString()); return this; } public DictionaryMetadataBuilder ignoreCamelCase() { return 
ignoreCamelCase(true); } public DictionaryMetadataBuilder ignoreCamelCase(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_CAMEL_CASE, Boolean.valueOf(v).toString()); return this; } public DictionaryMetadataBuilder ignoreAllUppercase() { return ignoreAllUppercase(true); } public DictionaryMetadataBuilder ignoreAllUppercase(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_ALL_UPPERCASE, Boolean.valueOf(v).toString()); return this; } public DictionaryMetadataBuilder ignoreDiacritics() { return ignoreDiacritics(true); } public DictionaryMetadataBuilder ignoreDiacritics(boolean v) { this.attrs.put(DictionaryAttribute.IGNORE_DIACRITICS, Boolean.valueOf(v).toString()); return this; } public DictionaryMetadataBuilder convertCase() { return convertCase(true); } public DictionaryMetadataBuilder convertCase(boolean v) { this.attrs.put(DictionaryAttribute.CONVERT_CASE, Boolean.valueOf(v).toString()); return this; } public DictionaryMetadataBuilder supportRunOnWords() { return supportRunOnWords(true); } public DictionaryMetadataBuilder supportRunOnWords(boolean v) { this.attrs.put(DictionaryAttribute.RUN_ON_WORDS, Boolean.valueOf(v).toString()); return this; } public DictionaryMetadataBuilder encoder(EncoderType type) { this.attrs.put(DictionaryAttribute.ENCODER, type.name()); return this; } public DictionaryMetadataBuilder locale(Locale locale) { return locale(locale.toString()); } public DictionaryMetadataBuilder locale(String localeName) { this.attrs.put(DictionaryAttribute.LOCALE, localeName); return this; } public DictionaryMetadataBuilder withReplacementPairs( Map> replacementPairs) { StringBuilder builder = new StringBuilder(); for (Map.Entry> e : replacementPairs.entrySet()) { String k = e.getKey(); for (String v : e.getValue()) { if (builder.length() > 0) builder.append(", "); builder.append(k).append(" ").append(v); } } this.attrs.put(DictionaryAttribute.REPLACEMENT_PAIRS, builder.toString()); return this; } public DictionaryMetadataBuilder 
withEquivalentChars( Map> equivalentChars) { StringBuilder builder = new StringBuilder(); for (Map.Entry> e : equivalentChars.entrySet()) { Character k = e.getKey(); for (Character v : e.getValue()) { if (builder.length() > 0) builder.append(", "); builder.append(k).append(" ").append(v); } } this.attrs.put(DictionaryAttribute.EQUIVALENT_CHARS, builder.toString()); return this; } public DictionaryMetadataBuilder withInputConversionPairs(Map conversionPairs) { StringBuilder builder = new StringBuilder(); for (Map.Entry e : conversionPairs.entrySet()) { String k = e.getKey(); if (builder.length() > 0) builder.append(", "); builder.append(k).append(" ").append(conversionPairs.get(k)); } this.attrs.put(DictionaryAttribute.INPUT_CONVERSION, builder.toString()); return this; } public DictionaryMetadataBuilder withOutputConversionPairs(Map conversionPairs) { StringBuilder builder = new StringBuilder(); for (Map.Entry e : conversionPairs.entrySet()) { String k = e.getKey(); if (builder.length() > 0) builder.append(", "); builder.append(k).append(" ").append(conversionPairs.get(k)); } this.attrs.put(DictionaryAttribute.OUTPUT_CONVERSION, builder.toString()); return this; } public DictionaryMetadataBuilder author(String author) { this.attrs.put(DictionaryAttribute.AUTHOR, author); return this; } public DictionaryMetadataBuilder creationDate(String creationDate) { this.attrs.put(DictionaryAttribute.CREATION_DATE, creationDate); return this; } public DictionaryMetadataBuilder license(String license) { this.attrs.put(DictionaryAttribute.LICENSE, license); return this; } public DictionaryMetadata build() { return new DictionaryMetadata(attrs); } public EnumMap toMap() { return new EnumMap<>(attrs); } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/EncoderType.java ================================================ package morfologik.stemming; /** Known {@link ISequenceEncoder}s. 
*/ public enum EncoderType { SUFFIX { @Override public ISequenceEncoder get() { return new TrimSuffixEncoder(); } }, PREFIX { @Override public ISequenceEncoder get() { return new TrimPrefixAndSuffixEncoder(); } }, INFIX { @Override public ISequenceEncoder get() { return new TrimInfixAndSuffixEncoder(); } }, NONE { @Override public ISequenceEncoder get() { return new NoEncoder(); } }; public abstract ISequenceEncoder get(); } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/ISequenceEncoder.java ================================================ package morfologik.stemming; import java.nio.ByteBuffer; /** * The logic of encoding one sequence of bytes relative to another sequence of bytes. The "base" * form and the "derived" form are typically the stem of a word and the inflected form of a word. * *

Derived form encoding helps in making the data for the automaton smaller and more repetitive * (which results in higher compression rates). * *

See example implementation for details. */ public interface ISequenceEncoder { /** * Encodes target relative to source, optionally reusing the provided * {@link ByteBuffer}. * * @param reuse Reuses the provided {@link ByteBuffer} or allocates a new one if there is not * enough remaining space. * @param source The source byte sequence. * @param target The target byte sequence to encode relative to source * @return Returns the {@link ByteBuffer} with encoded target. */ public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target); /** * Decodes encoded relative to source, optionally reusing the provided * {@link ByteBuffer}. * * @param reuse Reuses the provided {@link ByteBuffer} or allocates a new one if there is not * enough remaining space. * @param source The source byte sequence. * @param encoded The {@linkplain #encode previously encoded} byte sequence. * @return Returns the {@link ByteBuffer} with decoded target. */ public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded); /** * The number of encoded form's prefix bytes that should be ignored (needed for separator lookup). * An ugly workaround for GH-85, should be fixed by prior knowledge of whether the dictionary * contains tags; then we can scan for separator right-to-left. * * @see "https://github.com/morfologik/morfologik-stemming/issues/85" */ @Deprecated public int prefixBytes(); } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/IStemmer.java ================================================ package morfologik.stemming; import java.util.List; /** A generic "stemmer" interface in Morfologik. */ public interface IStemmer { /** * Returns a list of {@link WordData} entries for a given word. The returned list is never * null. Depending on the stemmer's implementation the {@link WordData} may carry the stem * and additional information (tag) or just the stem. * *

The returned list and any object it contains are not usable after a subsequent call to this * method. Any data that should be stored in between must be copied by the caller. * * @param word The word (typically inflected) to look up base forms for. * @return A list of {@link WordData} entries (possibly empty). */ public List lookup(CharSequence word); } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/NoEncoder.java ================================================ package morfologik.stemming; import java.nio.ByteBuffer; /** No relative encoding at all (full target form is returned). */ public class NoEncoder implements ISequenceEncoder { @Override public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target) { reuse = BufferUtils.clearAndEnsureCapacity(reuse, target.remaining()); target.mark(); reuse.put(target).flip(); target.reset(); return reuse; } @Override public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) { reuse = BufferUtils.clearAndEnsureCapacity(reuse, encoded.remaining()); encoded.mark(); reuse.put(encoded).flip(); encoded.reset(); return reuse; } @Override public int prefixBytes() { return 0; } @Override public String toString() { return getClass().getSimpleName(); } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/TrimInfixAndSuffixEncoder.java ================================================ package morfologik.stemming; import java.nio.ByteBuffer; /** * Encodes dst relative to src by trimming whatever non-equal suffix and * infix src and dst have. The output code is (bytes): * *

 * {X}{L}{K}{suffix}
 * 
* * where src's infix at position (X - 'A') and of length (L - * 'A') should be removed, then (K - 'A') bytes should be trimmed from the end and then * the suffix should be appended to the resulting byte sequence. * *

Examples: * *

 * src: ayz
 * dst: abc
 * encoded: AACbc
 *
 * src: aillent
 * dst: aller
 * encoded: BBCr
 * 
*/ public class TrimInfixAndSuffixEncoder implements ISequenceEncoder { /** Maximum encodable single-byte code. */ private static final int REMOVE_EVERYTHING = 255; private ByteBuffer scratch = ByteBuffer.allocate(0); public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target) { assert source.hasArray() && source.position() == 0 && source.arrayOffset() == 0; assert target.hasArray() && target.position() == 0 && target.arrayOffset() == 0; // Search for the infix that can we can encode and remove from src // to get a maximum-length prefix of dst. This could be done more efficiently // by running a smarter longest-common-subsequence algorithm and some pruning (?). // // For now, naive loop should do. // There can be only two positions for the infix to delete: // 1) we remove leading bytes, even if they are partially matching (but a longer match // exists somewhere later on). // 2) we leave max. matching prefix and remove non-matching bytes that follow. int maxInfixIndex = 0; int maxSubsequenceLength = BufferUtils.sharedPrefixLength(source, target); int maxInfixLength = 0; for (int i : new int[] {0, maxSubsequenceLength}) { for (int j = 1; j <= source.remaining() - i; j++) { // Compute temporary src with the infix removed. // Concatenate in scratch space for simplicity. final int len2 = source.remaining() - (i + j); scratch = BufferUtils.clearAndEnsureCapacity(scratch, i + len2); scratch.put(source.array(), 0, i); scratch.put(source.array(), i + j, len2); scratch.flip(); int sharedPrefix = BufferUtils.sharedPrefixLength(scratch, target); // Only update maxSubsequenceLength if we will be able to encode it. 
if (sharedPrefix > 0 && sharedPrefix > maxSubsequenceLength && i < REMOVE_EVERYTHING && j < REMOVE_EVERYTHING) { maxSubsequenceLength = sharedPrefix; maxInfixIndex = i; maxInfixLength = j; } } } int truncateSuffixBytes = source.remaining() - (maxInfixLength + maxSubsequenceLength); // Special case: if we're removing the suffix in the infix code, move it // to the suffix code instead. if (truncateSuffixBytes == 0 && maxInfixIndex + maxInfixLength == source.remaining()) { truncateSuffixBytes = maxInfixLength; maxInfixIndex = maxInfixLength = 0; } if (maxInfixIndex >= REMOVE_EVERYTHING || maxInfixLength >= REMOVE_EVERYTHING || truncateSuffixBytes >= REMOVE_EVERYTHING) { maxInfixIndex = maxSubsequenceLength = 0; maxInfixLength = truncateSuffixBytes = REMOVE_EVERYTHING; } final int len1 = target.remaining() - maxSubsequenceLength; reuse = BufferUtils.clearAndEnsureCapacity(reuse, 3 + len1); reuse.put((byte) ((maxInfixIndex + 'A') & 0xFF)); reuse.put((byte) ((maxInfixLength + 'A') & 0xFF)); reuse.put((byte) ((truncateSuffixBytes + 'A') & 0xFF)); reuse.put(target.array(), maxSubsequenceLength, len1); reuse.flip(); return reuse; } @Override public int prefixBytes() { return 3; } public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) { assert encoded.remaining() >= 3; final int p = encoded.position(); int infixIndex = (encoded.get(p) - 'A') & 0xFF; int infixLength = (encoded.get(p + 1) - 'A') & 0xFF; int truncateSuffixBytes = (encoded.get(p + 2) - 'A') & 0xFF; if (infixLength == REMOVE_EVERYTHING || truncateSuffixBytes == REMOVE_EVERYTHING) { infixIndex = 0; infixLength = source.remaining(); truncateSuffixBytes = 0; } final int len1 = source.remaining() - (infixIndex + infixLength + truncateSuffixBytes); final int len2 = encoded.remaining() - 3; reuse = BufferUtils.clearAndEnsureCapacity(reuse, infixIndex + len1 + len2); assert encoded.hasArray() && encoded.position() == 0 && encoded.arrayOffset() == 0; assert source.hasArray() && 
source.position() == 0 && source.arrayOffset() == 0; reuse.put(source.array(), 0, infixIndex); reuse.put(source.array(), infixIndex + infixLength, len1); reuse.put(encoded.array(), 3, len2); reuse.flip(); return reuse; } @Override public String toString() { return getClass().getSimpleName(); } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/TrimPrefixAndSuffixEncoder.java ================================================ package morfologik.stemming; import java.nio.ByteBuffer; /** * Encodes dst relative to src by trimming whatever non-equal suffix and * prefix src and dst have. The output code is (bytes): * *
 * {P}{K}{suffix}
 * 
* * where (P - 'A') bytes should be trimmed from the start of src, (K * - 'A') bytes should be trimmed from the end of src and then the suffix * should be appended to the resulting byte sequence. * *

Examples: * *

 * src: abc
 * dst: abcd
 * encoded: AAd
 *
 * src: abc
 * dst: xyz
 * encoded: ADxyz
 * 
*/ public class TrimPrefixAndSuffixEncoder implements ISequenceEncoder { /** Maximum encodable single-byte code. */ private static final int REMOVE_EVERYTHING = 255; public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target) { // Search for the maximum matching subsequence that can be encoded. int maxSubsequenceLength = 0; int maxSubsequenceIndex = 0; for (int i = 0; i < source.remaining(); i++) { // prefix at i => shared subsequence (infix) int sharedPrefix = BufferUtils.sharedPrefixLength(source, i, target, 0); // Only update maxSubsequenceLength if we will be able to encode it. if (sharedPrefix > maxSubsequenceLength && i < REMOVE_EVERYTHING && (source.remaining() - (i + sharedPrefix)) < REMOVE_EVERYTHING) { maxSubsequenceLength = sharedPrefix; maxSubsequenceIndex = i; } } // Determine how much to remove (and where) from src to get a prefix of dst. int truncatePrefixBytes = maxSubsequenceIndex; int truncateSuffixBytes = (source.remaining() - (maxSubsequenceIndex + maxSubsequenceLength)); if (truncatePrefixBytes >= REMOVE_EVERYTHING || truncateSuffixBytes >= REMOVE_EVERYTHING) { maxSubsequenceIndex = maxSubsequenceLength = 0; truncatePrefixBytes = truncateSuffixBytes = REMOVE_EVERYTHING; } final int len1 = target.remaining() - maxSubsequenceLength; reuse = BufferUtils.clearAndEnsureCapacity(reuse, 2 + len1); assert target.hasArray() && target.position() == 0 && target.arrayOffset() == 0; reuse.put((byte) ((truncatePrefixBytes + 'A') & 0xFF)); reuse.put((byte) ((truncateSuffixBytes + 'A') & 0xFF)); reuse.put(target.array(), maxSubsequenceLength, len1); reuse.flip(); return reuse; } @Override public int prefixBytes() { return 2; } public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) { assert encoded.remaining() >= 2; final int p = encoded.position(); int truncatePrefixBytes = (encoded.get(p) - 'A') & 0xFF; int truncateSuffixBytes = (encoded.get(p + 1) - 'A') & 0xFF; if (truncatePrefixBytes == REMOVE_EVERYTHING || 
truncateSuffixBytes == REMOVE_EVERYTHING) { truncatePrefixBytes = source.remaining(); truncateSuffixBytes = 0; } assert source.hasArray() && source.position() == 0 && source.arrayOffset() == 0; assert encoded.hasArray() && encoded.position() == 0 && encoded.arrayOffset() == 0; final int len1 = source.remaining() - (truncateSuffixBytes + truncatePrefixBytes); final int len2 = encoded.remaining() - 2; reuse = BufferUtils.clearAndEnsureCapacity(reuse, len1 + len2); reuse.put(source.array(), truncatePrefixBytes, len1); reuse.put(encoded.array(), 2, len2); reuse.flip(); return reuse; } @Override public String toString() { return getClass().getSimpleName(); } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/TrimSuffixEncoder.java ================================================ package morfologik.stemming; import java.nio.ByteBuffer; /** * Encodes dst relative to src by trimming whatever non-equal suffix * src has. The output code is (bytes): * *
 * {K}{suffix}
 * 
* * where (K - 'A') bytes should be trimmed from the end of src and then * the suffix should be appended to the resulting byte sequence. * *

Examples: * *

 * src: foo
 * dst: foobar
 * encoded: Abar
 *
 * src: foo
 * dst: bar
 * encoded: Dbar
 * 
*/ public class TrimSuffixEncoder implements ISequenceEncoder { /** Maximum encodable single-byte code. */ private static final int REMOVE_EVERYTHING = 255; public ByteBuffer encode(ByteBuffer reuse, ByteBuffer source, ByteBuffer target) { int sharedPrefix = BufferUtils.sharedPrefixLength(source, target); int truncateBytes = source.remaining() - sharedPrefix; if (truncateBytes >= REMOVE_EVERYTHING) { truncateBytes = REMOVE_EVERYTHING; sharedPrefix = 0; } reuse = BufferUtils.clearAndEnsureCapacity(reuse, 1 + target.remaining() - sharedPrefix); assert target.hasArray() && target.position() == 0 && target.arrayOffset() == 0; final byte suffixTrimCode = (byte) (truncateBytes + 'A'); reuse .put(suffixTrimCode) .put(target.array(), sharedPrefix, target.remaining() - sharedPrefix) .flip(); return reuse; } @Override public int prefixBytes() { return 1; } public ByteBuffer decode(ByteBuffer reuse, ByteBuffer source, ByteBuffer encoded) { assert encoded.remaining() >= 1; int suffixTrimCode = encoded.get(encoded.position()); int truncateBytes = (suffixTrimCode - 'A') & 0xFF; if (truncateBytes == REMOVE_EVERYTHING) { truncateBytes = source.remaining(); } final int len1 = source.remaining() - truncateBytes; final int len2 = encoded.remaining() - 1; reuse = BufferUtils.clearAndEnsureCapacity(reuse, len1 + len2); assert source.hasArray() && source.position() == 0 && source.arrayOffset() == 0; assert encoded.hasArray() && encoded.position() == 0 && encoded.arrayOffset() == 0; reuse.put(source.array(), 0, len1).put(encoded.array(), 1, len2).flip(); return reuse; } @Override public String toString() { return getClass().getSimpleName(); } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/UnmappableInputException.java ================================================ package morfologik.stemming; import java.nio.charset.CharacterCodingException; /** * Thrown when some input cannot be mapped using the declared charset (bytes to 
characters or the * other way around). */ @SuppressWarnings("serial") public final class UnmappableInputException extends Exception { UnmappableInputException(String message, CharacterCodingException cause) { super(message, cause); } } ================================================ FILE: morfologik-stemming/src/main/java/morfologik/stemming/WordData.java ================================================ package morfologik.stemming; import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.*; /** * Stem and tag data associated with a given word. * *

Instances of this class are reused and mutable (values returned from {@link #getStem()}, * {@link #getWord()} and other related methods change on subsequent calls to {@link * DictionaryLookup} class that returned a given instance of {@link WordData}. * *

If you need a copy of the stem or tag data for a given word, you have to create a custom * buffer yourself and copy the associated data, perform {@link #clone()} or create strings (they * are immutable) using {@link #getStem()} and then {@link CharSequence#toString()}. * *

For reasons above it makes no sense to use instances of this class in associative containers * or lists. In fact, both {@link #equals(Object)} and {@link #hashCode()} are overridden and throw * exceptions to prevent accidental damage. */ public final class WordData implements Cloneable { /** Error information if somebody puts us in a Java collection. */ private static final String COLLECTIONS_ERROR_MESSAGE = "Not suitable for use" + " in Java collections framework (volatile content). Refer to documentation."; /** Character encoding in internal buffers. */ private final CharsetDecoder decoder; /** Inflected word form data. */ private CharSequence wordCharSequence; /** Character sequence after converting {@link #stemBuffer} using {@link #decoder}. */ private CharBuffer stemCharSequence; /** Character sequence after converting {@link #tagBuffer} using {@link #decoder}. */ private CharBuffer tagCharSequence; /** Byte buffer holding the inflected word form data. */ ByteBuffer wordBuffer; /** Byte buffer holding stem data. */ ByteBuffer stemBuffer; /** Byte buffer holding tag data. */ ByteBuffer tagBuffer; /** Package scope constructor. */ WordData(CharsetDecoder decoder) { this.decoder = decoder; stemBuffer = ByteBuffer.allocate(0); tagBuffer = ByteBuffer.allocate(0); stemCharSequence = CharBuffer.allocate(0); tagCharSequence = CharBuffer.allocate(0); } /** A constructor for tests only. */ WordData(String stem, String tag, String encoding) { this(Charset.forName(encoding).newDecoder()); try { if (stem != null) stemBuffer.put(stem.getBytes(encoding)); if (tag != null) tagBuffer.put(tag.getBytes(encoding)); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } } /** * Copy the stem's binary data (no charset decoding) to a custom byte buffer. * *

The buffer is cleared prior to copying and flipped for reading upon returning from this * method. If the buffer is null or not large enough to hold the result, a new buffer is * allocated. * * @param target Target byte buffer to copy the stem buffer to or null if a new * buffer should be allocated. * @return Returns target or the new reallocated buffer. */ public ByteBuffer getStemBytes(ByteBuffer target) { target = BufferUtils.clearAndEnsureCapacity(target, stemBuffer.remaining()); stemBuffer.mark(); target.put(stemBuffer); stemBuffer.reset(); target.flip(); return target; } /** * Copy the tag's binary data (no charset decoding) to a custom byte buffer. * *

The buffer is cleared prior to copying and flipped for reading upon returning from this * method. If the buffer is null or not large enough to hold the result, a new buffer is * allocated. * * @param target Target byte buffer to copy the tag buffer to or null if a new buffer * should be allocated. * @return Returns target or the new reallocated buffer. */ public ByteBuffer getTagBytes(ByteBuffer target) { target = BufferUtils.clearAndEnsureCapacity(target, tagBuffer.remaining()); tagBuffer.mark(); target.put(tagBuffer); tagBuffer.reset(); target.flip(); return target; } /** * Copy the inflected word's binary data (no charset decoding) to a custom byte buffer. * *

The buffer is cleared prior to copying and flipped for reading upon returning from this * method. If the buffer is null or not large enough to hold the result, a new buffer is * allocated. * * @param target Target byte buffer to copy the word buffer to or null if a new * buffer should be allocated. * @return Returns target or the new reallocated buffer. */ public ByteBuffer getWordBytes(ByteBuffer target) { target = BufferUtils.clearAndEnsureCapacity(target, wordBuffer.remaining()); wordBuffer.mark(); target.put(wordBuffer); wordBuffer.reset(); target.flip(); return target; } /** * @return Return tag data decoded to a character sequence or null if no associated * tag data exists. */ public CharSequence getTag() { tagCharSequence = BufferUtils.bytesToChars(decoder, tagBuffer, tagCharSequence); return tagCharSequence.remaining() == 0 ? null : tagCharSequence; } /** * @return Return stem data decoded to a character sequence or null if no associated * stem data exists. */ public CharSequence getStem() { stemCharSequence = BufferUtils.bytesToChars(decoder, stemBuffer, stemCharSequence); return stemCharSequence.remaining() == 0 ? null : stemCharSequence; } /** * @return Return inflected word form data. Usually the parameter passed to {@link * DictionaryLookup#lookup(CharSequence)}. */ public CharSequence getWord() { return wordCharSequence; } /* * */ @Override public boolean equals(Object obj) { throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE); } /* * */ @Override public int hashCode() { throw new UnsupportedOperationException(COLLECTIONS_ERROR_MESSAGE); } @Override public String toString() { return "WordData[" + this.getWord() + "," + this.getStem() + "," + this.getTag() + "]"; } /** * Declare a covariant of {@link Object#clone()} that returns a deep copy of this object. The * content of all internal buffers is copied. 
*/ @Override public WordData clone() { final WordData clone = new WordData(this.decoder); clone.wordCharSequence = cloneCharSequence(wordCharSequence); clone.wordBuffer = getWordBytes(null); clone.stemBuffer = getStemBytes(null); clone.tagBuffer = getTagBytes(null); return clone; } /** Clone char sequences only if not immutable. */ private CharSequence cloneCharSequence(CharSequence chs) { if (chs instanceof String) return chs; return chs.toString(); } void update(ByteBuffer wordBuffer, CharSequence word) { this.stemCharSequence.clear(); this.tagCharSequence.clear(); this.stemBuffer.clear(); this.tagBuffer.clear(); this.wordBuffer = wordBuffer; this.wordCharSequence = word; } } ================================================ FILE: morfologik-stemming/src/test/java/morfologik/stemming/DictionaryLookupTest.java ================================================ package morfologik.stemming; import static org.assertj.core.api.Assertions.*; import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.net.URL; import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; import java.util.LinkedHashMap; import morfologik.fsa.FSA; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.Test; public class DictionaryLookupTest { @Test public void testApplyReplacements() { LinkedHashMap conversion = new LinkedHashMap<>(); conversion.put("'", "`"); conversion.put("fi", "fi"); conversion.put("\\a", "ą"); conversion.put("Barack", "George"); conversion.put("_", "xx"); assertEquals("filut", DictionaryLookup.applyReplacements("filut", conversion)); assertEquals("fizdrygałką", DictionaryLookup.applyReplacements("fizdrygałk\\a", conversion)); assertEquals("George Bush", DictionaryLookup.applyReplacements("Barack Bush", conversion)); assertEquals("xxxxxxxx", DictionaryLookup.applyReplacements("____", conversion)); } @Test public void testRemovedEncoderProperties() throws IOException { final URL url = 
this.getClass().getResource("test-removed-props.dict"); try { new DictionaryLookup(Dictionary.read(url)); Assertions.fail(); } catch (IOException e) { assertThat(e).hasMessageContaining(DictionaryAttribute.ENCODER.propertyName); } } @Test public void testPrefixDictionaries() throws IOException { final URL url = this.getClass().getResource("test-prefix.dict"); final IStemmer s = new DictionaryLookup(Dictionary.read(url)); assertArrayEquals(new String[] {"Rzeczpospolita", "subst:irreg"}, stem(s, "Rzeczypospolitej")); assertArrayEquals(new String[] {"Rzeczpospolita", "subst:irreg"}, stem(s, "Rzecząpospolitą")); // This word is not in the dictionary. assertNoStemFor(s, "martygalski"); } @Test public void testInputConversion() throws IOException { final URL url = this.getClass().getResource("test-prefix.dict"); final IStemmer s = new DictionaryLookup(Dictionary.read(url)); assertArrayEquals( new String[] {"Rzeczpospolita", "subst:irreg"}, stem(s, "Rzecz\\apospolit\\a")); assertArrayEquals( new String[] {"Rzeczpospolita", "subst:irreg"}, stem(s, "krowa\\apospolit\\a")); } /* */ @Test public void testInfixDictionaries() throws IOException { final URL url = this.getClass().getResource("test-infix.dict"); final IStemmer s = new DictionaryLookup(Dictionary.read(url)); Assertions.assertThat(stem(s, "Rzeczypospolitej")) .containsExactly("Rzeczpospolita", "subst:irreg"); Assertions.assertThat(stem(s, "Rzeczyccy")).containsExactly("Rzeczycki", "adj:pl:nom:m"); Assertions.assertThat(stem(s, "Rzecząpospolitą")) .containsExactly("Rzeczpospolita", "subst:irreg"); // This word is not in the dictionary. assertNoStemFor(s, "martygalski"); // This word uses characters that are outside of the encoding range of the dictionary. 
assertNoStemFor(s, "Rzeczyckiõh"); } /* */ @Test public void testWordDataIterator() throws IOException { final URL url = this.getClass().getResource("test-infix.dict"); final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url)); final HashSet entries = new HashSet(); for (WordData wd : s) { entries.add(wd.getWord() + " " + wd.getStem() + " " + wd.getTag()); } // Make sure a sample of the entries is present. Assertions.assertThat(entries) .contains( "Rzekunia Rzekuń subst:sg:gen:m", "Rzeczkowskie Rzeczkowski adj:sg:nom.acc.voc:n+adj:pl:acc.nom.voc:f.n", "Rzecząpospolitą Rzeczpospolita subst:irreg", "Rzeczypospolita Rzeczpospolita subst:irreg", "Rzeczypospolitych Rzeczpospolita subst:irreg", "Rzeczyckiej Rzeczycki adj:sg:gen.dat.loc:f"); } /* */ @Test public void testWordDataCloning() throws IOException { final URL url = this.getClass().getResource("test-infix.dict"); final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url)); ArrayList words = new ArrayList(); for (WordData wd : s) { WordData clone = wd.clone(); words.add(clone); } // Reiterate and verify that we have the same entries. final DictionaryLookup s2 = new DictionaryLookup(Dictionary.read(url)); int i = 0; for (WordData wd : s2) { WordData clone = words.get(i++); assertEqualSequences(clone.getStem(), wd.getStem()); assertEqualSequences(clone.getTag(), wd.getTag()); assertEqualSequences(clone.getWord(), wd.getWord()); } // Check collections contract. final HashSet entries = new HashSet(); try { entries.add(words.get(0)); Assertions.fail(); } catch (RuntimeException e) { // Expected. 
} } private void assertEqualSequences(CharSequence s1, CharSequence s2) { assertEquals(s1.toString(), s2.toString()); } /* */ @Test public void testMultibyteEncodingUTF8() throws IOException { final URL url = this.getClass().getResource("test-diacritics-utf8.dict"); Dictionary read = Dictionary.read(url); final IStemmer s = new DictionaryLookup(read); assertArrayEquals(new String[] {"merge", "001"}, stem(s, "mergeam")); assertArrayEquals(new String[] {"merge", "002"}, stem(s, "merseserăm")); } /* */ @Test public void testSynthesis() throws IOException { final URL url = this.getClass().getResource("test-synth.dict"); final IStemmer s = new DictionaryLookup(Dictionary.read(url)); assertArrayEquals(new String[] {"miała", null}, stem(s, "mieć|verb:praet:sg:ter:f:?perf")); assertArrayEquals(new String[] {"a", null}, stem(s, "a|conj")); assertArrayEquals(new String[] {}, stem(s, "dziecko|subst:sg:dat:n")); // This word is not in the dictionary. assertNoStemFor(s, "martygalski"); } /* */ @Test public void testInputWithSeparators() throws IOException { final URL url = this.getClass().getResource("test-separators.dict"); final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url)); /* * Attemp to reconstruct input sequences using WordData iterator. 
*/ ArrayList sequences = new ArrayList(); for (WordData wd : s) { sequences.add("" + wd.getWord() + " " + wd.getStem() + " " + wd.getTag()); } Collections.sort(sequences); assertEquals("token1 null null", sequences.get(0)); assertEquals("token2 null null", sequences.get(1)); assertEquals("token3 null +", sequences.get(2)); assertEquals("token4 token2 null", sequences.get(3)); assertEquals("token5 token2 null", sequences.get(4)); assertEquals("token6 token2 +", sequences.get(5)); assertEquals("token7 token2 token3+", sequences.get(6)); assertEquals("token8 token2 token3++", sequences.get(7)); } /* */ @Test public void testSeparatorInLookupTerm() throws IOException { FSA fsa = FSA.read(getClass().getResourceAsStream("test-separator-in-lookup.fsa")); DictionaryMetadata metadata = new DictionaryMetadataBuilder() .separator('+') .encoding("iso8859-1") .encoder(EncoderType.INFIX) .build(); final DictionaryLookup s = new DictionaryLookup(new Dictionary(fsa, metadata)); assertEquals(0, s.lookup("l+A").size()); } /* */ @Test public void testGetSeparator() throws IOException { final URL url = this.getClass().getResource("test-separators.dict"); final DictionaryLookup s = new DictionaryLookup(Dictionary.read(url)); assertEquals('+', s.getSeparatorChar()); } /* */ public static String asString(CharSequence s) { if (s == null) return null; return s.toString(); } /* */ public static String[] stem(IStemmer s, String word) { ArrayList result = new ArrayList(); for (WordData wd : s.lookup(word)) { result.add(asString(wd.getStem())); result.add(asString(wd.getTag())); } return result.toArray(new String[result.size()]); } /* */ public static void assertNoStemFor(IStemmer s, String word) { assertArrayEquals(new String[] {}, stem(s, word)); } } ================================================ FILE: morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataBuilderTest.java ================================================ package morfologik.stemming; import 
java.io.IOException; import java.nio.charset.Charset; import java.util.Collections; import java.util.EnumSet; import java.util.List; import java.util.Locale; import java.util.Set; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.Test; public class DictionaryMetadataBuilderTest { @Test public void testAllConstantsHaveBuilderMethods() throws IOException { Set keySet = new DictionaryMetadataBuilder() .convertCase() .encoding(Charset.defaultCharset()) .encoding("UTF-8") .frequencyIncluded() .ignoreAllUppercase() .ignoreCamelCase() .ignoreDiacritics() .ignoreNumbers() .ignorePunctuation() .separator('+') .supportRunOnWords() .encoder(EncoderType.SUFFIX) .withEquivalentChars(Collections.>emptyMap()) .withReplacementPairs(Collections.>emptyMap()) .withInputConversionPairs(Collections.emptyMap()) .withOutputConversionPairs(Collections.emptyMap()) .locale(Locale.getDefault()) .license("") .author("") .creationDate("") .toMap() .keySet(); Set all = EnumSet.allOf(DictionaryAttribute.class); all.removeAll(keySet); Assertions.assertThat(all).isEmpty(); } } ================================================ FILE: morfologik-stemming/src/test/java/morfologik/stemming/DictionaryMetadataTest.java ================================================ package morfologik.stemming; import com.carrotsearch.randomizedtesting.jupiter.Randomized; import com.carrotsearch.randomizedtesting.jupiter.RandomizedTest; import com.carrotsearch.randomizedtesting.jupiter.generators.RandomPicks; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.StringWriter; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Random; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.Test; @Randomized public class DictionaryMetadataTest extends RandomizedTest { @Test public void testEscapeSeparator() throws IOException { DictionaryMetadata m = 
DictionaryMetadata.read(getClass().getResourceAsStream("escape-separator.info")); Assertions.assertThat(m.getSeparator()).isEqualTo((byte) '\t'); } @Test public void testUnicodeSeparator() throws IOException { DictionaryMetadata m = DictionaryMetadata.read(getClass().getResourceAsStream("unicode-separator.info")); Assertions.assertThat(m.getSeparator()).isEqualTo((byte) '\t'); } @Test public void testWriteMetadata(Random rnd) throws IOException { StringWriter sw = new StringWriter(); EncoderType encoder = RandomPicks.randomFrom(rnd, EncoderType.values()); Charset encoding = RandomPicks.randomFrom( rnd, Arrays.asList( StandardCharsets.UTF_8, StandardCharsets.ISO_8859_1, StandardCharsets.US_ASCII)); DictionaryMetadata.builder() .encoding(encoding) .encoder(encoder) .separator('|') .build() .write(sw); DictionaryMetadata other = DictionaryMetadata.read( new ByteArrayInputStream(sw.toString().getBytes(StandardCharsets.UTF_8))); Assertions.assertThat(other.getSeparator()).isEqualTo((byte) '|'); Assertions.assertThat(other.getDecoder().charset()).isEqualTo(encoding); Assertions.assertThat(other.getEncoder().charset()).isEqualTo(encoding); Assertions.assertThat(other.getSequenceEncoderType()).isEqualTo(encoder); } } ================================================ FILE: morfologik-stemming/src/test/java/morfologik/stemming/DictionaryTest.java ================================================ package morfologik.stemming; import static org.junit.jupiter.api.Assertions.*; import com.carrotsearch.randomizedtesting.jupiter.Randomized; import com.carrotsearch.randomizedtesting.jupiter.RandomizedTest; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @Randomized public class DictionaryTest extends RandomizedTest { @Test public void testReadFromFile(@TempDir Path tempDir) throws IOException { Path dict = tempDir.resolve("odd name.dict"); Path 
info = dict.resolveSibling("odd name.info"); try (InputStream dictInput = this.getClass().getResource("test-infix.dict").openStream(); InputStream infoInput = this.getClass().getResource("test-infix.info").openStream()) { Files.copy(dictInput, dict); Files.copy(infoInput, info); } assertNotNull(Dictionary.read(dict.toUri().toURL())); assertNotNull(Dictionary.read(dict)); } } ================================================ FILE: morfologik-stemming/src/test/java/morfologik/stemming/EncodersTest.java ================================================ package morfologik.stemming; import com.carrotsearch.randomizedtesting.jupiter.Randomized; import com.carrotsearch.randomizedtesting.jupiter.RandomizedTest; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.Test; @Randomized public class EncodersTest extends RandomizedTest { @Test public void testSharedPrefix() throws IOException { Assertions.assertThat( BufferUtils.sharedPrefixLength( ByteBuffer.wrap(b("abcdef")), ByteBuffer.wrap(b("abcd__")))) .isEqualTo(4); Assertions.assertThat( BufferUtils.sharedPrefixLength(ByteBuffer.wrap(b("")), ByteBuffer.wrap(b("_")))) .isEqualTo(0); Assertions.assertThat( BufferUtils.sharedPrefixLength( ByteBuffer.wrap(b("abcdef"), 2, 2), ByteBuffer.wrap(b("___cd__"), 3, 2))) .isEqualTo(2); } private static byte[] b(String arg) { byte[] bytes = arg.getBytes(StandardCharsets.UTF_8); Assertions.assertThat(bytes).hasSize(arg.length()); return bytes; } } ================================================ FILE: morfologik-stemming/src/test/java/morfologik/stemming/SequenceEncodersTest.java ================================================ package morfologik.stemming; import com.carrotsearch.randomizedtesting.jupiter.Randomized; import com.carrotsearch.randomizedtesting.jupiter.RandomizedTest; import com.carrotsearch.randomizedtesting.jupiter.generators.RandomStrings; import 
java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.Random; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedClass; import org.junit.jupiter.params.provider.EnumSource; @Randomized @ParameterizedClass @EnumSource(EncoderType.class) public class SequenceEncodersTest extends RandomizedTest { private final ISequenceEncoder coder; public SequenceEncodersTest(EncoderType coderType) { this.coder = coderType.get(); } @Test public void testEncodeSuffixOnRandomSequences(Random rnd) { for (int i = 0; i < 10000; i++) { assertRoundtripEncode( rnd, RandomStrings.randomAsciiLettersOfLengthBetween(rnd, 0, 500), RandomStrings.randomAsciiLettersOfLengthBetween(rnd, 0, 500)); } } @Test public void testEncodeSamples(Random rnd) { assertRoundtripEncode(rnd, "", ""); assertRoundtripEncode(rnd, "abc", "ab"); assertRoundtripEncode(rnd, "abc", "abx"); assertRoundtripEncode(rnd, "ab", "abc"); assertRoundtripEncode(rnd, "xabc", "abc"); assertRoundtripEncode(rnd, "axbc", "abc"); assertRoundtripEncode(rnd, "axybc", "abc"); assertRoundtripEncode(rnd, "axybc", "abc"); assertRoundtripEncode(rnd, "azbc", "abcxy"); assertRoundtripEncode(rnd, "Niemcami", "Niemiec"); assertRoundtripEncode(rnd, "Niemiec", "Niemcami"); } private void assertRoundtripEncode(Random rnd, String srcString, String dstString) { ByteBuffer source = ByteBuffer.wrap(srcString.getBytes(StandardCharsets.UTF_8)); ByteBuffer target = ByteBuffer.wrap(dstString.getBytes(StandardCharsets.UTF_8)); ByteBuffer encoded = coder.encode(ByteBuffer.allocate(rnd.nextInt(30)), source, target); ByteBuffer decoded = coder.decode(ByteBuffer.allocate(rnd.nextInt(30)), source, encoded); if (!decoded.equals(target)) { System.out.println("src: " + BufferUtils.toString(source, StandardCharsets.UTF_8)); System.out.println("dst: " + BufferUtils.toString(target, StandardCharsets.UTF_8)); System.out.println("enc: " + BufferUtils.toString(encoded, 
StandardCharsets.UTF_8)); System.out.println("dec: " + BufferUtils.toString(decoded, StandardCharsets.UTF_8)); Assertions.fail("Mismatch."); } } } ================================================ FILE: morfologik-stemming/src/test/resources/morfologik/stemming/escape-separator.info ================================================ # # An escape sequence for the separator. # fsa.dict.separator=\t fsa.dict.encoding=UTF-8 fsa.dict.encoder=suffix ================================================ FILE: morfologik-stemming/src/test/resources/morfologik/stemming/test-diacritics-utf8.info ================================================ # # Dictionary properties. # fsa.dict.separator=+ fsa.dict.encoding=UTF-8 fsa.dict.encoder=suffix ================================================ FILE: morfologik-stemming/src/test/resources/morfologik/stemming/test-infix.info ================================================ # # Dictionary properties. # fsa.dict.separator=+ fsa.dict.encoding=iso-8859-2 fsa.dict.encoder=infix ================================================ FILE: morfologik-stemming/src/test/resources/morfologik/stemming/test-prefix.info ================================================ # # Dictionary properties. # fsa.dict.separator=+ fsa.dict.encoding=iso-8859-2 fsa.dict.encoder=prefix fsa.dict.input-conversion=\\a ą, krowa Rzecz ================================================ FILE: morfologik-stemming/src/test/resources/morfologik/stemming/test-removed-props.info ================================================ # # Dictionary properties. 
# fsa.dict.separator=+ fsa.dict.encoding=iso-8859-2 fsa.dict.uses-infixes=true ================================================ FILE: morfologik-stemming/src/test/resources/morfologik/stemming/test-separator-in-lookup.in ================================================ l+A+LW l+A+NN1d ================================================ FILE: morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.info ================================================ # # Dictionary properties. # fsa.dict.separator=+ fsa.dict.encoding=iso8859-1 fsa.dict.encoder=none ================================================ FILE: morfologik-stemming/src/test/resources/morfologik/stemming/test-separators.txt ================================================ token1+ token2++ token3+++ token4+token2 token5+token2+ token6+token2++ token7+token2+token3+ token8+token2+token3++ ================================================ FILE: morfologik-stemming/src/test/resources/morfologik/stemming/test-synth.info ================================================ # # Dictionary properties. # fsa.dict.separator=+ fsa.dict.encoding=iso-8859-2 fsa.dict.encoder=suffix ================================================ FILE: morfologik-stemming/src/test/resources/morfologik/stemming/unicode-separator.info ================================================ # # An escape sequence for the separator. 
# fsa.dict.separator=\u0009 fsa.dict.encoding=UTF-8 fsa.dict.encoder=suffix ================================================ FILE: morfologik-tools/pom.xml ================================================ 4.0.0 org.carrot2 morfologik-parent 2.2.0-SNAPSHOT ../pom.xml morfologik-tools jar Morfologik Command Line Tools Morfologik Command Line Tools ../etc/forbidden-apis/signatures.txt org.carrot2.morfologik.tools org.carrot2 morfologik-fsa ${project.version} org.carrot2 morfologik-fsa-builders ${project.version} org.carrot2 morfologik-stemming ${project.version} com.beust jcommander 1.78 org.apache.maven.plugins maven-jar-plugin morfologik.tools.Launcher true maven-assembly-plugin package-zip package single zip src/main/assembly/package.xml false true ${project.artifactId}-${project.version} package-dir package single dir src/main/assembly/package.xml false false ${project.artifactId}-${project.version} de.thetaphi forbiddenapis ${version.forbiddenapis} forbidden-apis jdk-unsafe jdk-deprecated ================================================ FILE: morfologik-tools/src/main/assembly/package.xml ================================================ package true /lib true true true src/main/package false . **/*.txt src/main/package true . 
**/*.txt unix ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/BinaryInput.java ================================================ package morfologik.tools; import com.beust.jcommander.Parameter; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.List; final class BinaryInput { private static final String ARG_ACCEPT_BOM = "--accept-bom"; private static final String ARG_ACCEPT_CR = "--accept-cr"; private static final String ARG_IGNORE_EMPTY = "--ignore-empty"; private static interface LineConsumer { byte[] process(byte[] buffer, int length); } @Parameter( names = BinaryInput.ARG_ACCEPT_BOM, arity = 0, description = "Accept leading BOM bytes (UTF-8).") private boolean acceptBom; @Parameter( names = BinaryInput.ARG_ACCEPT_CR, arity = 0, description = "Accept CR bytes in input sequences (\\r).") private boolean acceptCr; @Parameter( names = BinaryInput.ARG_IGNORE_EMPTY, arity = 0, description = "Ignore empty lines in the input.") private boolean ignoreEmpty; BinaryInput() {} public BinaryInput(boolean acceptBom, boolean acceptCr, boolean ignoreEmpty) { this.acceptBom = acceptBom; this.acceptCr = acceptCr; this.ignoreEmpty = ignoreEmpty; } List readBinarySequences(Path input, byte separator) throws IOException { final List sequences = new ArrayList<>(); try (InputStream is = new BufferedInputStream(Files.newInputStream(input))) { if (!acceptBom) { is.mark(4); if (is.read() == 0xef && is.read() == 0xbb && is.read() == 0xbf) { throw new ExitStatusException( ExitStatus.ERROR_OTHER, "The input starts with UTF-8 BOM bytes which is most likely not what you want. 
Use" + " header-less UTF-8 file or override with %s.", ARG_ACCEPT_BOM); } is.reset(); } forAllLines( is, separator, new LineConsumer() { @Override public byte[] process(byte[] buffer, int length) { if (!acceptCr && hasCr(buffer, length)) { throw new ExitStatusException( ExitStatus.ERROR_OTHER, "The input contains \\r byte (CR) which would be encoded as part of the" + " automaton. If this is desired, use %s.", ARG_ACCEPT_CR); } if (length == 0) { if (!ignoreEmpty) { throw new ExitStatusException( ExitStatus.ERROR_OTHER, "The input contains empty sequences." + " If these can be ignored, use --ignore-empty."); } } else { sequences.add(Arrays.copyOf(buffer, length)); } return buffer; } }); } return sequences; } private static boolean hasCr(byte[] seq, int length) { for (int o = length; --o >= 0; ) { if (seq[o] == '\r') { return true; } } return false; } /** Read all byte-separated sequences. */ private static int forAllLines(InputStream is, byte separator, LineConsumer lineConsumer) throws IOException { int lines = 0; byte[] buffer = new byte[0]; int b, pos = 0; while ((b = is.read()) != -1) { if (b == separator) { buffer = lineConsumer.process(buffer, pos); pos = 0; lines++; } else { if (pos >= buffer.length) { buffer = java.util.Arrays.copyOf(buffer, buffer.length + Math.max(10, buffer.length / 10)); } buffer[pos++] = (byte) b; } } if (pos > 0) { lineConsumer.process(buffer, pos); lines++; } return lines; } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/CliTool.java ================================================ package morfologik.tools; import com.beust.jcommander.JCommander; import com.beust.jcommander.MissingCommandException; import com.beust.jcommander.Parameter; import com.beust.jcommander.ParameterException; import com.beust.jcommander.Parameters; import java.io.PrintStream; import java.util.List; import java.util.Locale; import java.util.concurrent.Callable; /** Base class for command-line 
applications. */ public abstract class CliTool implements Callable { protected static final String ARG_OVERWRITE = "--overwrite"; protected static final String ARG_VALIDATE = "--validate"; @Parameter( names = {"--exit"}, hidden = true, arity = 1, description = "Call System.exit() at the end of command processing.") private boolean callSystemExit = true; @Parameter( names = {"-h", "--help"}, help = true, hidden = true, description = "Help about options and switches.") private boolean help; public CliTool() { if (!getClass().isAnnotationPresent(Parameters.class)) { throw new RuntimeException(); } } /** * Call {@link System#exit(int)} at the end of command processing. * * @param flag Call {@link System#exit(int)} if true. */ public void setCallSystemExit(boolean flag) { this.callSystemExit = flag; } /** * Parse and execute one of the commands. * * @param args Command line arguments (command and options). * @param commands A list of commands. */ protected static void main(String[] args, CliTool... 
commands) { if (commands.length == 1) { main(args, commands[0]); } else { JCommander jc = new JCommander(); for (CliTool command : commands) { jc.addCommand(command); } jc.addConverterFactory(new CustomParameterConverters()); jc.setProgramName(""); ExitStatus exitStatus = ExitStatus.SUCCESS; try { jc.parse(args); final String commandName = jc.getParsedCommand(); if (commandName == null) { helpDisplayCommandOptions(System.err, jc); } else { List objects = jc.getCommands().get(commandName).getObjects(); if (objects.size() != 1) { throw new RuntimeException(); } CliTool command = CliTool.class.cast(objects.get(0)); exitStatus = command.call(); if (command.callSystemExit) { System.exit(exitStatus.code); } } } catch (ExitStatusException e) { System.err.println(e.getMessage()); if (e.getCause() != null) { e.getCause().printStackTrace(System.err); } exitStatus = e.exitStatus; } catch (MissingCommandException e) { System.err.println("Invalid argument: " + e); System.err.println(); helpDisplayCommandOptions(System.err, jc); exitStatus = ExitStatus.ERROR_INVALID_ARGUMENTS; } catch (ParameterException e) { System.err.println("Invalid argument: " + e.getMessage()); System.err.println(); if (jc.getParsedCommand() == null) { helpDisplayCommandOptions(System.err, jc); } else { helpDisplayCommandOptions(System.err, jc.getParsedCommand(), jc); } exitStatus = ExitStatus.ERROR_INVALID_ARGUMENTS; } catch (Throwable t) { System.err.println("An unhandled exception occurred. Stack trace below."); t.printStackTrace(System.err); exitStatus = ExitStatus.ERROR_OTHER; } } } /** * Parse and execute a single command. * * @param args Command line arguments (command and options). * @param command The command to execute. 
*/ protected static void main(String[] args, CliTool command) { JCommander jc = new JCommander(command); jc.addConverterFactory(new CustomParameterConverters()); jc.setProgramName(command.getClass().getAnnotation(Parameters.class).commandNames()[0]); ExitStatus exitStatus = ExitStatus.SUCCESS; try { jc.parse(args); if (command.help) { helpDisplayCommandOptions(System.err, jc); } else { exitStatus = command.call(); } } catch (ExitStatusException e) { System.err.println(e.getMessage()); if (e.getCause() != null) { e.getCause().printStackTrace(System.err); } exitStatus = e.exitStatus; } catch (MissingCommandException e) { System.err.println("Invalid argument: " + e); System.err.println(); helpDisplayCommandOptions(System.err, jc); exitStatus = ExitStatus.ERROR_INVALID_ARGUMENTS; } catch (ParameterException e) { System.err.println("Invalid argument: " + e.getMessage()); System.err.println(); if (jc.getParsedCommand() == null) { helpDisplayCommandOptions(System.err, jc); } else { helpDisplayCommandOptions(System.err, jc.getParsedCommand(), jc); } exitStatus = ExitStatus.ERROR_INVALID_ARGUMENTS; } catch (Throwable t) { System.err.println("An unhandled exception occurred. Stack trace below."); t.printStackTrace(System.err); exitStatus = ExitStatus.ERROR_OTHER; } if (command.callSystemExit) { System.exit(exitStatus.code); } } protected static void printf(String msg, Object... 
args) { System.out.println(String.format(Locale.ROOT, msg, args)); } protected static T checkNotNull(T arg) { if (arg == null) { throw new IllegalArgumentException("Argument must not be null."); } return arg; } private static void helpDisplayCommandOptions(PrintStream pw, String command, JCommander jc) { StringBuilder sb = new StringBuilder(); jc = jc.getCommands().get(command); jc.getUsageFormatter().usage(sb, ""); pw.print(sb); } private static void helpDisplayCommandOptions(PrintStream pw, JCommander jc) { StringBuilder sb = new StringBuilder(); jc.getUsageFormatter().usage(sb, ""); pw.print(sb); } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/CustomParameterConverters.java ================================================ package morfologik.tools; import com.beust.jcommander.IStringConverter; import com.beust.jcommander.IStringConverterFactory; import java.nio.file.Path; import java.nio.file.Paths; class CustomParameterConverters implements IStringConverterFactory { public static class PathConverter implements IStringConverter { @Override public Path convert(String value) { return Paths.get(value); } } @Override public Class> getConverter(Class forType) { if (forType.equals(Path.class)) { return PathConverter.class; } return null; } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/DictApply.java ================================================ package morfologik.tools; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import java.io.BufferedInputStream; import java.io.BufferedReader; import java.io.Closeable; import java.io.Console; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; import morfologik.stemming.Dictionary; import morfologik.stemming.DictionaryLookup; import morfologik.stemming.WordData; 
/** Applies a morphological dictionary automaton to the input. */ @Parameters( commandNames = "dict_apply", commandDescription = "Applies a dictionary to an input. Each line is considered an input term.") public class DictApply extends CliTool { private static final String ARG_ENCODING = "--input-charset"; @Parameter( names = {"-i", "--input"}, required = false, description = "The input file, each entry in a single line. If not provided, stdin is used.", validateValueWith = ValidateFileExists.class) private Path input; @Parameter( names = {"-d", "--dictionary"}, description = "The dictionary (*.dict and a sibling *.info metadata) to apply.", required = true, validateValueWith = ValidateFileExists.class) private Path dictionary; @Parameter( names = {ARG_ENCODING}, required = false, description = "Character encoding of the input (platform's default).") private String inputEncoding; @Parameter( names = {"--skip-tags"}, required = false, description = "Skip tags in the output, only print base forms if found.") private boolean skipTags = false; private abstract class LineSupplier implements Closeable { public abstract String nextLine() throws IOException; @Override public void close() throws IOException { // No-op by default. 
} } private class ReaderLineSupplier extends LineSupplier { private final BufferedReader lineReader; public ReaderLineSupplier(BufferedReader reader) { this.lineReader = reader; } @Override public String nextLine() throws IOException { return lineReader.readLine(); } @Override public void close() throws IOException { lineReader.close(); } } DictApply() {} public DictApply(Path dictionary, Path input, String inputEncoding) { this.input = checkNotNull(input); this.dictionary = checkNotNull(dictionary); } @Override public ExitStatus call() throws Exception { ExitStatus exitStatus = validateArguments(); if (exitStatus != null) { return exitStatus; } final DictionaryLookup lookup = new DictionaryLookup(Dictionary.read(this.dictionary)); try (final LineSupplier input = determineInput()) { String line; while ((line = input.nextLine()) != null) { if (line.length() == 0) { continue; } List wordData = lookup.lookup(line); if (wordData.isEmpty()) { System.out.println(line + " => [not found]"); } else { for (WordData wd : wordData) { CharSequence stem = wd.getStem(); CharSequence tag = wd.getTag(); System.out.println( line + " => " + ((skipTags || tag == null) ? stem : stem + " " + tag)); } } } } return ExitStatus.SUCCESS; } private LineSupplier determineInput() throws IOException { if (this.input != null) { return new ReaderLineSupplier( Files.newBufferedReader(this.input, Charset.forName(inputEncoding))); } final Console c = System.console(); if (c != null) { System.err.println( "NOTE: Using Console for input, character encoding is unknown but should be all right."); return new LineSupplier() { @Override public String nextLine() throws IOException { return c.readLine(); } }; } Charset charset = this.inputEncoding != null ? 
Charset.forName(this.inputEncoding) : Charset.defaultCharset(); System.err.println( "NOTE: Using stdin for input, character encoding set to: " + charset.name() + " (use " + ARG_ENCODING + " to override)."); return new ReaderLineSupplier( new BufferedReader(new InputStreamReader(new BufferedInputStream(System.in), charset))); } private ExitStatus validateArguments() { if (this.input != null) { if (this.inputEncoding == null) { System.err.println("Input encoding is required if file input is used."); return ExitStatus.ERROR_INVALID_ARGUMENTS; } } else { if (System.console() != null && this.inputEncoding != null) { System.err.println("Input encoding is only valid with file input or stdin redirection."); return ExitStatus.ERROR_INVALID_ARGUMENTS; } } return null; } public static void main(String[] args) { main(args, new DictApply()); } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/DictCompile.java ================================================ package morfologik.tools; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import com.beust.jcommander.ParametersDelegate; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.InputStream; import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.charset.CharsetDecoder; import java.nio.charset.CodingErrorAction; import java.nio.file.Files; import java.nio.file.Path; import java.util.Collections; import java.util.Iterator; import java.util.List; import morfologik.fsa.FSA; import morfologik.fsa.builders.FSABuilder; import morfologik.fsa.builders.FSASerializer; import morfologik.stemming.BufferUtils; import morfologik.stemming.Dictionary; import morfologik.stemming.DictionaryLookup; import morfologik.stemming.DictionaryMetadata; import morfologik.stemming.ISequenceEncoder; /** Decompiles morphological dictionary automaton back to source state. 
*/ @Parameters( commandNames = "dict_compile", commandDescription = "Compiles a morphological dictionary automaton.") public class DictCompile extends CliTool { @Parameter( names = {"-i", "--input"}, description = "The input file (base,inflected,tag). An associated metadata (*.info) file must exist.", required = true, validateValueWith = ValidateFileExists.class) private Path input; @Parameter( names = ARG_VALIDATE, arity = 1, description = "Validate input to make sure it makes sense.") private boolean validate = true; @Parameter( names = {"-f", "--format"}, description = "Automaton serialization format.") private SerializationFormat format = SerializationFormat.FSA5; @Parameter(names = ARG_OVERWRITE, description = "Overwrite the output file if it exists.") private boolean overwrite; @ParametersDelegate private final BinaryInput binaryInput; DictCompile() { binaryInput = new BinaryInput(); } public DictCompile( Path input, boolean overwrite, boolean validate, boolean acceptBom, boolean acceptCr, boolean ignoreEmpty) { this.input = checkNotNull(input); this.overwrite = overwrite; this.validate = validate; this.binaryInput = new BinaryInput(acceptBom, acceptCr, ignoreEmpty); } @Override public ExitStatus call() throws Exception { final Path metadataPath = DictionaryMetadata.getExpectedMetadataLocation(input); if (!Files.isRegularFile(metadataPath)) { System.err.println("Dictionary metadata file for the input does not exist: " + metadataPath); System.err.println( "The metadata file (with at least the column separator and byte encoding) " + "is required. Check out the examples."); return ExitStatus.ERROR_OTHER; } final Path output = metadataPath.resolveSibling( metadataPath .getFileName() .toString() .replaceAll("\\." 
+ DictionaryMetadata.METADATA_FILE_EXTENSION + "$", ".dict")); if (!overwrite && Files.exists(output)) { throw new ExitStatusException( ExitStatus.ERROR_CONFIRMATION_REQUIRED, "Output dictionary file already exists: %s, use %s to override.", output, ARG_OVERWRITE); } final DictionaryMetadata metadata; try (InputStream is = new BufferedInputStream(Files.newInputStream(metadataPath))) { metadata = DictionaryMetadata.read(is); } final List sequences = binaryInput.readBinarySequences(input, (byte) '\n'); final CharsetDecoder charsetDecoder = metadata .getDecoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); final byte separator = metadata.getSeparator(); final ISequenceEncoder sequenceEncoder = metadata.getSequenceEncoderType().get(); if (!sequences.isEmpty()) { Iterator i = sequences.iterator(); byte[] row = i.next(); final int separatorCount = countOf(separator, row); if (separatorCount < 1 || separatorCount > 2) { throw new ExitStatusException( ExitStatus.ERROR_OTHER, "Invalid input. Each row must consist of [base,inflected,tag?] columns, where ',' is a" + " separator character (declared as: %s). This row contains %d separator" + " characters: %s", Character.isJavaIdentifierPart(metadata.getSeparatorAsChar()) ? 
"'" + Character.toString(metadata.getSeparatorAsChar()) + "'" : "0x" + Integer.toHexString((int) separator & 0xff), separatorCount, new String(row, charsetDecoder.charset())); } while (i.hasNext()) { row = i.next(); int count = countOf(separator, row); if (count != separatorCount) { throw new ExitStatusException( ExitStatus.ERROR_OTHER, "The number of separators (%d) is inconsistent with previous lines: %s", count, new String(row, charsetDecoder.charset())); } } } ByteBuffer encoded = ByteBuffer.allocate(0); ByteBuffer source = ByteBuffer.allocate(0); ByteBuffer target = ByteBuffer.allocate(0); ByteBuffer tag = ByteBuffer.allocate(0); ByteBuffer assembled = ByteBuffer.allocate(0); for (int i = 0, max = sequences.size(); i < max; i++) { byte[] row = sequences.get(i); int sep1 = indexOf(separator, row, 0); int sep2 = indexOf(separator, row, sep1 + 1); if (sep2 < 0) { sep2 = row.length; } source = BufferUtils.clearAndEnsureCapacity(source, sep1); source.put(row, 0, sep1); source.flip(); final int len = sep2 - (sep1 + 1); target = BufferUtils.clearAndEnsureCapacity(target, len); target.put(row, sep1 + 1, len); target.flip(); final int len2 = row.length - (sep2 + 1); tag = BufferUtils.clearAndEnsureCapacity(tag, len2); if (len2 > 0) { tag.put(row, sep2 + 1, len2); } tag.flip(); encoded = sequenceEncoder.encode(encoded, target, source); assembled = BufferUtils.clearAndEnsureCapacity( assembled, target.remaining() + 1 + encoded.remaining() + 1 + tag.remaining()); assembled.put(target); assembled.put(separator); assembled.put(encoded); if (tag.hasRemaining()) { assembled.put(separator); assembled.put(tag); } assembled.flip(); sequences.set(i, BufferUtils.toArray(assembled)); } Collections.sort(sequences, FSABuilder.LEXICAL_ORDERING); FSA fsa = FSABuilder.build(sequences); FSASerializer serializer = format.getSerializer(); try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(output))) { serializer.serialize(fsa, os); } // If validating, try to scan the 
input if (validate) { DictionaryLookup dictionaryLookup = new DictionaryLookup(new Dictionary(fsa, metadata)); for (Iterator i = dictionaryLookup.iterator(); i.hasNext(); i.next()) { // Do nothing, just scan and make sure no exceptions are thrown. } } return ExitStatus.SUCCESS; } private static int countOf(byte separator, byte[] row) { int cnt = 0; for (int i = row.length; --i >= 0; ) { if (row[i] == separator) { cnt++; } } return cnt; } private static int indexOf(byte separator, byte[] row, int fromIndex) { while (fromIndex < row.length) { if (row[fromIndex] == separator) { return fromIndex; } fromIndex++; } return -1; } public static void main(String[] args) { main(args, new DictCompile()); } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/DictDecompile.java ================================================ package morfologik.tools; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import java.io.BufferedOutputStream; import java.io.IOException; import java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import morfologik.stemming.Dictionary; import morfologik.stemming.DictionaryLookup; import morfologik.stemming.WordData; /** Decompiles morphological dictionary automaton back to source state. 
*/ @Parameters( commandNames = "dict_decompile", commandDescription = "Decompiles morphological dictionary automaton back to source state.") public class DictDecompile extends CliTool { @Parameter( names = {"-i", "--input"}, description = "The input dictionary (*.dict and a sibling *.info metadata).", required = true, validateValueWith = ValidateFileExists.class) private Path input; @Parameter( names = {"-o", "--output"}, description = "The output file for dictionary data.") private Path output; @Parameter(names = ARG_OVERWRITE, description = "Overwrite the output file if it exists.") private boolean overwrite; @Parameter( names = ARG_VALIDATE, arity = 1, description = "Validate decoded output to make sure it can be re-encoded.") private boolean validate = true; DictDecompile() {} public DictDecompile(Path input, Path output, boolean overwrite, boolean validate) { this.input = checkNotNull(input); this.output = output; this.overwrite = overwrite; this.validate = validate; } @Override public ExitStatus call() throws Exception { final Dictionary dictionary = Dictionary.read(input); final DictionaryLookup lookup = new DictionaryLookup(dictionary); if (output == null) { output = input.resolveSibling( input.getFileName().toString().replaceAll("\\.dict$", "") + ".input"); if (Files.exists(output) && !overwrite) { System.err.println( "ERROR: the default output file location already exists. 
Use --overwrite or remove" + " the file manually: " + output.toString()); return ExitStatus.ERROR_CONFIRMATION_REQUIRED; } } final byte separator = dictionary.metadata.getSeparator(); ByteBuffer stem = ByteBuffer.allocate(0); ByteBuffer word = ByteBuffer.allocate(0); ByteBuffer tag = ByteBuffer.allocate(0); try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(output))) { boolean hasTags = false; for (WordData wd : lookup) { tag = wd.getTagBytes(tag); if (tag.hasRemaining()) { hasTags = true; break; } } for (WordData wd : lookup) { stem = wd.getStemBytes(stem); word = wd.getWordBytes(word); tag = wd.getTagBytes(tag); write(os, stem); os.write(separator); write(os, word); if (hasTags) { os.write(separator); write(os, tag); } os.write('\n'); if (validate && (ensureNoSeparator(stem, separator) || ensureNoSeparator(word, separator))) { System.err.println( "ERROR: The stem or word of a dictionary entry contains separator " + " byte " + FSAInfo.byteAsChar(separator) + ", this will prevent proper re-encoding." + " Add '--validate false' to override. Offending entry: " + wd.getStem() + ", " + wd.getWord()); return ExitStatus.ERROR_OTHER; } } } return ExitStatus.SUCCESS; } private void write(OutputStream os, ByteBuffer bb) throws IOException { os.write(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining()); } private boolean ensureNoSeparator(ByteBuffer bb, byte marker) { byte[] buf = bb.array(); for (int o = bb.arrayOffset() + bb.position(), i = bb.remaining(); i > 0; i--) { if (buf[o] == marker) { return true; } } return false; } public static void main(String[] args) { main(args, new DictDecompile()); } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/ExitStatus.java ================================================ package morfologik.tools; public enum ExitStatus { /** The command was successful. */ SUCCESS(0), /** Unknown error cause. 
*/ ERROR_OTHER(1), /** Invalid input arguments or their combination. */ ERROR_INVALID_ARGUMENTS(2), /** A potentially destructive command requires explicit confirmation that was not present. */ ERROR_CONFIRMATION_REQUIRED(3); public final int code; private ExitStatus(int systemExitCode) { this.code = systemExitCode; } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/ExitStatusException.java ================================================ package morfologik.tools; import java.util.Locale; @SuppressWarnings("serial") class ExitStatusException extends RuntimeException { final ExitStatus exitStatus; public ExitStatusException(ExitStatus status, String message, Object... args) { this(status, null, message, args); } public ExitStatusException(ExitStatus status, Throwable t, String message, Object... args) { super(String.format(Locale.ROOT, message, args), t); this.exitStatus = status; } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/FSABuild.java ================================================ package morfologik.tools; import com.beust.jcommander.Parameters; @Parameters( hidden = true, commandNames = "fsa_build", commandDescription = "Builds finite state automaton from \\n-delimited input.") @Deprecated public class FSABuild extends FSACompile {} ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/FSACompile.java ================================================ package morfologik.tools; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import com.beust.jcommander.ParametersDelegate; import java.io.BufferedOutputStream; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.Collections; import java.util.List; import morfologik.fsa.FSA; import morfologik.fsa.builders.FSABuilder; import morfologik.fsa.builders.FSASerializer; /** Build 
finite state automaton out of text input. */ @Parameters( commandNames = {"fsa_compile"}, commandDescription = "Builds finite state automaton from \\n-delimited input.") public class FSACompile extends CliTool { @Parameter( names = {"-i", "--input"}, description = "The input sequences (one sequence per \\n-delimited line).", required = true, validateValueWith = ValidateFileExists.class) private Path input; @Parameter( names = {"-o", "--output"}, description = "The output automaton file.", required = true, validateValueWith = ValidateParentDirExists.class) private Path output; @Parameter( names = {"-f", "--format"}, description = "Automaton serialization format.") private SerializationFormat format = SerializationFormat.FSA5; @ParametersDelegate private final BinaryInput binaryInput; FSACompile() { binaryInput = new BinaryInput(); } public FSACompile( Path input, Path output, SerializationFormat format, boolean acceptBom, boolean acceptCr, boolean ignoreEmpty) { this.input = checkNotNull(input); this.output = checkNotNull(output); this.binaryInput = new BinaryInput(acceptBom, acceptCr, ignoreEmpty); } @Override public ExitStatus call() throws Exception { final List sequences = binaryInput.readBinarySequences(input, (byte) '\n'); Collections.sort(sequences, FSABuilder.LEXICAL_ORDERING); FSA fsa = FSABuilder.build(sequences); FSASerializer serializer = format.getSerializer(); try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(output))) { serializer.serialize(fsa, os); } return ExitStatus.SUCCESS; } public static void main(String[] args) { main(args, new FSACompile()); } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/FSADecompile.java ================================================ package morfologik.tools; import com.beust.jcommander.Parameter; import com.beust.jcommander.Parameters; import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.InputStream; import 
java.io.OutputStream; import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import morfologik.fsa.FSA; /** Dump all byte sequences encoded in a finite state automaton. */ @Parameters( commandNames = "fsa_decompile", commandDescription = "Dumps all sequences encoded in an automaton.") public class FSADecompile extends CliTool { @Parameter( names = {"-i", "--input"}, description = "The input automaton.", required = true, validateValueWith = ValidateFileExists.class) private Path input; @Parameter( names = {"-o", "--output"}, description = "The output file for byte sequences.", required = true, validateValueWith = ValidateParentDirExists.class) private Path output; FSADecompile() {} public FSADecompile(Path input, Path output) { this.input = checkNotNull(input); this.output = checkNotNull(output); } @Override public ExitStatus call() throws Exception { final FSA fsa; try (InputStream is = new BufferedInputStream(Files.newInputStream(input))) { fsa = FSA.read(is); } try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(output))) { for (ByteBuffer bb : fsa) { int o = bb.arrayOffset(); os.write(bb.array(), o + bb.position(), o + bb.remaining()); os.write('\n'); } } return ExitStatus.SUCCESS; } public static void main(String[] args) { main(args, new FSADecompile()); } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/FSADump.java ================================================ package morfologik.tools; import com.beust.jcommander.Parameters; @Parameters( hidden = true, commandNames = "fsa_dump", commandDescription = "Dumps all sequences encoded in an automaton.") @Deprecated public class FSADump extends FSADecompile {} ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/FSAInfo.java ================================================ package morfologik.tools; import com.beust.jcommander.Parameter; import 
com.beust.jcommander.Parameters; import java.io.BufferedInputStream; import java.io.InputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.Locale; import morfologik.fsa.CFSA; import morfologik.fsa.CFSA2; import morfologik.fsa.FSA; import morfologik.fsa.FSA5; /** Print extra information about a compiled automaton file. */ @Parameters( commandNames = "fsa_info", commandDescription = "Print extra information about a compiled automaton file.") public class FSAInfo extends CliTool { @Parameter( names = {"-i", "--input"}, description = "The input automaton.", required = true, validateValueWith = ValidateFileExists.class) private Path input; FSAInfo() {} public FSAInfo(Path input) { this.input = checkNotNull(input); } @Override public ExitStatus call() throws Exception { final FSA fsa; try (InputStream is = new BufferedInputStream(Files.newInputStream(input))) { fsa = FSA.read(is); } printf("%-25s : %s", "FSA implementation", fsa.getClass().getName()); printf("%-25s : %s", "Compiled with flags", fsa.getFlags().toString()); final morfologik.fsa.builders.FSAInfo info = new morfologik.fsa.builders.FSAInfo(fsa); printf("%-25s : %,d", "Number of arcs (merged)", info.arcsCount); printf("%-25s : %,d", "Number of arcs (total)", info.arcsCountTotal); printf("%-25s : %,d", "Number of nodes", info.nodeCount); printf("%-25s : %,d", "Number of final states", info.finalStatesCount); printf(""); if (fsa instanceof FSA5) { FSA5 fsa5 = (FSA5) fsa; printf("%-25s : %d", "Goto length (GTL)", fsa5.gtl); printf("%-25s : %d", "Node extra data", fsa5.nodeDataLength); printf("%-25s : %s", "Annotation separator", byteAsChar(fsa5.annotation)); printf("%-25s : %s", "Filler character", byteAsChar(fsa5.filler)); } if (fsa instanceof CFSA) { CFSA cfsa = (CFSA) fsa; printf("%-25s : %d", "Goto length (GTL)", cfsa.gtl); printf("%-25s : %d", "Node extra data", cfsa.nodeDataLength); } if (fsa instanceof CFSA2) { CFSA2 cfsa2 = (CFSA2) fsa; byte[] labelMapping = 
cfsa2.labelMapping; if (labelMapping != null && labelMapping.length > 0) { printf("%-25s :", "Label mapping"); for (int i = 0; i < labelMapping.length; i++) { printf("%-25s %2d -> %s", "", i, byteAsChar(labelMapping[i])); } } } return ExitStatus.SUCCESS; } /** Convert a byte to an informative string. */ static String byteAsChar(byte v) { int chr = v & 0xff; return String.format( Locale.ROOT, "%s (0x%02x)", (Character.isWhitespace(chr) || chr > 127) ? "[non-printable]" : Character.toString((char) chr), v & 0xFF); } public static void main(String[] args) { main(args, new FSAInfo()); } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/Launcher.java ================================================ package morfologik.tools; /** JAR entry point. */ public final class Launcher { private Launcher() {} @SuppressWarnings("deprecation") public static void main(String[] args) { CliTool.main( args, new FSACompile(), new FSADump(), new FSADecompile(), new FSABuild(), new FSAInfo(), new DictCompile(), new DictDecompile(), new DictApply()); } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/SerializationFormat.java ================================================ package morfologik.tools; import morfologik.fsa.builders.CFSA2Serializer; import morfologik.fsa.builders.FSA5Serializer; import morfologik.fsa.builders.FSASerializer; /** The serialization and encoding format to use for compressing the automaton. 
*/ public enum SerializationFormat { FSA5 { @Override FSASerializer getSerializer() { return new FSA5Serializer(); } }, CFSA2 { @Override CFSA2Serializer getSerializer() { return new CFSA2Serializer(); } }; abstract FSASerializer getSerializer(); } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/ValidateFileExists.java ================================================ package morfologik.tools; import com.beust.jcommander.IValueValidator; import com.beust.jcommander.ParameterException; import java.nio.file.Files; import java.nio.file.Path; import java.util.Locale; public final class ValidateFileExists implements IValueValidator { @Override public void validate(String name, Path value) throws ParameterException { if (!Files.exists(value)) { throw new ParameterException( String.format(Locale.ROOT, "%s does not exist: %s", name, value)); } if (!Files.isRegularFile(value)) { throw new ParameterException(String.format(Locale.ROOT, "%s is not a file: %s", name, value)); } if (!Files.isReadable(value)) { throw new ParameterException( String.format(Locale.ROOT, "%s is not readable: %s", name, value)); } } } ================================================ FILE: morfologik-tools/src/main/java/morfologik/tools/ValidateParentDirExists.java ================================================ package morfologik.tools; import com.beust.jcommander.IValueValidator; import com.beust.jcommander.ParameterException; import java.nio.file.Files; import java.nio.file.Path; import java.util.Locale; public final class ValidateParentDirExists implements IValueValidator { @Override public void validate(String name, Path value) throws ParameterException { value = value.toAbsolutePath().normalize().getParent(); if (!Files.exists(value)) { throw new ParameterException( String.format(Locale.ROOT, "Directory does not exist: %s", value)); } if (!Files.isDirectory(value)) { throw new ParameterException( String.format(Locale.ROOT, "Path is not a 
directory: %s", value)); } if (!Files.isWritable(value)) { throw new ParameterException(String.format(Locale.ROOT, "Path is not writable: %s", value)); } } } ================================================ FILE: morfologik-tools/src/main/package/README.txt ================================================ ${project.artifactId}, ${project.version} Tools for morphological dictionary and finite state automata construction. https://github.com/morfologik Try the examples (each one comes with a simple description of what it does). ================================================ FILE: morfologik-tools/src/main/package/examples/01-fsa-build.input ================================================ black sabbath metallica judas priest ================================================ FILE: morfologik-tools/src/main/package/examples/01-fsa-build.txt ================================================ # This example constructs a finite state automaton (FSA) out # of byte sequences in the input file: # # https://en.wikipedia.org/wiki/Finite-state_machine # # Each sequence is encoded as one path in the automaton. Inputs are LF-separated # byte sequences. # # This example constructs an automaton serialized with FSA5 (Jan Daciuk's fsa_build compatible format). java -jar ../lib/${project.artifactId}-${project.version}.jar fsa_build --input 01-fsa-build.input --output 01-fsa-build.fsa5 --format fsa5 # This example uses CFSA2, a custom format that is packed slightly better, but slower at runtime. java -jar ../lib/${project.artifactId}-${project.version}.jar fsa_build --input 01-fsa-build.input --output 01-fsa-build.cfsa2 --format cfsa2 ================================================ FILE: morfologik-tools/src/main/package/examples/02-fsa-dump.txt ================================================ # This example dumps byte sequences from a finite # state automaton (created in a previous example), # separating each sequence with an LF byte.
java -jar ../lib/${project.artifactId}-${project.version}.jar fsa_dump --input 01-fsa-build.fsa5 --output 02-fsa-dump.output ================================================ FILE: morfologik-tools/src/main/package/examples/03-fsa-info.txt ================================================ # This example prints diagnostic information about # a compiled automaton. echo "FSA5:" java -jar ../lib/${project.artifactId}-${project.version}.jar fsa_info --input 01-fsa-build.fsa5 echo "CFSA2:" java -jar ../lib/${project.artifactId}-${project.version}.jar fsa_info --input 01-fsa-build.cfsa2 ================================================ FILE: morfologik-tools/src/main/package/examples/04-dict-compile.info ================================================ # # Dictionary metadata. A Java property file, read as UTF-8. # # # REQUIRED PROPERTIES # # Column (lemma, inflected, tag) separator. This must be a single byte in the target encoding. fsa.dict.separator=; # The charset in which the input is encoded. UTF-8 is strongly recommended. fsa.dict.encoding=UTF-8 # The type of lemma-inflected form encoding compression that precedes automaton # construction. Allowed values: [suffix, infix, prefix, none]. # Details are in Daciuk's paper and in the code. # Leave at 'prefix' if not sure. fsa.dict.encoder=prefix # # OPTIONAL PROPERTIES # # Author of the dictionary. fsa.dict.author=Acme Inc. # Date the dictionary data was assembled (not compilation time!). fsa.dict.created=2013/10/24 18:18:00 # The license for the dictionary data. 
fsa.dict.license=(license here) ================================================ FILE: morfologik-tools/src/main/package/examples/04-dict-compile.input ================================================ jawa;jawy;subst:pl:acc:f jawa;jawy;subst:pl:nom:f jawa;jawy;subst:pl:voc:f jawa;jawy;subst:sg:gen:f jawór;jawór;subst:sg:acc:m3+subst:sg:nom:m3 jaw;jawów;subst:pl:gen:m3 jawa;jawą;subst:sg:inst:f jawa;jawę;subst:sg:acc:f ================================================ FILE: morfologik-tools/src/main/package/examples/04-dict-compile.txt ================================================ # # This example compiles a dictionary for use with DictionaryLookup # (dictionary-driven stemming and morphological tag lookup). # # The input file must contain, in each \n-delimited line a sequence of: # # lemma;inflected;tag # # The separator character (byte) is configurable. # The tag is optional. # # Note that, in addition to the input file, the compiler will # also require an associated dictionary "metadata" file, which tells # it how to compress and interpret the input. # # Open and inspect the content of this example's input files: # 04-dict-compile.input # 04-dict-compile.info # java -jar ../lib/${project.artifactId}-${project.version}.jar dict_compile --input 04-dict-compile.input # The compiled dictionary should be written to 04-dict-compile.dict. ================================================ FILE: morfologik-tools/src/main/package/examples/05-dict-decompile.txt ================================================ # # This example decompiles an existing dictionary into # its source form (columns). # # The input file must point at the *.dict file (automaton) and # it must have an associated metadata (*.info) file. 
# java -jar ../lib/${project.artifactId}-${project.version}.jar dict_decompile --input 04-dict-compile.dict --output 05-dict-decompile.input ================================================ FILE: morfologik-tools/src/test/java/morfologik/tools/DictCompileBug.java ================================================ package morfologik.tools; import com.carrotsearch.randomizedtesting.jupiter.Randomized; import com.carrotsearch.randomizedtesting.jupiter.RandomizedTest; import com.carrotsearch.randomizedtesting.jupiter.generators.RandomNumbers; import java.io.Writer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.LinkedHashSet; import java.util.Random; import java.util.Set; import morfologik.stemming.Dictionary; import morfologik.stemming.DictionaryLookup; import morfologik.stemming.DictionaryMetadata; import morfologik.stemming.EncoderType; import morfologik.stemming.WordData; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @Randomized public class DictCompileBug extends RandomizedTest { @Test public void testSeparatorInEncoded(@TempDir Path tempDir, Random rnd) throws Exception { final Path input = tempDir.resolve("dictionary.input"); final Path metadata = DictionaryMetadata.getExpectedMetadataLocation(input); char separator = '_'; try (Writer writer = Files.newBufferedWriter(metadata, StandardCharsets.UTF_8)) { DictionaryMetadata.builder() .separator(separator) .encoder(EncoderType.SUFFIX) .encoding(StandardCharsets.UTF_8) .build() .write(writer); } Set sequences = new LinkedHashSet<>(); for (int seqs = RandomNumbers.randomIntInRange(rnd, 0, 100); --seqs >= 0; ) { sequences.add("anfragen_anfragen|VER:1:PLU:KJ1:SFT:NEB"); sequences.add("Anfragen_anfragen|VER:1:PLU:KJ1:SFT:NEB"); } try (Writer writer = Files.newBufferedWriter(input, StandardCharsets.UTF_8)) { for (String in : sequences) { writer.write(in); writer.write('\n'); } } 
Assertions.assertThat(new DictCompile(input, false, true, false, false, false).call()) .isEqualTo(ExitStatus.SUCCESS); Path dict = input.resolveSibling("dictionary.dict"); Assertions.assertThat(dict).isRegularFile(); // Verify the dictionary is valid. DictionaryLookup dictionaryLookup = new DictionaryLookup(Dictionary.read(dict)); for (WordData wd : dictionaryLookup) { System.out.println(wd); } } } ================================================ FILE: morfologik-tools/src/test/java/morfologik/tools/DictCompileTest.java ================================================ package morfologik.tools; import com.carrotsearch.randomizedtesting.jupiter.Randomized; import com.carrotsearch.randomizedtesting.jupiter.RandomizedTest; import com.carrotsearch.randomizedtesting.jupiter.generators.RandomNumbers; import com.carrotsearch.randomizedtesting.jupiter.generators.RandomPicks; import com.carrotsearch.randomizedtesting.jupiter.generators.RandomStrings; import java.io.Writer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.LinkedHashSet; import java.util.List; import java.util.Random; import java.util.Set; import morfologik.stemming.Dictionary; import morfologik.stemming.DictionaryLookup; import morfologik.stemming.DictionaryMetadata; import morfologik.stemming.EncoderType; import morfologik.stemming.WordData; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.RepeatedTest; import org.junit.jupiter.api.io.TempDir; @Randomized public class DictCompileTest extends RandomizedTest { @RepeatedTest(200) public void testRoundTrip(@TempDir Path tempDir, Random rnd) throws Exception { final Path input = tempDir.resolve("dictionary.input"); final Path metadata = DictionaryMetadata.getExpectedMetadataLocation(input); char separator = RandomPicks.randomFrom( rnd, new Character[] { '|', ',', '\t', }); try (Writer writer = Files.newBufferedWriter(metadata, StandardCharsets.UTF_8)) { 
DictionaryMetadata.builder() .separator(separator) .encoder(RandomPicks.randomFrom(rnd, EncoderType.values())) .encoding(StandardCharsets.UTF_8) .build() .write(writer); } final boolean useTags = rnd.nextBoolean(); Set sequences = new LinkedHashSet<>(); for (int seqs = RandomNumbers.randomIntInRange(rnd, 0, 100); --seqs >= 0; ) { String base; switch (RandomNumbers.randomIntInRange(rnd, 0, 5)) { case 0: base = RandomStrings.randomAsciiLettersOfLength(rnd, ('A' - separator) & 0xff); break; default: base = RandomStrings.randomAsciiLettersOfLengthBetween(rnd, 1, 100); break; } String inflected; switch (RandomNumbers.randomIntInRange(rnd, 0, 5)) { case 0: inflected = base; break; case 1: inflected = RandomStrings.randomAsciiLettersOfLengthBetween(rnd, 0, 5) + base; break; case 3: inflected = base + RandomStrings.randomAsciiLettersOfLengthBetween(rnd, 0, 5); break; case 4: inflected = RandomStrings.randomAsciiLettersOfLengthBetween(rnd, 0, 5) + base + RandomStrings.randomAsciiLettersOfLengthBetween(rnd, 0, 5); break; default: inflected = RandomStrings.randomAsciiLettersOfLengthBetween(rnd, 0, 200); break; } sequences.add( base + separator + inflected + (useTags ? (separator + RandomStrings.randomAsciiLettersOfLengthBetween(rnd, 0, 10)) : "")); } final boolean ignoreEmpty = rnd.nextBoolean(); try (Writer writer = Files.newBufferedWriter(input, StandardCharsets.UTF_8)) { for (String in : sequences) { writer.write(in); writer.write('\n'); if (ignoreEmpty && rnd.nextBoolean()) { writer.write('\n'); } } } boolean validate = rnd.nextBoolean(); Assertions.assertThat(new DictCompile(input, false, validate, false, false, ignoreEmpty).call()) .isEqualTo(ExitStatus.SUCCESS); Path dict = input.resolveSibling("dictionary.dict"); Assertions.assertThat(dict).isRegularFile(); // Verify the dictionary is valid. 
DictionaryLookup dictionaryLookup = new DictionaryLookup(Dictionary.read(dict)); Set reconstructed = new LinkedHashSet<>(); for (WordData wd : dictionaryLookup) { reconstructed.add( "" + wd.getStem() + separator + wd.getWord() + (useTags ? separator : "") + (wd.getTag() == null ? "" : wd.getTag())); } Assertions.assertThat(reconstructed).containsOnlyElementsOf(sequences); // Verify decompilation via DictDecompile. // GH-79: if there's only one sequence and there is no tag the decompiler will // drop it. if (useTags && sequences.size() == 1) { String onlyOne = sequences.iterator().next(); if (onlyOne.endsWith(Character.toString(separator))) { sequences.clear(); sequences.add(onlyOne.substring(0, onlyOne.length() - 1)); } } Files.delete(input); Assertions.assertThat(new DictDecompile(dict, null, true, validate).call()) .isEqualTo(ExitStatus.SUCCESS); List allLines = Files.readAllLines(input, StandardCharsets.UTF_8); Assertions.assertThat(allLines).containsOnlyElementsOf(sequences); } } ================================================ FILE: morfologik-tools/src/test/java/morfologik/tools/FSACompileTest.java ================================================ package morfologik.tools; import com.carrotsearch.randomizedtesting.jupiter.Randomized; import com.carrotsearch.randomizedtesting.jupiter.RandomizedTest; import com.carrotsearch.randomizedtesting.jupiter.generators.RandomNumbers; import com.carrotsearch.randomizedtesting.jupiter.generators.RandomPicks; import com.carrotsearch.randomizedtesting.jupiter.generators.RandomStrings; import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.io.OutputStream; import java.io.PrintStream; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.Random; import java.util.Set; import java.util.concurrent.Callable; import morfologik.fsa.FSA; 
import morfologik.stemming.BufferUtils; import org.assertj.core.api.Assertions; import org.junit.jupiter.api.RepeatedTest; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; @Randomized public class FSACompileTest extends RandomizedTest { @RepeatedTest(100) public void testCliInvocation(@TempDir Path tempDir, Random rnd) throws Exception { final Path input = Files.createTempFile(tempDir, "input", "in"); final Path output = Files.createTempFile(tempDir, "input", "out"); Set sequences = new LinkedHashSet<>(); for (int seqs = RandomNumbers.randomIntInRange(rnd, 0, 100); --seqs >= 0; ) { sequences.add(RandomStrings.randomAsciiLettersOfLengthBetween(rnd, 1, 10)); } try (OutputStream os = Files.newOutputStream(input)) { Iterator i = sequences.iterator(); while (i.hasNext()) { os.write(i.next().getBytes(StandardCharsets.UTF_8)); // Sometimes don't add trailing '\n'. if (!i.hasNext() && rnd.nextBoolean()) { break; } else { os.write('\n'); if (rnd.nextBoolean()) { os.write('\n'); } } } } SerializationFormat format = RandomPicks.randomFrom(rnd, SerializationFormat.values()); Assertions.assertThat(new FSACompile(input, output, format, false, false, true).call()) .isEqualTo(ExitStatus.SUCCESS); try (InputStream is = Files.newInputStream(output)) { FSA fsa = FSA.read(is); Assertions.assertThat(fsa).isNotNull(); Set result = new HashSet<>(); for (ByteBuffer bb : fsa) { result.add(BufferUtils.toString(bb, StandardCharsets.UTF_8)); } Assertions.assertThat(result).containsOnlyElementsOf(sequences); } } @Test public void testEmptyWarning(@TempDir Path tempDir, Random rnd) throws Exception { final Path input = Files.createTempFile(tempDir, "input", "in"); final Path output = Files.createTempFile(tempDir, "input", "out"); Files.write(input, "abc\n\ndef".getBytes(StandardCharsets.US_ASCII)); String out = sysouts( new Callable() { @Override public Void call() throws Exception { FSACompile.main( new String[] { "--exit", "false", "--input", 
input.toAbsolutePath().toString(), "--output", output.toAbsolutePath().toString() }); return null; } }); Assertions.assertThat(out).contains("--ignore-empty"); } @Test public void testCrWarning(@TempDir Path tempDir, Random rnd) throws Exception { final Path input = Files.createTempFile(tempDir, "input", "in"); final Path output = Files.createTempFile(tempDir, "input", "out"); Files.write(input, "abc\r\ndef\r\n".getBytes(StandardCharsets.US_ASCII)); String out = sysouts( new Callable() { @Override public Void call() throws Exception { FSACompile.main( new String[] { "--exit", "false", "--input", input.toAbsolutePath().toString(), "--output", output.toAbsolutePath().toString() }); return null; } }); Assertions.assertThat(out).contains("CR"); } @Test public void testBomWarning(@TempDir Path tempDir) throws Exception { final Path input = Files.createTempFile(tempDir, "input", "in"); final Path output = Files.createTempFile(tempDir, "input", "out"); // Emit UTF-8 BOM prefixed list of three strings. 
ByteArrayOutputStream baos = new ByteArrayOutputStream(); baos.write(new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}); baos.write("abc\ndef\nxyz".getBytes(StandardCharsets.UTF_8)); Files.write(input, baos.toByteArray()); String out = sysouts( new Callable() { @Override public Void call() throws Exception { FSACompile.main( new String[] { "--exit", "false", "--input", input.toAbsolutePath().toString(), "--output", output.toAbsolutePath().toString() }); return null; } }); Assertions.assertThat(out).contains("UTF-8 BOM"); } private String sysouts(Callable callable) throws Exception { PrintStream sout = System.out; PrintStream serr = System.err; ByteArrayOutputStream baos = new ByteArrayOutputStream(); PrintStream ps = new PrintStream(baos, true, "UTF-8"); System.setOut(ps); System.setErr(ps); try { callable.call(); return new String(baos.toByteArray(), StandardCharsets.UTF_8); } finally { System.setOut(sout); System.setErr(serr); } } } ================================================ FILE: pom.xml ================================================ 4.0.0 org.carrot2 morfologik-parent 2.2.0-SNAPSHOT pom Morfologik (parent POM) Morfologik is a collection of tools for building finite state automata and stemming/ inflection dictionaries built on top of these. 
http://morfologik.blogspot.com/ BSD http://www.opensource.org/licenses/bsd-license.php repo Announcements, bug reports, developers mailing list morfologik-devel@lists.sourceforge.net git@github.com:morfologik/morfologik-stemming.git scm:git:git@github.com:morfologik/morfologik-stemming.git scm:git:git@github.com:morfologik/morfologik-stemming.git dawid.weiss Dawid Weiss dawid.weiss@carrotsearch.com marcin.milkowski Marcin Miłkowski 11 UTF-8 3.9.12 3.27.7 0.7.2 6.0.3 0.2.0 3.15.0 3.6.2 3.5.0 3.5.0 3.10 src/forbidden-apis/signatures.txt morfologik-fsa morfologik-fsa-builders morfologik-stemming morfologik-polish morfologik-speller morfologik-tools com.carrotsearch hppc ${version.hppc} com.carrotsearch.randomizedtesting randomizedtesting-jupiter ${version.randomizedtesting} test org.junit.jupiter junit-jupiter ${version.junit} test org.assertj assertj-core ${version.assertj} test org.apache.maven.plugins maven-clean-plugin ${version.maven-clean-plugin} false true ${project.build.directory} eclipse/** idea/** org.apache.maven.plugins maven-deploy-plugin 3.1.4 org.apache.maven.plugins maven-source-plugin 3.4.0 org.apache.maven.plugins maven-javadoc-plugin 3.12.0 src/main/java all,-missing org.apache.maven.plugins maven-antrun-plugin 3.2.0 org.apache.maven.plugins maven-assembly-plugin 3.8.0 org.apache.maven.plugins maven-jar-plugin ${version.maven-jar-plugin} false ${project.groupId} ${project.artifactId} ${project.version} ${project.name} ${project.moduleId} org.apache.maven.plugins maven-install-plugin 3.1.4 org.apache.maven.plugins maven-resources-plugin 3.5.0 org.apache.maven.plugins maven-dependency-plugin 3.10.0 org.apache.maven.plugins maven-compiler-plugin ${version.maven-compiler-plugin} org.apache.maven.plugins maven-gpg-plugin 3.2.8 org.apache.felix maven-bundle-plugin 6.0.2 true org.apache.maven.plugins maven-surefire-plugin 3.5.5 org.apache.maven.plugins maven-enforcer-plugin ${version.maven-enforcer-plugin} com.diffplug.spotless spotless-maven-plugin 3.4.0 
1.35.0 true true UNIX check org.apache.maven.plugins maven-jar-plugin com.diffplug.spotless spotless-maven-plugin org.apache.maven.plugins maven-enforcer-plugin enforce-java-version enforce validate [21,) JDK 21 or newer is required to build this project. enforce-dependency-convergence enforce verify org.apache.maven.plugins maven-enforcer-plugin enforce-environment enforce true [${version.maven},) At least Maven ${version.maven}+ required. de.thetaphi forbiddenapis ${version.forbiddenapis} forbidden-apis ${maven.compiler.release} false jdk-unsafe jdk-deprecated jdk-system-out ${forbiddenapis.signaturefile} process-classes check profile.ide.eclipse-m2e m2e.version target/eclipse org.eclipse.m2e lifecycle-mapping 1.0.0 de.thetaphi forbiddenapis [1.0.0,) testCheck check eclipse compile antrun:run maven-antrun-plugin 3.2.0 default-cli none false run org.apache.ant ant 1.10.15 org.eclipse.m2e lifecycle-mapping 1.0.0 de.thetaphi forbiddenapis [0.0.0,) check testCheck com.carrotsearch hppc-template-processor [0.0.0,) template-processor add-source add-test-source false true org.apache.maven.plugins maven-plugin-plugin [3.4,) descriptor helpmojo org.apache.maven.plugins [0.0,) enforce sonatype-oss-release org.sonatype.central central-publishing-maven-plugin 0.10.0 true central morfologik-stemming-${project.version} true published org.apache.maven.plugins maven-gpg-plugin **/*.gz **/*.zip sign org.apache.maven.plugins maven-javadoc-plugin ${project.build.sourceEncoding} ${project.name} v${project.version} API Documentation ${project.name} v${project.version} API Documentation UTF-8 false attach-javadocs jar org.apache.maven.plugins maven-source-plugin true attach-sources jar-no-fork