[
  {
    "path": ".gitignore",
    "content": "*.py[cod]\n\n# C extensions\n*.so\n\n# Packages\n*.egg\n*.egg-info\ndist\nbuild\neggs\nparts\nbin\nvar\nsdist\ndevelop-eggs\n.installed.cfg\nlib\nlib64\nMANIFEST\nenv*\n\n# Installer logs\npip-log.txt\n\n# Unit test / coverage reports\n.coverage\n.tox\nnosetests.xml\n\n# Translations\n*.mo\n\n#Idea IDE\n.idea\n\n# Mr Developer\n.mr.developer.cfg\n.project\n.pydevproject\n"
  },
  {
    "path": "LICENSE",
    "content": "Copyright (c) 2012-2019, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions\nare met:\n\n1. Redistributions of source code must retain the above copyright\n   notice, this list of conditions and the following disclaimer.\n2. Redistributions in binary form must reproduce the above copyright\n   notice, this list of conditions and the following disclaimer in the\n   documentation and/or other materials provided with the distribution.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS\nFOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE\nCOPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,\nINCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\nBUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\nLOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT\nLIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN\nANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\nPOSSIBILITY OF SUCH DAMAGE."
  },
  {
    "path": "MANIFEST.in",
    "content": "include LICENSE\ninclude ez_setup.py\ninclude docs/build_requirements.txt"
  },
  {
    "path": "README",
    "content": "Morfessor 2.0 - Quick start\n===========================\n\n\nInstallation\n------------\n\nMorfessor 2.0 is installed using setuptools library for Python. To\nbuild and install the module and scripts to default paths, type\n\npython setup.py install\n\nFor details, see http://docs.python.org/install/\n\n\nDocumentation\n-------------\n\nUser instructions for Morfessor 2.0 are available in the docs directory\nas Sphinx source files (see http://sphinx-doc.org/). Instructions how\nto build the documentation can be found in docs/README.\n\nThe documentation is also available on-line at http://morfessor.readthedocs.org/\n\nDetails of the implemented algorithms and methods and a set of\nexperiments are described in the following technical report:\n\nSami Virpioja, Peter Smit, Stig-Arne Grönroos, and Mikko\nKurimo. Morfessor 2.0: Python Implementation and Extensions for\nMorfessor Baseline. Aalto University publication series SCIENCE +\nTECHNOLOGY, 25/2013. Aalto University, Helsinki, 2013. ISBN\n978-952-60-5501-5.\n\nThe report is available online at \n\nhttp://urn.fi/URN:ISBN:978-952-60-5501-5\n\n\nContact\n-------\n\nQuestions or feedback? Email: morpho@aalto.fi\n"
  },
  {
    "path": "docs/Makefile",
    "content": "# Makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line.\nSPHINXOPTS    =\nSPHINXBUILD   = sphinx-build\nPAPER         =\nBUILDDIR      = build\n\n# User-friendly check for sphinx-build\nifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)\n$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)\nendif\n\n# Internal variables.\nPAPEROPT_a4     = -D latex_paper_size=a4\nPAPEROPT_letter = -D latex_paper_size=letter\nALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source\n# the i18n builder cannot share the environment and doctrees with the others\nI18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source\n\n.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext\n\nhelp:\n\t@echo \"Please use \\`make <target>' where <target> is one of\"\n\t@echo \"  html       to make standalone HTML files\"\n\t@echo \"  dirhtml    to make HTML files named index.html in directories\"\n\t@echo \"  singlehtml to make a single large HTML file\"\n\t@echo \"  pickle     to make pickle files\"\n\t@echo \"  json       to make JSON files\"\n\t@echo \"  htmlhelp   to make HTML files and a HTML help project\"\n\t@echo \"  qthelp     to make HTML files and a qthelp project\"\n\t@echo \"  devhelp    to make HTML files and a Devhelp project\"\n\t@echo \"  epub       to make an epub\"\n\t@echo \"  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter\"\n\t@echo \"  latexpdf   to make LaTeX files and run them through pdflatex\"\n\t@echo \"  latexpdfja to make LaTeX files and run them through platex/dvipdfmx\"\n\t@echo \"  text       to make text files\"\n\t@echo \"  man        to make manual pages\"\n\t@echo \"  texinfo    to make Texinfo files\"\n\t@echo \"  info       to make Texinfo files and run them through makeinfo\"\n\t@echo \"  gettext    to make PO message catalogs\"\n\t@echo \"  changes    to make an overview of all changed/added/deprecated items\"\n\t@echo \"  xml        to make Docutils-native XML files\"\n\t@echo \"  pseudoxml  to make pseudoxml-XML files for display purposes\"\n\t@echo \"  linkcheck  to check all external links for integrity\"\n\t@echo \"  doctest    to run all doctests embedded in the documentation (if enabled)\"\n\nclean:\n\trm -rf $(BUILDDIR)/*\n\nhtml:\n\t$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html\n\t@echo\n\t@echo \"Build finished. The HTML pages are in $(BUILDDIR)/html.\"\n\ndirhtml:\n\t$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml\n\t@echo\n\t@echo \"Build finished. The HTML pages are in $(BUILDDIR)/dirhtml.\"\n\nsinglehtml:\n\t$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml\n\t@echo\n\t@echo \"Build finished. 
The HTML page is in $(BUILDDIR)/singlehtml.\"\n\npickle:\n\t$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle\n\t@echo\n\t@echo \"Build finished; now you can process the pickle files.\"\n\njson:\n\t$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json\n\t@echo\n\t@echo \"Build finished; now you can process the JSON files.\"\n\nhtmlhelp:\n\t$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp\n\t@echo\n\t@echo \"Build finished; now you can run HTML Help Workshop with the\" \\\n\t      \".hhp project file in $(BUILDDIR)/htmlhelp.\"\n\nqthelp:\n\t$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp\n\t@echo\n\t@echo \"Build finished; now you can run \"qcollectiongenerator\" with the\" \\\n\t      \".qhcp project file in $(BUILDDIR)/qthelp, like this:\"\n\t@echo \"# qcollectiongenerator $(BUILDDIR)/qthelp/Morfessor.qhcp\"\n\t@echo \"To view the help file:\"\n\t@echo \"# assistant -collectionFile $(BUILDDIR)/qthelp/Morfessor.qhc\"\n\ndevhelp:\n\t$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp\n\t@echo\n\t@echo \"Build finished.\"\n\t@echo \"To view the help file:\"\n\t@echo \"# mkdir -p $$HOME/.local/share/devhelp/Morfessor\"\n\t@echo \"# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/Morfessor\"\n\t@echo \"# devhelp\"\n\nepub:\n\t$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub\n\t@echo\n\t@echo \"Build finished. The epub file is in $(BUILDDIR)/epub.\"\n\nlatex:\n\t$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex\n\t@echo\n\t@echo \"Build finished; the LaTeX files are in $(BUILDDIR)/latex.\"\n\t@echo \"Run \\`make' in that directory to run these through (pdf)latex\" \\\n\t      \"(use \\`make latexpdf' here to do that automatically).\"\n\nlatexpdf:\n\t$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex\n\t@echo \"Running LaTeX files through pdflatex...\"\n\t$(MAKE) -C $(BUILDDIR)/latex all-pdf\n\t@echo \"pdflatex finished; the PDF files are in $(BUILDDIR)/latex.\"\n\nlatexpdfja:\n\t$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex\n\t@echo \"Running LaTeX files through platex and dvipdfmx...\"\n\t$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja\n\t@echo \"pdflatex finished; the PDF files are in $(BUILDDIR)/latex.\"\n\ntext:\n\t$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text\n\t@echo\n\t@echo \"Build finished. The text files are in $(BUILDDIR)/text.\"\n\nman:\n\t$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man\n\t@echo\n\t@echo \"Build finished. The manual pages are in $(BUILDDIR)/man.\"\n\ntexinfo:\n\t$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo\n\t@echo\n\t@echo \"Build finished. The Texinfo files are in $(BUILDDIR)/texinfo.\"\n\t@echo \"Run \\`make' in that directory to run these through makeinfo\" \\\n\t      \"(use \\`make info' here to do that automatically).\"\n\ninfo:\n\t$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo\n\t@echo \"Running Texinfo files through makeinfo...\"\n\tmake -C $(BUILDDIR)/texinfo info\n\t@echo \"makeinfo finished; the Info files are in $(BUILDDIR)/texinfo.\"\n\ngettext:\n\t$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale\n\t@echo\n\t@echo \"Build finished. 
The message catalogs are in $(BUILDDIR)/locale.\"\n\nchanges:\n\t$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes\n\t@echo\n\t@echo \"The overview file is in $(BUILDDIR)/changes.\"\n\nlinkcheck:\n\t$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck\n\t@echo\n\t@echo \"Link check complete; look for any errors in the above output \" \\\n\t      \"or in $(BUILDDIR)/linkcheck/output.txt.\"\n\ndoctest:\n\t$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest\n\t@echo \"Testing of doctests in the sources finished, look at the \" \\\n\t      \"results in $(BUILDDIR)/doctest/output.txt.\"\n\nxml:\n\t$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml\n\t@echo\n\t@echo \"Build finished. The XML files are in $(BUILDDIR)/xml.\"\n\npseudoxml:\n\t$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml\n\t@echo\n\t@echo \"Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml.\"\n"
  },
  {
    "path": "docs/README",
    "content": "Generating Documentation\n------------------------\n\nThe user instructions for Morfessor 2.0 are available as Sphinx source\nfiles (see http://sphinx-doc.org/). To build the documentation you need\nboth the 'sphinx' and the 'sphinxcontrib-napoleon' package. With a recent\nversion of pip you could do::\n\n    pip install -e .[docs]\n\nto automatically install the required dependencies for making the docs.\n\nAfter installing Sphinx, you can generate the documentation in different\nformats using the Makefile or make.bat in the directory \"docs\". For\nexample, to generate a PDF file, type \"make latexpdf\", and to generate\na single HTML file, type \"make singlehtml\". Type \"make help\" to see\nall available formats.\n\nThe documentation can also be read online on http://morfessor.readthedocs.org/"
  },
  {
    "path": "docs/build_requirements.txt",
    "content": "sphinx\nsphinxcontrib-napoleon\n"
  },
  {
    "path": "docs/make.bat",
    "content": "@ECHO OFF\r\n\r\nREM Command file for Sphinx documentation\r\n\r\nif \"%SPHINXBUILD%\" == \"\" (\r\n\tset SPHINXBUILD=sphinx-build\r\n)\r\nset BUILDDIR=build\r\nset ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source\r\nset I18NSPHINXOPTS=%SPHINXOPTS% source\r\nif NOT \"%PAPER%\" == \"\" (\r\n\tset ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%\r\n\tset I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%\r\n)\r\n\r\nif \"%1\" == \"\" goto help\r\n\r\nif \"%1\" == \"help\" (\r\n\t:help\r\n\techo.Please use `make ^<target^>` where ^<target^> is one of\r\n\techo.  html       to make standalone HTML files\r\n\techo.  dirhtml    to make HTML files named index.html in directories\r\n\techo.  singlehtml to make a single large HTML file\r\n\techo.  pickle     to make pickle files\r\n\techo.  json       to make JSON files\r\n\techo.  htmlhelp   to make HTML files and a HTML help project\r\n\techo.  qthelp     to make HTML files and a qthelp project\r\n\techo.  devhelp    to make HTML files and a Devhelp project\r\n\techo.  epub       to make an epub\r\n\techo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter\r\n\techo.  text       to make text files\r\n\techo.  man        to make manual pages\r\n\techo.  texinfo    to make Texinfo files\r\n\techo.  gettext    to make PO message catalogs\r\n\techo.  changes    to make an overview over all changed/added/deprecated items\r\n\techo.  xml        to make Docutils-native XML files\r\n\techo.  pseudoxml  to make pseudoxml-XML files for display purposes\r\n\techo.  linkcheck  to check all external links for integrity\r\n\techo.  doctest    to run all doctests embedded in the documentation if enabled\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"clean\" (\r\n\tfor /d %%i in (%BUILDDIR%\\*) do rmdir /q /s %%i\r\n\tdel /q /s %BUILDDIR%\\*\r\n\tgoto end\r\n)\r\n\r\n\r\n%SPHINXBUILD% 2> nul\r\nif errorlevel 9009 (\r\n\techo.\r\n\techo.The 'sphinx-build' command was not found. Make sure you have Sphinx\r\n\techo.installed, then set the SPHINXBUILD environment variable to point\r\n\techo.to the full path of the 'sphinx-build' executable. Alternatively you\r\n\techo.may add the Sphinx directory to PATH.\r\n\techo.\r\n\techo.If you don't have Sphinx installed, grab it from\r\n\techo.http://sphinx-doc.org/\r\n\texit /b 1\r\n)\r\n\r\nif \"%1\" == \"html\" (\r\n\t%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished. The HTML pages are in %BUILDDIR%/html.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"dirhtml\" (\r\n\t%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"singlehtml\" (\r\n\t%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished. 
The HTML page is in %BUILDDIR%/singlehtml.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"pickle\" (\r\n\t%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished; now you can process the pickle files.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"json\" (\r\n\t%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished; now you can process the JSON files.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"htmlhelp\" (\r\n\t%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished; now you can run HTML Help Workshop with the ^\r\n.hhp project file in %BUILDDIR%/htmlhelp.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"qthelp\" (\r\n\t%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished; now you can run \"qcollectiongenerator\" with the ^\r\n.qhcp project file in %BUILDDIR%/qthelp, like this:\r\n\techo.^> qcollectiongenerator %BUILDDIR%\\qthelp\\Morfessor.qhcp\r\n\techo.To view the help file:\r\n\techo.^> assistant -collectionFile %BUILDDIR%\\qthelp\\Morfessor.qhc\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"devhelp\" (\r\n\t%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"epub\" (\r\n\t%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished. The epub file is in %BUILDDIR%/epub.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"latex\" (\r\n\t%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished; the LaTeX files are in %BUILDDIR%/latex.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"latexpdf\" (\r\n\t%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex\r\n\tcd %BUILDDIR%/latex\r\n\tmake all-pdf\r\n\tcd %BUILDDIR%/..\r\n\techo.\r\n\techo.Build finished; the PDF files are in %BUILDDIR%/latex.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"latexpdfja\" (\r\n\t%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex\r\n\tcd %BUILDDIR%/latex\r\n\tmake all-pdf-ja\r\n\tcd %BUILDDIR%/..\r\n\techo.\r\n\techo.Build finished; the PDF files are in %BUILDDIR%/latex.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"text\" (\r\n\t%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished. The text files are in %BUILDDIR%/text.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"man\" (\r\n\t%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished. The manual pages are in %BUILDDIR%/man.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"texinfo\" (\r\n\t%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"gettext\" (\r\n\t%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished. 
The message catalogs are in %BUILDDIR%/locale.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"changes\" (\r\n\t%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.The overview file is in %BUILDDIR%/changes.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"linkcheck\" (\r\n\t%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Link check complete; look for any errors in the above output ^\r\nor in %BUILDDIR%/linkcheck/output.txt.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"doctest\" (\r\n\t%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Testing of doctests in the sources finished, look at the ^\r\nresults in %BUILDDIR%/doctest/output.txt.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"xml\" (\r\n\t%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished. The XML files are in %BUILDDIR%/xml.\r\n\tgoto end\r\n)\r\n\r\nif \"%1\" == \"pseudoxml\" (\r\n\t%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml\r\n\tif errorlevel 1 exit /b 1\r\n\techo.\r\n\techo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.\r\n\tgoto end\r\n)\r\n\r\n:end\r\n"
  },
  {
    "path": "docs/source/cmdtools.rst",
    "content": "Command line tools\n==================\n\nThe installation process installs 4 scripts in the appropriate PATH.\n\nmorfessor\n---------\nThe morfessor command is a full-featured script for training, updating models\nand segmenting test data.\n\nLoading existing model\n~~~~~~~~~~~~~~~~~~~~~~\n\n``-l <file>``\n    load :ref:`binary-model-def`\n``-L <file>``\n    load :ref:`morfessor1-model-def`\n\n\nLoading data\n~~~~~~~~~~~~\n\n``-t <file>, --traindata <file>``\n    Input corpus file(s) for training (text or bz2/gzipped text; use '-'\n    for standard input; add several times in order to append multiple files).\n    Standard, all sentences are split on whitespace and the tokens are used as\n    compounds. The ``--traindata-list`` option can be used to read all input\n    files as a list of compounds, one compound per line optionally prefixed by\n    a count. See :ref:`data-format-options` for changing the delimiters used for\n    separating compounds and atoms.\n``--traindata-list``\n    Interpret all training files as list files instead of corpus files. A list\n    file contains one compound per line with optionally a count as prefix.\n``-T <file>, --testdata <file>``\n    Input corpus file(s) to analyze (text or bz2/gzipped text; use '-' for\n    standard input; add several times in order to append multiple files). The\n    file is read in the same manner as an input corpus file. See\n    :ref:`data-format-options` for changing the delimiters used for\n    separating compounds and atoms.\n\n\nTraining model options\n~~~~~~~~~~~~~~~~~~~~~~\n\n``-m <mode>, --mode <mode>``\n    Morfessor can run in different modes, each doing different actions on the\n    model. The modes are:\n\n    none\n        Do initialize or train a model. Can be used when just loading a model\n        for segmenting new data\n    init\n        Create new model and load input data. Does not train the model\n    batch\n        Loads an existing model (which is already initialized with training\n        data) and run :ref:`batch-training`\n    init+batch\n        Create a new model, load input data and run :ref:`batch-training`.\n        **Default**\n    online\n        Create a new model, read and train the model concurrently as described\n        in :ref:`online-training`\n    online+batch\n        First read and train the model concurrently as described in\n        :ref:`online-training` and after that retrain the model using\n        :ref:`batch-training`\n\n\n``-a <algorithm>, --algorithm <algorithm>``\n    Algorithm to use for training:\n\n    recursive\n        Recursive as descirbed in :ref:`recursive-training` **Default**\n    viterbi\n        Viterbi as described in :ref:`viterbi-training`\n\n``-d <type>, --dampening <type>``\n    Method for changing the compound counts in the input data. Options:\n\n    none\n        Do not alter the counts of compounds (token based training)\n    log\n        Change the count :math:`x` of a compound to :math:`\\log(x)` (log-token\n        based training)\n    ones\n        Treat all compounds as if they only occured once (type based training)\n\n``-f <list>, --forcesplit <list>``\n    A list of atoms that would always cause the compound to be split. By\n    default only hyphens (``-``) would force a split. Note the notation of the\n    argument list. To have no force split characters, use as an empty string as\n    argument (``-f \"\"``). 
\nSaving model\n~~~~~~~~~~~~\n\n``-s <file>``\n    save :ref:`binary-model-def`\n``-S <file>``\n    save :ref:`morfessor1-model-def`\n``--save-reduced``\n    save :ref:`binary-reduced-model-def`\n\nExamples\n~~~~~~~~\nTraining a model from inputdata.txt, saving a :ref:`morfessor1-model-def` and\nsegmenting the test.txt set: ::\n\n    morfessor -t inputdata.txt -S model.segm -T test.txt\n\nmorfessor-train\n---------------\nThe morfessor-train command is a convenience command that enables easier\ntraining of Morfessor models.\n\nThe basic command structure is: ::\n\n    morfessor-train [arguments] traindata-file [traindata-file ...]\n\nThe arguments are identical to the ones for the `morfessor`_ command. The most\nrelevant are:\n\n``-s <file>``\n    save binary model\n``-S <file>``\n    save Morfessor 1.0 style model\n``--save-reduced``\n    save reduced binary model\n\nExamples\n~~~~~~~~\nTrain a Morfessor model from a word-count list in ISO_8859-15, doing type-based\ntraining, writing the log to logfile and saving the model as model.bin: ::\n\n    morfessor-train --encoding=ISO_8859-15 --traindata-list --logfile=log.log -s model.bin -d ones traindata.txt\n\nmorfessor-segment\n-----------------\nThe morfessor-segment command is a convenience command that enables easier\nsegmentation of test data with a Morfessor model.\n\nThe basic command structure is: ::\n\n    morfessor-segment [arguments] testcorpus-file [testcorpus-file ...]\n\nThe arguments are identical to the ones for the `morfessor`_ command. The most\nrelevant are:\n\n``-l <file>``\n    load binary model (normal or reduced)\n``-L <file>``\n    load Morfessor 1.0 style model\n\nExamples\n~~~~~~~~\nLoading a binary model and segmenting the words in testdata.txt: ::\n\n    morfessor-segment -l model.bin testdata.txt\n\nmorfessor-evaluate\n------------------\nThe morfessor-evaluate command is used for evaluating a Morfessor model against\na gold standard. 
If multiple models are evaluated, it reports statistically\nsignificant differences between them.\n\nThe basic command structure is: ::\n\n    morfessor-evaluate [arguments] <goldstandard> <model> [<model> ...]\n\n\nPositional arguments\n~~~~~~~~~~~~~~~~~~~~\n``<goldstandard>``\n    gold standard file in standard annotation format\n``<model>``\n    model files to evaluate (either binary or Morfessor 1.0 style segmentation\n    models).\n\nOptional arguments\n~~~~~~~~~~~~~~~~~~\n``-t TEST_SEGMENTATIONS, --testsegmentation TEST_SEGMENTATIONS``\n    Segmentation of the test set. Note that all words in the gold standard must\n    be segmented\n\n``--num-samples <int>``\n    number of samples to take for testing\n``--sample-size <int>``\n    size of each test sample\n``--format-string <format>``\n    Python new-style format string used to report evaluation results. The\n    variables consist of a value and an action separated by an underscore,\n    e.g. fscore_avg for the average f-score. The available values are\n    \"precision\", \"recall\", \"fscore\" and \"samplesize\", and the available\n    actions are \"avg\", \"max\", \"min\", \"values\" and \"count\". A final metadata\n    variable (without an action) is \"name\", the filename of the model. See\n    also the format-template option for predefined strings.\n``--format-template <template>``\n    Use a template string for the format-string option. Available templates\n    are: default, table and latex. If format-string is defined, this option is\n    ignored.\n\nExamples\n~~~~~~~~\n\nEvaluating three different models against a gold standard, outputting the\nresults in LaTeX table format: ::\n\n    morfessor-evaluate --format-template=latex goldstd.txt model1.bin model2.segm model3.bin\n\n.. _data-format-options:\n\nData format command line options\n--------------------------------\n\n\n``--encoding <encoding>``\n    Encoding of input and output files (if none is given, both the local\n    encoding and UTF-8 are tried).\n``--lowercase``\n    lowercase input data\n``--traindata-list``\n    input file(s) for batch training are lists (one compound per line,\n    optionally prefixed by a count)\n``--atom-separator <regexp>``\n    atom separator regexp (default None)\n``--compound-separator <regexp>``\n    compound separator regexp (default '\\s+')\n``--analysis-separator <str>``\n    separator for different analyses in an annotation file. Use NONE for only\n    allowing one analysis per line\n``--output-format <format>``\n    format string for --output file (default: '{analysis}\\\\n'). Valid keywords\n    are: ``{analysis}`` = constructions of the compound, ``{compound}`` =\n    compound string, ``{count}`` = count of the compound (currently always 1),\n    ``{logprob}`` = log-probability of the analysis, and ``{clogprob}`` =\n    log-probability of the compound. Valid escape sequences are ``\\n``\n    (newline) and ``\\t`` (tab)\n``--output-format-separator <str>``\n    construction separator for analysis in --output file (default: ' ')\n``--output-newlines``\n    for each newline in input, print newline in --output file (default:\n    'False')\n
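\nThese data format options have counterparts in the library interface. A\nminimal sketch, assuming the ``MorfessorIO`` constructor accepts the encoding\nand separator keyword arguments shown below (an assumption to verify against\nthe IO class documentation): ::\n\n    import morfessor\n\n    # The keyword arguments are assumed to mirror the command line options\n    # above; verify them against the MorfessorIO documentation.\n    io = morfessor.MorfessorIO(encoding='ISO_8859-15',\n                               compound_separator=r'\\s+',\n                               atom_separator=None,\n                               lowercase=True)\n\n    train_data = list(io.read_corpus_file('training_data'))\n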
\n\nUniversal command line options\n------------------------------\n``-v <int>, --verbose <int>``\n    verbose level; controls what is written to the standard error stream or\n    log file (default 1)\n``--logfile <file>``\n    write log messages to file in addition to standard error stream\n``--progressbar``\n    Force the progressbar to be displayed (possibly lowers the log level for\n    the standard error stream)\n``-h, --help``\n    show this help message and exit\n``--version``\n    show version number and exit\n\n\n\nMorfessor features\n==================\n\nAll features below are described in a short format, mainly to guide making the\nright choice for a certain parameter. These features are explained in detail in\nthe :ref:`morfessor-tech-report`.\n\n\n.. _`batch-training`:\n\nBatch training\n--------------\nIn batch training, each epoch consists of an iteration over the full training\ndata. Epochs are repeated until the model cost has converged. All training data\nmust be loaded before the training starts.\n\n.. _`online-training`:\n\nOnline training\n---------------\nIn online training the model is updated while the data is being added. This\nallows for rapid testing and prototyping. All data is processed only once,\nhence it is advisable to run :ref:`batch-training` afterwards. The size of an\nepoch is a fixed, predefined number of compounds processed. The only use of an\nepoch for online training is to select the best annotations in semi-supervised\ntraining.\n\n.. _`recursive-training`:\n\nRecursive training\n------------------\nIn recursive training, each compound is processed in the following manner. The\ncurrent split for the compound is removed from the model and its constructions\nare updated accordingly. After this, all possible splits are tried, by choosing\none split and running the algorithm recursively on the created constructions.\n\nIn the end, the best split is selected and the training continues with the next\ncompound.\n\n.. _`viterbi-training`:\n\nLocal Viterbi training\n----------------------\nIn Local Viterbi training the compounds are processed sequentially. Each\ncompound is removed from the corpus and afterwards segmented using Viterbi\nsegmentation. The result is put back into the model.\n\nIn order to allow new constructions to be created, the smoothing parameter\nmust be given some non-zero value.\n
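\nThrough the library interface, Viterbi segmentation with smoothing looks\nroughly as follows; the ``addcount`` keyword argument is an assumption\ncorresponding to ``--viterbi-smoothing``, so verify the name against the\nmodel class documentation: ::\n\n    import morfessor\n\n    io = morfessor.MorfessorIO()\n    model = io.read_binary_model_file('model.bin')\n\n    # A non-zero smoothing value allows constructions that were not seen\n    # in training (assumed keyword name).\n    print(model.viterbi_segment('kahvikakku', addcount=0.1))\n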
\n.. _`rand-skips`:\n\nRandom skips\n------------\nWith random skips, frequently seen compounds are skipped in training with a\nrandom probability. As shown in the :ref:`morfessor-tech-report`, this speeds\nup the training considerably with only a minor loss in model performance.\n\n.. _`rand-init`:\n\nRandom initialization\n---------------------\nIn random initialization all compounds are split randomly. Each possible\nboundary is made a split with the given probability.\n\nSelecting a good random initialization parameter helps in avoiding poor local\noptima, as long as the split probability is high enough.\n\n.. _`corpusweight`:\n\nCorpusweight (alpha) tuning\n---------------------------\nAn important parameter of the Morfessor Baseline model is the corpusweight\n(:math:`\\alpha`), which balances the cost of the lexicon and the corpus. There\nare different options available for tuning this weight:\n\nFixed weight (``--corpusweight``)\n    The weight is fixed at the beginning of the training and does not change\nDevelopment set (``--develset``)\n    A development set is used to balance the corpusweight so that the precision\n    and recall of segmenting the development set will be equal\nMorph length (``--morph-length``)\n    The corpusweight is tuned so that the average length of morphs in the\n    lexicon will be as desired\nNum morph types (``--num-morph-types``)\n    The corpusweight is tuned so that there will be approximately the desired\n    number of morph types in the lexicon\n
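\nAs a rough library equivalent of fixing the weight, assuming the\n``BaselineModel`` constructor accepts a ``corpusweight`` keyword argument\n(an assumption to verify against the model class documentation): ::\n\n    import morfessor\n\n    io = morfessor.MorfessorIO()\n    train_data = list(io.read_corpus_file('training_data'))\n\n    # corpusweight is assumed to correspond to the --corpusweight option\n    model = morfessor.BaselineModel(corpusweight=2.0)\n    model.load_data(train_data)\n    model.train_batch()\n"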
  },
  {
    "path": "docs/source/conf.py",
    "content": "# -*- coding: utf-8 -*-\n#\n# Morfessor documentation build configuration file, created by\n# sphinx-quickstart on Wed Dec  4 13:41:43 2013.\n#\n# This file is execfile()d with the current directory set to its\n# containing dir.\n#\n# Note that not all possible configuration values are present in this\n# autogenerated file.\n#\n# All configuration values have a default; values that are commented out\n# serve to show the default.\n\nimport sys\nimport os\n\n# If extensions (or modules to document with autodoc) are in another directory,\n# add these directories to sys.path here. If the directory is relative to the\n# documentation root, use os.path.abspath to make it absolute, like shown here.\n#sys.path.insert(0, os.path.abspath('.'))\n\n# -- General configuration ------------------------------------------------\n\n# If your documentation needs a minimal Sphinx version, state it here.\n#needs_sphinx = '1.0'\n\n# Add any Sphinx extension module names here, as strings. They can be\n# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom\n# ones.\nextensions = [\n    'sphinx.ext.autodoc',\n    'sphinx.ext.mathjax',\n    'sphinxcontrib.napoleon',\n]\n\n# Add any paths that contain templates here, relative to this directory.\ntemplates_path = ['_templates']\n\n# The suffix of source filenames.\nsource_suffix = '.rst'\n\n# The encoding of source files.\n#source_encoding = 'utf-8-sig'\n\n# The master toctree document.\nmaster_doc = 'index'\n\n# General information about the project.\nproject = u'Morfessor'\ncopyright = u'2019, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos'\n\n# The version info for the project you're documenting, acts as replacement for\n# |version| and |release|, also used in various other places throughout the\n# built documents.\n#\n# The short X.Y version.\nversion = '2.0'\n# The full version, including alpha/beta/rc tags.\nrelease = '2.0.6'\n\n# The language for content autogenerated by Sphinx. Refer to documentation\n# for a list of supported languages.\n#language = None\n\n# There are two options for replacing |today|: either, you set today to some\n# non-false value, then it is used:\n#today = ''\n# Else, today_fmt is used as the format for a strftime call.\n#today_fmt = '%B %d, %Y'\n\n# List of patterns, relative to source directory, that match files and\n# directories to ignore when looking for source files.\nexclude_patterns = []\n\n# The reST default role (used for this markup: `text`) to use for all\n# documents.\n#default_role = None\n\n# If true, '()' will be appended to :func: etc. cross-reference text.\n#add_function_parentheses = True\n\n# If true, the current module name will be prepended to all description\n# unit titles (such as .. function::).\n#add_module_names = True\n\n# If true, sectionauthor and moduleauthor directives will be shown in the\n# output. They are ignored by default.\n#show_authors = False\n\n# The name of the Pygments (syntax highlighting) style to use.\npygments_style = 'sphinx'\n\n# A list of ignored prefixes for module index sorting.\n#modindex_common_prefix = []\n\n# If true, keep warnings as \"system message\" paragraphs in the built documents.\n#keep_warnings = False\n\n\n# -- Options for HTML output ----------------------------------------------\n\n# The theme to use for HTML and HTML Help pages.  See the documentation for\n# a list of builtin themes.\nhtml_theme = 'default'\n\n# Theme options are theme-specific and customize the look and feel of a theme\n# further.  
For a list of options available for each theme, see the\n# documentation.\n#html_theme_options = {}\n\n# Add any paths that contain custom themes here, relative to this directory.\n#html_theme_path = []\n\n# The name for this set of Sphinx documents.  If None, it defaults to\n# \"<project> v<release> documentation\".\n#html_title = None\n\n# A shorter title for the navigation bar.  Default is the same as html_title.\n#html_short_title = None\n\n# The name of an image file (relative to this directory) to place at the top\n# of the sidebar.\n#html_logo = None\n\n# The name of an image file (within the static path) to use as favicon of the\n# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32\n# pixels large.\n#html_favicon = None\n\n# Add any paths that contain custom static files (such as style sheets) here,\n# relative to this directory. They are copied after the builtin static files,\n# so a file named \"default.css\" will overwrite the builtin \"default.css\".\nhtml_static_path = ['_static']\n\n# Add any extra paths that contain custom files (such as robots.txt or\n# .htaccess) here, relative to this directory. These files are copied\n# directly to the root of the documentation.\n#html_extra_path = []\n\n# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,\n# using the given strftime format.\n#html_last_updated_fmt = '%b %d, %Y'\n\n# If true, SmartyPants will be used to convert quotes and dashes to\n# typographically correct entities.\n#html_use_smartypants = True\n\n# Custom sidebar templates, maps document names to template names.\n#html_sidebars = {}\n\n# Additional templates that should be rendered to pages, maps page names to\n# template names.\n#html_additional_pages = {}\n\n# If false, no module index is generated.\n#html_domain_indices = True\n\n# If false, no index is generated.\n#html_use_index = True\n\n# If true, the index is split into individual pages for each letter.\n#html_split_index = False\n\n# If true, links to the reST sources are added to the pages.\n#html_show_sourcelink = True\n\n# If true, \"Created using Sphinx\" is shown in the HTML footer. Default is True.\n#html_show_sphinx = True\n\n# If true, \"(C) Copyright ...\" is shown in the HTML footer. Default is True.\n#html_show_copyright = True\n\n# If true, an OpenSearch description file will be output, and all pages will\n# contain a <link> tag referring to it.  The value of this option must be the\n# base URL from which the finished HTML is served.\n#html_use_opensearch = ''\n\n# This is the file name suffix for HTML files (e.g. \".xhtml\").\n#html_file_suffix = None\n\n# Output file base name for HTML help builder.\nhtmlhelp_basename = 'Morfessordoc'\n\n\n# -- Options for LaTeX output ---------------------------------------------\n\nlatex_elements = {\n# The paper size ('letterpaper' or 'a4paper').\n#'papersize': 'letterpaper',\n\n# The font size ('10pt', '11pt' or '12pt').\n#'pointsize': '10pt',\n\n# Additional stuff for the LaTeX preamble.\n#'preamble': '',\n}\n\n# Grouping the document tree into LaTeX files. 
List of tuples\n# (source start file, target name, title,\n#  author, documentclass [howto, manual, or own class]).\nlatex_documents = [\n  ('index', 'Morfessor.tex', u'Morfessor Documentation',\n   u'Sami Virpioja and Peter Smit', 'manual'),\n]\n\n# The name of an image file (relative to this directory) to place at the top of\n# the title page.\n#latex_logo = None\n\n# For \"manual\" documents, if this is true, then toplevel headings are parts,\n# not chapters.\n#latex_use_parts = False\n\n# If true, show page references after internal links.\n#latex_show_pagerefs = False\n\n# If true, show URL addresses after external links.\n#latex_show_urls = False\n\n# Documents to append as an appendix to all manuals.\n#latex_appendices = []\n\n# If false, no module index is generated.\n#latex_domain_indices = True\n\n\n# -- Options for manual page output ---------------------------------------\n\n# One entry per manual page. List of tuples\n# (source start file, name, description, authors, manual section).\nman_pages = [\n    ('index', 'morfessor', u'Morfessor Documentation',\n     [u'Sami Virpioja and Peter Smit'], 1)\n]\n\n# If true, show URL addresses after external links.\n#man_show_urls = False\n\n\n# -- Options for Texinfo output -------------------------------------------\n\n# Grouping the document tree into Texinfo files. List of tuples\n# (source start file, target name, title, author,\n#  dir menu entry, description, category)\ntexinfo_documents = [\n  ('index', 'Morfessor', u'Morfessor Documentation',\n   u'Sami Virpioja and Peter Smit', 'Morfessor',\n   'Tool for unsupervised and semi-supervised morphological segmentation.',\n   'Miscellaneous'),\n]\n\n# Documents to append as an appendix to all manuals.\n#texinfo_appendices = []\n\n# If false, no module index is generated.\n#texinfo_domain_indices = True\n\n# How to display URL addresses: 'footnote', 'no', or 'inline'.\n#texinfo_show_urls = 'footnote'\n\n# If true, do not generate a @detailmenu in the \"Top\" node's menu.\n#texinfo_no_detailmenu = False\n"
  },
  {
    "path": "docs/source/filetypes.rst",
    "content": "Morfessor file types\n====================\n\n.. _binary-model-def:\n\nBinary model\n------------\n\n.. warning::\n\n    Pickled models are sensitive to bitrot. Sometimes incompatibilities exist\n    between Python versions that prevent loading a model stored by a different\n    version. Also, next versions of Morfessor are not guaranteed to be able to\n    load models of older versions.\n\nThe standard format for Morfessor 2.0 is a binary model, generated by pickling\nthe :ref:`BaselineModel <baseline-model-label>` object. This ensures that all\ntraining-data, annotation-data and weights are exactly the same as when the\nmodel was saved.\n\n.. _binary-reduced-model-def:\n\nReduced Binary model\n--------------------\nA reduced Morfessor model contains only that information that is necessary for\nsegmenting new words using (nbest) viterbi segmentation. Reduced binary models\nmuch smaller that the full models, but no model modificating actions can be\nperformed.\n\n.. _morfessor1-model-def:\n\nMorfessor 1.0 style text model\n------------------------------\nMorfessor 2.0 also supports the text model files that are used in Morfessor\n1.0. These files consists of one segmentation per line, preceded by a count,\nwhere the constructions are separated by ' + '.\n\nSpecification: ::\n\n    <int><space><CONSTRUCTION>[<space>+<space><CONSTRUCTION>]*\n\nExample: ::\n\n    10 kahvi + kakku\n    5 kahvi + kilo + n\n    24 kahvi + kone + emme\n\nText corpus file\n----------------\nA text corpus file is a free format text-file. All lines are split into\ncompounds using the compound-separator (default <space>). The compounds then\nare split into atoms using the atom-separator. Compounds can occur multiple\ntimes and will be counted as such.\n\nExample: ::\n\n    kavhikakku kahvikilon kahvikilon\n    kahvikoneemme kahvikakku\n\nWord list file\n--------------\nA word list corpus file contains one compound per line, possibly preceded by a\ncount. If multiple entries of the same word occur there counts are summed. If\nno count is given, a count of one is assumed (per entry).\n\nSpecification: ::\n\n    [<int><space>]<COMPOUND>\n\nExample 1: ::\n\n    10 kahvikakku\n    5 kahvikilon\n    24 kahvikoneemme\n\nExample 2: ::\n\n    kahvikakku\n    kahvikilon\n    kahvikoneemme\n\nAnnotation file\n---------------\nAn annotation file contains one compound and one or more annotations per\ncompound on each line. The separators between the annotations (default ', ')\nand between the constructions (default ' ') are configurable.\n\nSpecification: ::\n\n    <compound> <analysis1construction1>[ <analysis1constructionN>][, <analysis2construction1> [<analysis2constructionN>]*]*\n\nExample: ::\n\n    kahvikakku kahvi kakku, kahvi kak ku\n    kahvikilon kahvi kilon\n    kahvikoneemme kahvi konee mme, kah vi ko nee mme\n"
  },
  {
    "path": "docs/source/general.rst",
    "content": "General\n=======\n\n.. _morfessor-tech-report:\n\nMorfessor 2.0 Technical Report\n------------------------------\n\nThe work done in Morfessor 2.0 is described in detail in the Morfessor 2.0\nTechnical Report [TechRep]_. The report is available for download from\nhttp://urn.fi/URN:ISBN:978-952-60-5501-5.\n\n\nTerminology\n-----------\n\nUnlike previous Morfessor implementations, Morfessor 2.0 is, in\nprinciple, applicable to any string segmentation task. Thus we use\nterms that are not specific to morphological segmentation task.\n\nThe task of the algorithm is to find a set of *constructions* that\ndescribe the provided training corpus efficiently and accurately. The\ntraining corpus contains a collection of *compounds*, which are the\nlargest sequences that a single construction can hold. The smallest\npieces of constructions and compounds are called *atoms*.\n\nFor example, in morphological segmentation, compounds are word forms,\nconstructions are morphs, and atoms are characters. In chunking,\ncompounds are sentences, constructions are phrases, and atoms are\nwords.\n\nCiting\n------\n\nThe authors do kindly ask that you cite the Morfessor 2.0 techical report\n [TechRep]_ when using this tool in academic publications.\n\nIn addition, when you refer to the Morfessor algorithms, you should cite the\nrespective publications where they have been introduced. For example, the first\nMorfessor algorithm was published in [Creutz2002]_ and the semi-supervised\nextension in [Kohonen2010]_. See [TechRep]_ for further information on the\nrelevant publications.\n\n.. [TechRep] Sami Virpioja, Peter Smit, Stig-Arne Grönroos, and Mikko Kurimo. Morfessor 2.0: Python Implementation and Extensions for Morfessor Baseline. Aalto University publication series SCIENCE + TECHNOLOGY, 25/2013. Aalto University, Helsinki, 2013. ISBN 978-952-60-5501-5.\n\n.. [Creutz2002] Mathias Creutz and Krista Lagus. Unsupervised discovery of morphemes. In Proceedings of the Workshop on Morphological and Phonological Learning of ACL-02, pages 21-30, Philadelphia, Pennsylvania, 11 July, 2002. \n\n.. [Kohonen2010] Oskar Kohonen, Sami Virpioja and Krista Lagus. Semi-supervised learning of concatenative morphology. In Proceedings of the 11th Meeting of the ACL Special Interest Group on Computational Morphology and Phonology, pages 78-86, Uppsala, Sweden, July 2010. Association for Computational Linguistics.\n\n"
  },
  {
    "path": "docs/source/index.rst",
    "content": ".. Morfessor documentation master file, created by\n   sphinx-quickstart on Wed Dec  4 13:41:43 2013.\n   You can adapt this file completely to your liking, but it should at least\n   contain the root `toctree` directive.\n\nMorfessor 2.0 documentation\n=====================================\n\n.. note:: The Morfessor 2.0 documentation is still a work in progress and\n  contains some unfinished parts\n\n\nContents:\n\n.. toctree::\n   :maxdepth: 2\n\n   license\n   general\n   installation\n   filetypes\n   cmdtools\n   libinterface\n\n\nIndices and tables\n==================\n\n* :ref:`genindex`\n* :ref:`modindex`\n* :ref:`search`\n\n"
  },
  {
    "path": "docs/source/installation.rst",
    "content": "Installation instructions\n=========================\n\nMorfessor 2.0 is installed using setuptools library for Python. Morfessor can\nbe installed from the packages available on the\n`Morpho project homepage`_ and the `Morfessor Github page`_, or can be\ndirectly installed from the `Python Package Index (PyPI)`_.\n\nThe Morfessor packages are created using the current Python packaging\nstandards, as described on http://docs.python.org/install/. Morfessor packages\nare fully compatible with, and recommended to run in, virtual environments as\ndescribed on http://virtualenv.org.\n\n\n\nInstallation from tarball or zip file\n-------------------------------------\n\nThe Morfessor 2.0 tarball and zip files can be downloaded from the\n`Morpho project homepage`_ (latest stable version) or from the\n`Morfessor Github page`_  (all versions).\n\n\n\nThe tarball can be installed in two different ways. The first is to unpack the\ntarball or zip file and run::\n\n    python setup.py install\n\nA second method is to use the tool pip on the tarball or zip file directly::\n\n    pip install morfessor-VERSION.tar.gz\n\n\nInstallation from PyPI\n----------------------\n\nMorfessor 2.0 is also distributed through the `Python Package Index (PyPI)`_.\nThis means that tools like pip and easy_install can automatically download and\ninstall the latest version of Morfessor.\n\nSimply type::\n\n    pip install morfessor\n\nor::\n\n    easy_install morfessor\n\nTo install the morfessor library and tools.\n\n\n.. _Morpho project homepage: http://morpho.aalto.fi\n.. _Morfessor Github page: https://github.com/aalto-speech/morfessor/releases\n.. _Python Package Index (PyPI): https://pypi.python.org/pypi/Morfessor\n"
  },
  {
    "path": "docs/source/libinterface.rst",
    "content": "Python library interface to Morfessor\n=====================================\n\nMorfessor 2.0 contains a library interface in order to be integrated in other\npython applications. The public members are documented below and should remain\nrelatively the same between Morfessor versions. Private members are documented\nin the code and can change anytime in releases.\n\nThe classes are documented below.\n\nIO class\n--------\n.. automodule:: morfessor.io\n   :members:\n\n.. _baseline-model-label:\n\nModel classes\n-------------\n.. automodule:: morfessor.baseline\n   :members:\n\nEvaluation classes\n------------------\n.. automodule:: morfessor.evaluation\n   :members:\n\n\nCode Examples for using library interface\n=========================================\n\nSegmenting new data using an existing model\n-------------------------------------------\n::\n\n    import morfessor\n\n    io = morfessor.MorfessorIO()\n\n    model = io.read_binary_model_file('model.bin')\n\n    words = ['words', 'segmenting', 'morfessor', 'unsupervised']\n\n    for word in words:\n        print(model.viterbi_segment(word))\n\n\nTesting type vs token models\n----------------------------\n::\n\n    import morfessor\n\n    io = morfessor.MorfessorIO()\n\n    train_data = list(io.read_corpus_file('training_data'))\n\n    model_types = morfessor.BaselineModel()\n    model_logtokens = morfessor.BaselineModel()\n    model_tokens = morfessor.BaselineModel()\n\n    model_types.load_data(train_data, count_modifier=lambda x: 1)\n    def log_func(x):\n        return int(round(math.log(x + 1, 2)))\n    model_logtokens.load_data(train_data, count_modifier=log_func)\n    model_tokens.load_data(train_data)\n\n    models = [model_types, model_logtokens, model_tokens]\n\n    for model in models:\n        model.train_batch()\n\n    goldstd_data = io.read_annotations_file('gold_std')\n    ev = morfessor.MorfessorEvaluation(goldstd_data)\n    results = [ev.evaluate_model(m) for m in models]\n\n    wsr = morfessor.WilcoxonSignedRank()\n    r = wsr.significance_test(results)\n    WilcoxonSignedRank.print_table(r)\n\nThe equivalent of this on the command line would be: ::\n\n    morfessor-train -s model_types -d ones training_data\n    morfessor-train -s model_logtokens -d log training_data\n    morfessor-train -s model_tokens training_data\n\n    morfessor-evaluate gold_std morfessor-train morfessor-train morfessor-train\n\n\nTesting different amounts of supervision data\n---------------------------------------------\n\n"
  },
  {
    "path": "docs/source/license.rst",
    "content": "License\n=======\nCopyright (c) 2012-2019, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions\nare met:\n\n1. Redistributions of source code must retain the above copyright\n   notice, this list of conditions and the following disclaimer.\n2. Redistributions in binary form must reproduce the above copyright\n   notice, this list of conditions and the following disclaimer in the\n   documentation and/or other materials provided with the distribution.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS\nFOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE\nCOPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,\nINCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\nBUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\nLOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT\nLIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN\nANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\nPOSSIBILITY OF SUCH DAMAGE.\n"
  },
  {
    "path": "ez_setup.py",
    "content": "#!python\n\"\"\"Bootstrap setuptools installation\n\nIf you want to use setuptools in your package's setup.py, just include this\nfile in the same directory with it, and add this to the top of your setup.py::\n\n    from ez_setup import use_setuptools\n    use_setuptools()\n\nIf you want to require a specific version of setuptools, set a download\nmirror, or use an alternate download directory, you can do so by supplying\nthe appropriate options to ``use_setuptools()``.\n\nThis file can also be run as a script to install or upgrade setuptools.\n\"\"\"\nimport os\nimport shutil\nimport sys\nimport tempfile\nimport tarfile\nimport optparse\nimport subprocess\n\nfrom distutils import log\n\ntry:\n    from site import USER_SITE\nexcept ImportError:\n    USER_SITE = None\n\nDEFAULT_VERSION = \"0.9.6\"\nDEFAULT_URL = \"https://pypi.python.org/packages/source/s/setuptools/\"\n\ndef _python_cmd(*args):\n    args = (sys.executable,) + args\n    return subprocess.call(args) == 0\n\ndef _install(tarball, install_args=()):\n    # extracting the tarball\n    tmpdir = tempfile.mkdtemp()\n    log.warn('Extracting in %s', tmpdir)\n    old_wd = os.getcwd()\n    try:\n        os.chdir(tmpdir)\n        tar = tarfile.open(tarball)\n        _extractall(tar)\n        tar.close()\n\n        # going in the directory\n        subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])\n        os.chdir(subdir)\n        log.warn('Now working in %s', subdir)\n\n        # installing\n        log.warn('Installing Setuptools')\n        if not _python_cmd('setup.py', 'install', *install_args):\n            log.warn('Something went wrong during the installation.')\n            log.warn('See the error message above.')\n            # exitcode will be 2\n            return 2\n    finally:\n        os.chdir(old_wd)\n        shutil.rmtree(tmpdir)\n\n\ndef _build_egg(egg, tarball, to_dir):\n    # extracting the tarball\n    tmpdir = tempfile.mkdtemp()\n    log.warn('Extracting in %s', tmpdir)\n    old_wd = os.getcwd()\n    try:\n        os.chdir(tmpdir)\n        tar = tarfile.open(tarball)\n        _extractall(tar)\n        tar.close()\n\n        # going in the directory\n        subdir = os.path.join(tmpdir, os.listdir(tmpdir)[0])\n        os.chdir(subdir)\n        log.warn('Now working in %s', subdir)\n\n        # building an egg\n        log.warn('Building a Setuptools egg in %s', to_dir)\n        _python_cmd('setup.py', '-q', 'bdist_egg', '--dist-dir', to_dir)\n\n    finally:\n        os.chdir(old_wd)\n        shutil.rmtree(tmpdir)\n    # returning the result\n    log.warn(egg)\n    if not os.path.exists(egg):\n        raise IOError('Could not build the egg.')\n\n\ndef _do_download(version, download_base, to_dir, download_delay):\n    egg = os.path.join(to_dir, 'setuptools-%s-py%d.%d.egg'\n                       % (version, sys.version_info[0], sys.version_info[1]))\n    if not os.path.exists(egg):\n        tarball = download_setuptools(version, download_base,\n                                      to_dir, download_delay)\n        _build_egg(egg, tarball, to_dir)\n    sys.path.insert(0, egg)\n    import setuptools\n    setuptools.bootstrap_install_from = egg\n\n\ndef use_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,\n                   to_dir=os.curdir, download_delay=15):\n    # making sure we use the absolute path\n    to_dir = os.path.abspath(to_dir)\n    was_imported = 'pkg_resources' in sys.modules or \\\n        'setuptools' in sys.modules\n    try:\n        import pkg_resources\n    except 
ImportError:\n        return _do_download(version, download_base, to_dir, download_delay)\n    try:\n        pkg_resources.require(\"setuptools>=\" + version)\n        return\n    except pkg_resources.VersionConflict:\n        e = sys.exc_info()[1]\n        if was_imported:\n            sys.stderr.write(\n            \"The required version of setuptools (>=%s) is not available,\\n\"\n            \"and can't be installed while this script is running. Please\\n\"\n            \"install a more recent version first, using\\n\"\n            \"'easy_install -U setuptools'.\"\n            \"\\n\\n(Currently using %r)\\n\" % (version, e.args[0]))\n            sys.exit(2)\n        else:\n            del pkg_resources, sys.modules['pkg_resources']    # reload ok\n            return _do_download(version, download_base, to_dir,\n                                download_delay)\n    except pkg_resources.DistributionNotFound:\n        return _do_download(version, download_base, to_dir,\n                            download_delay)\n\n\ndef download_setuptools(version=DEFAULT_VERSION, download_base=DEFAULT_URL,\n                        to_dir=os.curdir, delay=15):\n    \"\"\"Download setuptools from a specified location and return its filename\n\n    `version` should be a valid setuptools version number that is available\n    as an egg for download under the `download_base` URL (which should end\n    with a '/'). `to_dir` is the directory where the egg will be downloaded.\n    `delay` is the number of seconds to pause before an actual download\n    attempt.\n    \"\"\"\n    # making sure we use the absolute path\n    to_dir = os.path.abspath(to_dir)\n    try:\n        from urllib.request import urlopen\n    except ImportError:\n        from urllib2 import urlopen\n    tgz_name = \"setuptools-%s.tar.gz\" % version\n    url = download_base + tgz_name\n    saveto = os.path.join(to_dir, tgz_name)\n    src = dst = None\n    if not os.path.exists(saveto):  # Avoid repeated downloads\n        try:\n            log.warn(\"Downloading %s\", url)\n            src = urlopen(url)\n            # Read/write all in one block, so we don't create a corrupt file\n            # if the download is interrupted.\n            data = src.read()\n            dst = open(saveto, \"wb\")\n            dst.write(data)\n        finally:\n            if src:\n                src.close()\n            if dst:\n                dst.close()\n    return os.path.realpath(saveto)\n\n\ndef _extractall(self, path=\".\", members=None):\n    \"\"\"Extract all members from the archive to the current working\n       directory and set owner, modification time and permissions on\n       directories afterwards. `path' specifies a different directory\n       to extract to. 
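(This function is a local copy of TarFile.extractall, carried here\n       because the tarfile module in very old Python versions does not\n       provide it.) 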
`members' is optional and must be a subset of the\n       list returned by getmembers().\n    \"\"\"\n    import copy\n    import operator\n    from tarfile import ExtractError\n    directories = []\n\n    if members is None:\n        members = self\n\n    for tarinfo in members:\n        if tarinfo.isdir():\n            # Extract directories with a safe mode.\n            directories.append(tarinfo)\n            tarinfo = copy.copy(tarinfo)\n            tarinfo.mode = 448  # decimal for oct 0700\n        self.extract(tarinfo, path)\n\n    # Reverse sort directories.\n    if sys.version_info < (2, 4):\n        def sorter(dir1, dir2):\n            return cmp(dir1.name, dir2.name)\n        directories.sort(sorter)\n        directories.reverse()\n    else:\n        directories.sort(key=operator.attrgetter('name'), reverse=True)\n\n    # Set correct owner, mtime and filemode on directories.\n    for tarinfo in directories:\n        dirpath = os.path.join(path, tarinfo.name)\n        try:\n            self.chown(tarinfo, dirpath)\n            self.utime(tarinfo, dirpath)\n            self.chmod(tarinfo, dirpath)\n        except ExtractError:\n            e = sys.exc_info()[1]\n            if self.errorlevel > 1:\n                raise\n            else:\n                self._dbg(1, \"tarfile: %s\" % e)\n\n\ndef _build_install_args(options):\n    \"\"\"\n    Build the arguments to 'python setup.py install' on the setuptools package\n    \"\"\"\n    install_args = []\n    if options.user_install:\n        if sys.version_info < (2, 6):\n            log.warn(\"--user requires Python 2.6 or later\")\n            raise SystemExit(1)\n        install_args.append('--user')\n    return install_args\n\ndef _parse_args():\n    \"\"\"\n    Parse the command line for options\n    \"\"\"\n    parser = optparse.OptionParser()\n    parser.add_option(\n        '--user', dest='user_install', action='store_true', default=False,\n        help='install in user site package (requires Python 2.6 or later)')\n    parser.add_option(\n        '--download-base', dest='download_base', metavar=\"URL\",\n        default=DEFAULT_URL,\n        help='alternative URL from where to download the setuptools package')\n    options, args = parser.parse_args()\n    # positional arguments are ignored\n    return options\n\ndef main(version=DEFAULT_VERSION):\n    \"\"\"Install or upgrade setuptools and EasyInstall\"\"\"\n    options = _parse_args()\n    tarball = download_setuptools(download_base=options.download_base)\n    return _install(tarball, _build_install_args(options))\n\nif __name__ == '__main__':\n    sys.exit(main())\n"
  },
  {
    "path": "morfessor/__init__.py",
    "content": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n\"\"\"\nMorfessor 2.0 - Python implementation of the Morfessor method\n\"\"\"\nimport logging\n\n\n__all__ = ['MorfessorException', 'ArgumentException', 'MorfessorIO',\n           'BaselineModel', 'main', 'get_default_argparser', 'main_evaluation',\n           'get_evaluation_argparser']\n\n__version__ = '2.0.6'\n__author__ = 'Sami Virpioja, Peter Smit, Stig-Arne Grönroos'\n__author_email__ = \"morpho@aalto.fi\"\n\n_logger = logging.getLogger(__name__)\n\n\ndef get_version():\n    return __version__\n\n# The public api imports need to be at the end of the file,\n# so that the package global names are available to the modules\n# when they are imported.\n\nfrom .baseline import BaselineModel, FixedCorpusWeight, AnnotationCorpusWeight, \\\n    NumMorphCorpusWeight, MorphLengthCorpusWeight\nfrom .cmd import main, get_default_argparser, main_evaluation, \\\n    get_evaluation_argparser\nfrom .exception import MorfessorException, ArgumentException\nfrom .io import MorfessorIO\nfrom .utils import _progress\nfrom .evaluation import MorfessorEvaluation, MorfessorEvaluationResult\n"
  },
  {
    "path": "morfessor/baseline.py",
    "content": "import collections\nimport heapq\nimport logging\nimport math\nimport numbers\nimport random\nimport re\n\nfrom .utils import _progress, _is_string\nfrom .exception import MorfessorException, SegmentOnlyModelException\n\n_logger = logging.getLogger(__name__)\n\n\ndef _constructions_to_str(constructions):\n    \"\"\"Return a readable string for a list of constructions.\"\"\"\n    if _is_string(constructions[0]):\n        # Constructions are strings\n        return ' + '.join(constructions)\n    else:\n        # Constructions are not strings (should be tuples of strings)\n        return ' + '.join(map(lambda x: ' '.join(x), constructions))\n\n\n# rcount = root count (from corpus)\n# count = total count of the node\n# splitloc = integer or tuple. Location(s) of the possible splits for virtual\n#            constructions; empty tuple or 0 if real construction\nConstrNode = collections.namedtuple('ConstrNode',\n                                    ['rcount', 'count', 'splitloc'])\n\n\nclass BaselineModel(object):\n    \"\"\"Morfessor Baseline model class.\n\n    Implements training of and segmenting with a Morfessor model. The model\n    is complete agnostic to whether it is used with lists of strings (finding\n    phrases in sentences) or strings of characters (finding morphs in words).\n\n    \"\"\"\n\n    penalty = -9999.9\n\n    def __init__(self, forcesplit_list=None, corpusweight=None,\n                 use_skips=False, nosplit_re=None):\n        \"\"\"Initialize a new model instance.\n\n        Arguments:\n            forcesplit_list: force segmentations on the characters in\n                               the given list\n            corpusweight: weight for the corpus cost\n            use_skips: randomly skip frequently occurring constructions\n                         to speed up training\n            nosplit_re: regular expression string for preventing splitting\n                          in certain contexts\n\n        \"\"\"\n\n        # In analyses for each construction a ConstrNode is stored. All\n        # training data has a rcount (real count) > 0. 
All real morphemes\n        # have no split locations.\n        self._analyses = {}\n\n        # Flag to indicate the model is only useful for segmentation\n        self._segment_only = False\n\n        # Cost variables\n        self._lexicon_coding = LexiconEncoding()\n        self._corpus_coding = CorpusEncoding(self._lexicon_coding)\n        self._annot_coding = None\n\n        # Set corpus weight updater\n        self.set_corpus_weight_updater(corpusweight)\n\n        # Configuration variables\n        self._use_skips = use_skips  # Random skips for frequent constructions\n        self._supervised = False\n\n        # Counter for random skipping\n        self._counter = collections.Counter()\n        if forcesplit_list is None:\n            self.forcesplit_list = []\n        else:\n            self.forcesplit_list = forcesplit_list\n        if nosplit_re is None:\n            self.nosplit_re = None\n        else:\n            self.nosplit_re = re.compile(nosplit_re, re.UNICODE)\n\n        # Used only for (semi-)supervised learning\n        self.annotations = None\n\n    def set_corpus_weight_updater(self, corpus_weight):\n        if corpus_weight is None:\n            self._corpus_weight_updater = FixedCorpusWeight(1.0)\n        elif isinstance(corpus_weight, numbers.Number):\n            self._corpus_weight_updater = FixedCorpusWeight(corpus_weight)\n        else:\n            self._corpus_weight_updater = corpus_weight\n\n        self._corpus_weight_updater.update(self, 0)\n\n    def _check_segment_only(self):\n        if self._segment_only:\n            raise SegmentOnlyModelException()\n\n    @property\n    def tokens(self):\n        \"\"\"Return the number of construction tokens.\"\"\"\n        return self._corpus_coding.tokens\n\n    @property\n    def types(self):\n        \"\"\"Return the number of construction types.\"\"\"\n        return self._corpus_coding.types - 1  # do not include boundary\n\n    def _add_compound(self, compound, c):\n        \"\"\"Add compound with count c to data.\"\"\"\n        self._corpus_coding.boundaries += c\n        self._modify_construction_count(compound, c)\n        oldrc = self._analyses[compound].rcount\n        self._analyses[compound] = \\\n            self._analyses[compound]._replace(rcount=oldrc + c)\n\n    def _remove(self, construction):\n        \"\"\"Remove construction from model.\"\"\"\n        rcount, count, splitloc = self._analyses[construction]\n        self._modify_construction_count(construction, -count)\n        return rcount, count\n\n    def _random_split(self, compound, threshold):\n        \"\"\"Return a random split for compound.\n\n        Arguments:\n            compound: compound to split\n            threshold: probability of splitting at each position\n\n        \"\"\"\n        splitloc = tuple(i for i in range(1, len(compound))\n                         if random.random() < threshold)\n        return self._splitloc_to_segmentation(compound, splitloc)\n\n    def _set_compound_analysis(self, compound, parts, ptype='rbranch'):\n        \"\"\"Set analysis of compound according to the given segmentation.\n\n        Arguments:\n            compound: compound to split\n            parts: desired constructions of the compound\n            ptype: type of the parse tree to use\n\n        If ptype is 'rbranch', the analysis is stored internally as a\n        right-branching tree. 
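For example, with\n        ptype 'rbranch' the parts ('seg', 'ment', 'ed') of 'segmented' would\n        be stored as seg + (ment + ed). 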
If ptype is 'flat', the analysis is stored\n        directly to the compound's node.\n\n        \"\"\"\n        if len(parts) == 1:\n            rcount, count = self._remove(compound)\n            self._analyses[compound] = ConstrNode(rcount, 0, tuple())\n            self._modify_construction_count(compound, count)\n        elif ptype == 'flat':\n            rcount, count = self._remove(compound)\n            splitloc = self.segmentation_to_splitloc(parts)\n            self._analyses[compound] = ConstrNode(rcount, count, splitloc)\n            for constr in parts:\n                self._modify_construction_count(constr, count)\n        elif ptype == 'rbranch':\n            construction = compound\n            for p in range(len(parts)):\n                rcount, count = self._remove(construction)\n                prefix = parts[p]\n                if p == len(parts) - 1:\n                    self._analyses[construction] = ConstrNode(rcount, 0,\n                                                              0)\n                    self._modify_construction_count(construction, count)\n                else:\n                    suffix = self._join_constructions(parts[p + 1:])\n                    self._analyses[construction] = ConstrNode(rcount, count,\n                                                              len(prefix))\n                    self._modify_construction_count(prefix, count)\n                    self._modify_construction_count(suffix, count)\n                    construction = suffix\n        else:\n            raise MorfessorException(\"Unknown parse type '%s'\" % ptype)\n\n    def _update_annotation_choices(self):\n        \"\"\"Update the selection of alternative analyses in annotations.\n\n        For semi-supervised models, select the most likely alternative\n        analyses included in the annotations of the compounds.\n\n        \"\"\"\n        if not self._supervised:\n            return\n\n        # Collect constructions from the most probable segmentations\n        # and add missing compounds also to the unannotated data\n        constructions = collections.Counter()\n        for compound, alternatives in self.annotations.items():\n            if compound not in self._analyses:\n                self._add_compound(compound, 1)\n\n            analysis, cost = self._best_analysis(alternatives)\n            for m in analysis:\n                constructions[m] += self._analyses[compound].rcount\n\n        # Apply the selected constructions in annotated corpus coding\n        self._annot_coding.set_constructions(constructions)\n        for m, f in constructions.items():\n            count = 0\n            if m in self._analyses and not self._analyses[m].splitloc:\n                count = self._analyses[m].count\n            self._annot_coding.set_count(m, count)\n\n    def _best_analysis(self, choices):\n        \"\"\"Select the best analysis out of the given choices.\"\"\"\n        bestcost = None\n        bestanalysis = None\n        for analysis in choices:\n            cost = 0.0\n            for m in analysis:\n                if m in self._analyses and not self._analyses[m].splitloc:\n                    cost += (math.log(self._corpus_coding.tokens) -\n                             math.log(self._analyses[m].count))\n                else:\n                    cost -= self.penalty  # penalty is negative\n            if bestcost is None or cost < bestcost:\n                bestcost = cost\n                bestanalysis = analysis\n        return bestanalysis, 
bestcost\n\n    def _force_split(self, compound):\n        \"\"\"Return forced split of the compound.\"\"\"\n        if len(self.forcesplit_list) == 0:\n            return [compound]\n        clen = len(compound)\n        j = 0\n        parts = []\n        for i in range(0, clen):\n            if compound[i] in self.forcesplit_list:\n                if len(compound[j:i]) > 0:\n                    parts.append(compound[j:i])\n                parts.append(compound[i:i + 1])\n                j = i + 1\n        if j < clen:\n            parts.append(compound[j:])\n        return [p for p in parts if len(p) > 0]\n\n    def _test_skip(self, construction):\n        \"\"\"Return true if construction should be skipped.\"\"\"\n        if construction in self._counter:\n            t = self._counter[construction]\n            if random.random() > 1.0 / max(1, t):\n                return True\n        self._counter[construction] += 1\n        return False\n\n    def _viterbi_optimize(self, compound, addcount=0, maxlen=30):\n        \"\"\"Optimize segmentation of the compound using the Viterbi algorithm.\n\n        Arguments:\n          compound: compound to optimize\n          addcount: constant for additive smoothing of Viterbi probs\n          maxlen: maximum length for a construction\n\n        Returns list of segments.\n\n        \"\"\"\n        clen = len(compound)\n        if clen == 1:  # Single atom\n            return [compound]\n        if self._use_skips and self._test_skip(compound):\n            return self.segment(compound)\n        # Collect forced subsegments\n        parts = self._force_split(compound)\n        # Use Viterbi algorithm to optimize the subsegments\n        constructions = []\n        for part in parts:\n            constructions += self.viterbi_segment(part, addcount=addcount,\n                                                  maxlen=maxlen)[0]\n        self._set_compound_analysis(compound, constructions, ptype='flat')\n        return constructions\n\n    def _recursive_optimize(self, compound):\n        \"\"\"Optimize segmentation of the compound using recursive splitting.\n\n        Returns list of segments.\n\n        \"\"\"\n        if len(compound) == 1:  # Single atom\n            return [compound]\n        if self._use_skips and self._test_skip(compound):\n            return self.segment(compound)\n        # Collect forced subsegments\n        parts = self._force_split(compound)\n        if len(parts) == 1:\n            # just one part\n            return self._recursive_split(compound)\n        self._set_compound_analysis(compound, parts)\n        # Use recursive algorithm to optimize the subsegments\n        constructions = []\n        for part in parts:\n            constructions += self._recursive_split(part)\n        return constructions\n\n    def _recursive_split(self, construction):\n        \"\"\"Optimize segmentation of the construction by recursive splitting.\n\n        Returns list of segments.\n\n        \"\"\"\n        if len(construction) == 1:  # Single atom\n            return [construction]\n        if self._use_skips and self._test_skip(construction):\n            return self.segment(construction)\n        rcount, count = self._remove(construction)\n\n        # Check all binary splits and no split\n        self._modify_construction_count(construction, count)\n        mincost = self.get_cost()\n        self._modify_construction_count(construction, -count)\n        splitloc = 0\n        for i in range(1, len(construction)):\n            if 
(self.nosplit_re and\n                    self.nosplit_re.match(construction[(i - 1):(i + 1)])):\n                continue\n            prefix = construction[:i]\n            suffix = construction[i:]\n            self._modify_construction_count(prefix, count)\n            self._modify_construction_count(suffix, count)\n            cost = self.get_cost()\n            self._modify_construction_count(prefix, -count)\n            self._modify_construction_count(suffix, -count)\n            if cost <= mincost:\n                mincost = cost\n                splitloc = i\n\n        if splitloc:\n            # Virtual construction\n            self._analyses[construction] = ConstrNode(rcount, count,\n                                                      splitloc)\n            prefix = construction[:splitloc]\n            suffix = construction[splitloc:]\n            self._modify_construction_count(prefix, count)\n            self._modify_construction_count(suffix, count)\n            lp = self._recursive_split(prefix)\n            if suffix != prefix:\n                return lp + self._recursive_split(suffix)\n            else:\n                return lp + lp\n        else:\n            # Real construction\n            self._analyses[construction] = ConstrNode(rcount, 0, tuple())\n            self._modify_construction_count(construction, count)\n            return [construction]\n\n    def _modify_construction_count(self, construction, dcount):\n        \"\"\"Modify the count of construction by dcount.\n\n        For virtual constructions, recurses to child nodes in the\n        tree. For real constructions, adds/removes construction\n        to/from the lexicon whenever necessary.\n\n        \"\"\"\n        if construction in self._analyses:\n            rcount, count, splitloc = self._analyses[construction]\n        else:\n            rcount, count, splitloc = 0, 0, 0\n        newcount = count + dcount\n        if newcount == 0:\n            del self._analyses[construction]\n        else:\n            self._analyses[construction] = ConstrNode(rcount, newcount,\n                                                      splitloc)\n        if splitloc:\n            # Virtual construction\n            children = self._splitloc_to_segmentation(construction, splitloc)\n            for child in children:\n                self._modify_construction_count(child, dcount)\n        else:\n            # Real construction\n            self._corpus_coding.update_count(construction, count, newcount)\n            if self._supervised:\n                self._annot_coding.update_count(construction, count, newcount)\n\n            if count == 0 and newcount > 0:\n                self._lexicon_coding.add(construction)\n            elif count > 0 and newcount == 0:\n                self._lexicon_coding.remove(construction)\n\n    def _epoch_update(self, epoch_num):\n        \"\"\"Do model updates that are necessary between training epochs.\n\n        The argument is the number of training epochs finished.\n\n        In practice, this does two things:\n        - If random skipping is in use, reset construction counters.\n        - If semi-supervised learning is in use and there are alternative\n          analyses in the annotated data, select the annotations that are\n          most likely given the model parameters. 
If not hand-set, update\n          the weight of the annotated corpus.\n\n        This method should also be run prior to training (with the\n        epoch number argument as 0).\n\n        \"\"\"\n        forced_epochs = 0\n        if self._corpus_weight_updater.update(self, epoch_num):\n            forced_epochs += 2\n\n        if self._use_skips:\n            self._counter = collections.Counter()\n        if self._supervised:\n            self._update_annotation_choices()\n            self._annot_coding.update_weight()\n\n        return forced_epochs\n\n    @staticmethod\n    def segmentation_to_splitloc(constructions):\n        \"\"\"Return a list of split locations for a segmented compound.\"\"\"\n        splitloc = []\n        i = 0\n        for c in constructions:\n            i += len(c)\n            splitloc.append(i)\n        return tuple(splitloc[:-1])\n\n    @staticmethod\n    def _splitloc_to_segmentation(compound, splitloc):\n        \"\"\"Return segmentation corresponding to the list of split locations.\"\"\"\n        if isinstance(splitloc, numbers.Number):\n            return [compound[:splitloc], compound[splitloc:]]\n        parts = []\n        startpos = 0\n        endpos = 0\n        for i in range(len(splitloc)):\n            endpos = splitloc[i]\n            parts.append(compound[startpos:endpos])\n            startpos = endpos\n        parts.append(compound[endpos:])\n        return parts\n\n    @staticmethod\n    def _join_constructions(constructions):\n        \"\"\"Concatenate the constructions by addition. Works for both\n        lists and strings.\"\"\"\n        result = type(constructions[0])()\n        for c in constructions:\n            result += c\n        return result\n\n    def get_compounds(self):\n        \"\"\"Return the compound types stored by the model.\"\"\"\n        self._check_segment_only()\n        return [w for w, node in self._analyses.items()\n                if node.rcount > 0]\n\n    def get_constructions(self):\n        \"\"\"Return a list of the present constructions and their counts.\"\"\"\n        return sorted((c, node.count) for c, node in self._analyses.items()\n                      if not node.splitloc)\n\n    def get_cost(self):\n        \"\"\"Return current model encoding cost.\"\"\"\n        cost = self._corpus_coding.get_cost() + self._lexicon_coding.get_cost()\n        if self._supervised:\n            return cost + self._annot_coding.get_cost()\n        else:\n            return cost\n\n    def get_segmentations(self):\n        \"\"\"Retrieve segmentations for all compounds encoded by the model.\"\"\"\n        self._check_segment_only()\n        for w in sorted(self._analyses.keys()):\n            c = self._analyses[w].rcount\n            if c > 0:\n                yield c, w, self.segment(w)\n\n    def load_data(self, data, freqthreshold=1, count_modifier=None,\n                  init_rand_split=None):\n        \"\"\"Load data to initialize the model for batch training.\n\n        Arguments:\n            data: iterator of (count, compound_atoms) tuples\n            freqthreshold: discard compounds that occur fewer than\n                             freqthreshold times in the corpus (default 1)\n            count_modifier: function for adjusting the counts of each\n                              compound\n            init_rand_split: If given, randomly split the word with\n                               init_rand_split as the probability for each\n                               split\n\n        Adds the compounds in the 
corpus to the model lexicon. Returns\n        the total cost.\n\n        \"\"\"\n        self._check_segment_only()\n        totalcount = collections.Counter()\n        for count, atoms in data:\n            if len(atoms) > 0:\n                totalcount[atoms] += count\n\n        for atoms, count in totalcount.items():\n            if count < freqthreshold:\n                continue\n            if count_modifier is not None:\n                self._add_compound(atoms, count_modifier(count))\n            else:\n                self._add_compound(atoms, count)\n\n            if init_rand_split is not None and init_rand_split > 0:\n                parts = self._random_split(atoms, init_rand_split)\n                self._set_compound_analysis(atoms, parts)\n\n        return self.get_cost()\n\n    def load_segmentations(self, segmentations):\n        \"\"\"Load model from existing segmentations.\n\n        The argument should be an iterator providing a count, a\n        compound, and its segmentation.\n\n        \"\"\"\n        self._check_segment_only()\n        for count, compound, segmentation in segmentations:\n            self._add_compound(compound, count)\n            self._set_compound_analysis(compound, segmentation)\n\n    def set_annotations(self, annotations, annotatedcorpusweight=None):\n        \"\"\"Prepare model for semi-supervised learning with given\n         annotations.\n\n         \"\"\"\n        self._check_segment_only()\n        self._supervised = True\n        self.annotations = annotations\n        self._annot_coding = AnnotatedCorpusEncoding(self._corpus_coding,\n                                                     weight=\n                                                     annotatedcorpusweight)\n        self._annot_coding.boundaries = len(self.annotations)\n\n    def segment(self, compound):\n        \"\"\"Segment the compound by looking it up in the model analyses.\n\n        Raises KeyError if compound is not present in the training\n        data. For segmenting new words, use viterbi_segment(compound).\n\n        \"\"\"\n        self._check_segment_only()\n        rcount, count, splitloc = self._analyses[compound]\n        constructions = []\n        if splitloc:\n            for child in self._splitloc_to_segmentation(compound,\n                                                        splitloc):\n                constructions += self.segment(child)\n        else:\n            constructions.append(compound)\n        return constructions\n\n    def train_batch(self, algorithm='recursive', algorithm_params=(),\n                    finish_threshold=0.005, max_epochs=None):\n        \"\"\"Train the model in batch fashion.\n\n        The model is trained with the data already loaded into the model (by\n        using an existing model or calling one of the load_ methods).\n\n        In each iteration (epoch) all compounds in the training data are\n        optimized once, in a random order. If applicable, corpus weight,\n        annotation cost, and random split counters are recalculated after\n        each iteration.\n\n        Arguments:\n            algorithm: string in ('recursive', 'viterbi') that indicates\n                         the splitting algorithm used.\n            algorithm_params: parameters passed to the splitting algorithm.\n            finish_threshold: the stopping threshold. 
Training stops when\n                                the improvement of the last iteration is\n                                smaller than finish_threshold * #boundaries\n            max_epochs: maximum number of epochs to train\n\n        \"\"\"\n        epochs = 0\n        forced_epochs = max(1, self._epoch_update(epochs))\n        newcost = self.get_cost()\n        compounds = list(self.get_compounds())\n        _logger.info(\"Compounds in training data: %s types / %s tokens\",\n                     len(compounds), self._corpus_coding.boundaries)\n\n        _logger.info(\"Starting batch training\")\n        _logger.info(\"Epochs: %s\\tCost: %s\", epochs, newcost)\n\n        while True:\n            # One epoch\n            random.shuffle(compounds)\n\n            for w in _progress(compounds):\n                if algorithm == 'recursive':\n                    segments = self._recursive_optimize(w, *algorithm_params)\n                elif algorithm == 'viterbi':\n                    segments = self._viterbi_optimize(w, *algorithm_params)\n                else:\n                    raise MorfessorException(\"unknown algorithm '%s'\" %\n                                             algorithm)\n                _logger.debug(\"#%s -> %s\", w, _constructions_to_str(segments))\n            epochs += 1\n\n            _logger.debug(\"Cost before epoch update: %s\", self.get_cost())\n            forced_epochs = max(forced_epochs, self._epoch_update(epochs))\n            oldcost = newcost\n            newcost = self.get_cost()\n\n            _logger.info(\"Epochs: %s\\tCost: %s\", epochs, newcost)\n            if (forced_epochs == 0 and\n                    newcost >= oldcost - finish_threshold *\n                    self._corpus_coding.boundaries):\n                break\n            if forced_epochs > 0:\n                forced_epochs -= 1\n            if max_epochs is not None and epochs >= max_epochs:\n                _logger.info(\"Max number of epochs reached, stop training\")\n                break\n        _logger.info(\"Done.\")\n        return epochs, newcost\n\n    def train_online(self, data, count_modifier=None, epoch_interval=10000,\n                     algorithm='recursive', algorithm_params=(),\n                     init_rand_split=None, max_epochs=None):\n        \"\"\"Train the model in online fashion.\n\n        The model is trained with the data provided in the data argument.\n        For example, the data could come from a generator linked to standard\n        input for live monitoring of the splitting.\n\n        All compounds from data are only optimized once. After online\n        training, batch training could be used for further optimization.\n\n        Epochs are defined as a fixed number of compounds. After each epoch (as\n        in batch training), the annotation cost and random split counters\n        are recalculated if applicable.\n\n        Arguments:\n            data: iterator of (_, compound_atoms) tuples. 
The first\n                    argument is ignored, as every occurrence of the\n                    compound is taken with count 1\n            count_modifier: function for adjusting the counts of each\n                              compound\n            epoch_interval: number of compounds to process before starting\n                              a new epoch\n            algorithm: string in ('recursive', 'viterbi') that indicates\n                         the splitting algorithm used.\n            algorithm_params: parameters passed to the splitting algorithm.\n            init_rand_split: probability of randomly splitting a compound\n                               at any position when initializing the model.\n                               None or 0 means no random splitting.\n            max_epochs: maximum number of epochs to train\n\n        \"\"\"\n        self._check_segment_only()\n        if count_modifier is not None:\n            counts = {}\n\n        _logger.info(\"Starting online training\")\n\n        epochs = 0\n        i = 0\n        more_tokens = True\n        while more_tokens:\n            self._epoch_update(epochs)\n            newcost = self.get_cost()\n            _logger.info(\"Tokens processed: %s\\tCost: %s\", i, newcost)\n\n            for _ in _progress(range(epoch_interval)):\n                try:\n                    _, w = next(data)\n                except StopIteration:\n                    more_tokens = False\n                    break\n\n                if len(w) == 0:\n                    # Newline in corpus\n                    continue\n\n                if count_modifier is not None:\n                    if w not in counts:\n                        c = 0\n                        counts[w] = 1\n                        addc = 1\n                    else:\n                        c = counts[w]\n                        counts[w] = c + 1\n                        addc = count_modifier(c + 1) - count_modifier(c)\n                    if addc > 0:\n                        self._add_compound(w, addc)\n                else:\n                    self._add_compound(w, 1)\n                if init_rand_split is not None and init_rand_split > 0:\n                    parts = self._random_split(w, init_rand_split)\n                    self._set_compound_analysis(w, parts)\n                if algorithm == 'recursive':\n                    segments = self._recursive_optimize(w, *algorithm_params)\n                elif algorithm == 'viterbi':\n                    segments = self._viterbi_optimize(w, *algorithm_params)\n                else:\n                    raise MorfessorException(\"unknown algorithm '%s'\" %\n                                             algorithm)\n                _logger.debug(\"#%s: %s -> %s\", i, w, _constructions_to_str(segments))\n                i += 1\n\n            epochs += 1\n            if max_epochs is not None and epochs >= max_epochs:\n                _logger.info(\"Max number of epochs reached, stop training\")\n                break\n\n        self._epoch_update(epochs)\n        newcost = self.get_cost()\n        _logger.info(\"Tokens processed: %s\\tCost: %s\", i, newcost)\n        return epochs, newcost\n\n    def viterbi_segment(self, compound, addcount=1.0, maxlen=30):\n        \"\"\"Find optimal segmentation using the Viterbi algorithm.\n\n        Arguments:\n          compound: compound to be segmented\n          addcount: constant for additive smoothing (0 = no smoothing)\n          maxlen: maximum length for the 
constructions\n\n        If additive smoothing is applied, new complex construction types can\n        be selected during the search. Without smoothing, only new\n        single-atom constructions can be selected.\n\n        Returns the most probable segmentation and its log-probability.\n\n        \"\"\"\n        clen = len(compound)\n        grid = [(0.0, None)]\n        if self._corpus_coding.tokens + self._corpus_coding.boundaries + \\\n                addcount > 0:\n            logtokens = math.log(self._corpus_coding.tokens +\n                                 self._corpus_coding.boundaries + addcount)\n        else:\n            logtokens = 0\n        badlikelihood = clen * logtokens + 1.0\n        # Viterbi main loop\n        for t in range(1, clen + 1):\n            # Select the best path to current node.\n            # Note that we can come from any node in history.\n            bestpath = None\n            bestcost = None\n            if self.nosplit_re and t < clen and \\\n                    self.nosplit_re.match(compound[(t-1):(t+1)]):\n                grid.append((clen*badlikelihood, t-1))\n                continue\n            for pt in range(max(0, t - maxlen), t):\n                if grid[pt][0] is None:\n                    continue\n                cost = grid[pt][0]\n                construction = compound[pt:t]\n                if (construction in self._analyses and\n                        not self._analyses[construction].splitloc):\n                    if self._analyses[construction].count <= 0:\n                        raise MorfessorException(\n                            \"Construction count of '%s' is %s\" %\n                            (construction,\n                             self._analyses[construction].count))\n                    cost += (logtokens -\n                             math.log(self._analyses[construction].count +\n                                      addcount))\n                elif addcount > 0:\n                    if self._corpus_coding.tokens == 0:\n                        cost += (addcount * math.log(addcount) +\n                                 self._lexicon_coding.get_codelength(\n                                     construction)\n                                 / self._corpus_coding.weight)\n                    else:\n                        cost += (logtokens - math.log(addcount) +\n                                 (((self._lexicon_coding.boundaries +\n                                    addcount) *\n                                   math.log(self._lexicon_coding.boundaries\n                                            + addcount))\n                                  - (self._lexicon_coding.boundaries\n                                     * math.log(self._lexicon_coding.boundaries))\n                                  + self._lexicon_coding.get_codelength(\n                                      construction))\n                                 / self._corpus_coding.weight)\n                elif len(construction) == 1:\n                    cost += badlikelihood\n                elif self.nosplit_re:\n                    # Some splits are forbidden, so longer unknown\n                    # constructions have to be allowed\n                    cost += len(construction) * badlikelihood\n                else:\n                    continue\n                if bestcost is None or cost < bestcost:\n                    bestcost = cost\n                    bestpath = pt\n            grid.append((bestcost, bestpath))\n        constructions = 
[]\n        cost, path = grid[-1]\n        lt = clen + 1\n        while path is not None:\n            t = path\n            constructions.append(compound[t:lt])\n            path = grid[t][1]\n            lt = t\n        constructions.reverse()\n        # Add boundary cost\n        cost += (math.log(self._corpus_coding.tokens +\n                          self._corpus_coding.boundaries) -\n                 math.log(self._corpus_coding.boundaries))\n        return constructions, cost\n\n    def forward_logprob(self, compound):\n        \"\"\"Find log-probability of a compound using the forward algorithm.\n\n        Arguments:\n          compound: compound to process\n\n        Returns the (negative) log-probability of the compound. If the\n        probability is zero, returns a number that is larger than the\n        value defined by the penalty attribute of the model object.\n\n        \"\"\"\n        clen = len(compound)\n        grid = [0.0]\n        if self._corpus_coding.tokens + self._corpus_coding.boundaries > 0:\n            logtokens = math.log(self._corpus_coding.tokens +\n                                 self._corpus_coding.boundaries)\n        else:\n            logtokens = 0\n        # Forward main loop\n        for t in range(1, clen + 1):\n            # Sum probabilities from all paths to the current node.\n            # Note that we can come from any node in history.\n            psum = 0.0\n            for pt in range(0, t):\n                cost = grid[pt]\n                construction = compound[pt:t]\n                if (construction in self._analyses and\n                        not self._analyses[construction].splitloc):\n                    if self._analyses[construction].count <= 0:\n                        raise MorfessorException(\n                            \"Construction count of '%s' is %s\" %\n                            (construction,\n                             self._analyses[construction].count))\n                    cost += (logtokens -\n                             math.log(self._analyses[construction].count))\n                else:\n                    continue\n                psum += math.exp(-cost)\n            if psum > 0:\n                grid.append(-math.log(psum))\n            else:\n                grid.append(-self.penalty)\n        cost = grid[-1]\n        # Add boundary cost\n        cost += (math.log(self._corpus_coding.tokens +\n                          self._corpus_coding.boundaries) -\n                 math.log(self._corpus_coding.boundaries))\n        return cost\n\n    def viterbi_nbest(self, compound, n, addcount=1.0, maxlen=30):\n        \"\"\"Find top-n optimal segmentations using the Viterbi algorithm.\n\n        Arguments:\n          compound: compound to be segmented\n          n: how many segmentations to return\n          addcount: constant for additive smoothing (0 = no smoothing)\n          maxlen: maximum length for the constructions\n\n        If additive smoothing is applied, new complex construction types can\n        be selected during the search. 
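(With smoothing, every\n        candidate construction is scored as if it had addcount extra\n        occurrences in the corpus.) 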
Without smoothing, only new\n        single-atom constructions can be selected.\n\n        Returns the n most probable segmentations and their\n        log-probabilities.\n\n        \"\"\"\n        clen = len(compound)\n        grid = [[(0.0, None, None)]]\n        if self._corpus_coding.tokens + self._corpus_coding.boundaries + \\\n                addcount > 0:\n            logtokens = math.log(self._corpus_coding.tokens +\n                                 self._corpus_coding.boundaries + addcount)\n        else:\n            logtokens = 0\n        badlikelihood = clen * logtokens + 1.0\n        # Viterbi main loop\n        for t in range(1, clen + 1):\n            # Select the best path to current node.\n            # Note that we can come from any node in history.\n            bestn = []\n            if self.nosplit_re and t < clen and \\\n                    self.nosplit_re.match(compound[(t-1):(t+1)]):\n                grid.append([(-clen*badlikelihood, t-1, -1)])\n                continue\n            for pt in range(max(0, t - maxlen), t):\n                for k in range(len(grid[pt])):\n                    if grid[pt][k][0] is None:\n                        continue\n                    cost = grid[pt][k][0]\n                    construction = compound[pt:t]\n                    if (construction in self._analyses and\n                            not self._analyses[construction].splitloc):\n                        if self._analyses[construction].count <= 0:\n                            raise MorfessorException(\n                                \"Construction count of '%s' is %s\" %\n                                (construction,\n                                 self._analyses[construction].count))\n                        cost -= (logtokens -\n                                 math.log(self._analyses[construction].count +\n                                          addcount))\n                    elif addcount > 0:\n                        if self._corpus_coding.tokens == 0:\n                            cost -= (addcount * math.log(addcount) +\n                                     self._lexicon_coding.get_codelength(\n                                         construction)\n                                     / self._corpus_coding.weight)\n                        else:\n                            cost -= (logtokens - math.log(addcount) +\n                                     (((self._lexicon_coding.boundaries +\n                                        addcount) *\n                                       math.log(self._lexicon_coding.boundaries\n                                                + addcount))\n                                      - (self._lexicon_coding.boundaries\n                                         * math.log(self._lexicon_coding.\n                                                    boundaries))\n                                      + self._lexicon_coding.get_codelength(\n                                          construction))\n                                     / self._corpus_coding.weight)\n                    elif len(construction) == 1:\n                        cost -= badlikelihood\n                    elif self.nosplit_re:\n                        # Some splits are forbidden, so longer unknown\n                        # constructions have to be allowed\n                        cost -= len(construction) * badlikelihood\n                    else:\n                        continue\n                    if len(bestn) < n:\n                        
heapq.heappush(bestn, (cost, pt, k))\n                    else:\n                        heapq.heappushpop(bestn, (cost, pt, k))\n            grid.append(bestn)\n        results = []\n        for k in range(len(grid[-1])):\n            constructions = []\n            cost, path, ki = grid[-1][k]\n            lt = clen + 1\n            while path is not None:\n                t = path\n                constructions.append(compound[t:lt])\n                path = grid[t][ki][1]\n                ki = grid[t][ki][2]\n                lt = t\n            constructions.reverse()\n            # Add boundary cost\n            cost -= (math.log(self._corpus_coding.tokens +\n                              self._corpus_coding.boundaries) -\n                     math.log(self._corpus_coding.boundaries))\n            results.append((-cost, constructions))\n        return [(constr, cost) for cost, constr in sorted(results)]\n\n    def get_corpus_coding_weight(self):\n        return self._corpus_coding.weight\n\n    def set_corpus_coding_weight(self, weight):\n        self._check_segment_only()\n        self._corpus_coding.weight = weight\n\n    def make_segment_only(self):\n        \"\"\"Reduce the size of this model by removing all non-morphs from the\n        analyses. After calling this method it is no longer possible to call\n        any method that would change the state of the model; attempting to\n        do so raises an exception.\n\n        \"\"\"\n        self._segment_only = True\n        self._analyses = {k: v for (k, v) in self._analyses.items()\n                          if not v.splitloc}\n\n    def clear_segmentation(self):\n        for compound in list(self.get_compounds()):\n            self._set_compound_analysis(compound, [compound])\n\n\nclass CorpusWeight(object):\n    @classmethod\n    def move_direction(cls, model, direction, epoch):\n        if direction != 0:\n            weight = model.get_corpus_coding_weight()\n            if direction > 0:\n                weight *= 1 + 2.0 / epoch\n            else:\n                weight *= 1.0 / (1 + 2.0 / epoch)\n            model.set_corpus_coding_weight(weight)\n            _logger.info(\"Corpus weight set to %s\", weight)\n            return True\n        return False\n\n\nclass FixedCorpusWeight(CorpusWeight):\n    def __init__(self, weight):\n        self.weight = weight\n\n    def update(self, model, _):\n        model.set_corpus_coding_weight(self.weight)\n        return False\n\n\nclass AnnotationCorpusWeight(CorpusWeight):\n    \"\"\"Class for using development annotations to update the corpus weight\n    during batch training\n\n    \"\"\"\n\n    def __init__(self, devel_set, threshold=0.01):\n        self.data = devel_set\n        self.threshold = threshold\n\n    def update(self, model, epoch):\n        \"\"\"Tune model corpus weight based on the precision and\n        recall of the development data, trying to keep them equal\"\"\"\n        if epoch < 1:\n            return False\n        tmp = self.data.items()\n        wlist, annotations = zip(*tmp)\n        segments = [model.viterbi_segment(w)[0] for w in wlist]\n        d = self._estimate_segmentation_dir(segments, annotations)\n\n        return self.move_direction(model, d, epoch)\n\n    @classmethod\n    def _boundary_recall(cls, prediction, reference):\n        \"\"\"Calculate average boundary recall for given segmentations.\"\"\"\n        rec_total = 0\n        rec_sum = 0.0\n        for pre_list, ref_list in zip(prediction, reference):\n            best = 
-1\n            for ref in ref_list:\n                # list of internal boundary positions\n                ref_b = set(BaselineModel.segmentation_to_splitloc(ref))\n                if len(ref_b) == 0:\n                    best = 1.0\n                    break\n                for pre in pre_list:\n                    pre_b = set(BaselineModel.segmentation_to_splitloc(pre))\n                    r = len(ref_b.intersection(pre_b)) / float(len(ref_b))\n                    if r > best:\n                        best = r\n            if best >= 0:\n                rec_sum += best\n                rec_total += 1\n        return rec_sum, rec_total\n\n    @classmethod\n    def _bpr_evaluation(cls, prediction, reference):\n        \"\"\"Return boundary precision, recall, and F-score for segmentations.\"\"\"\n        rec_s, rec_t = cls._boundary_recall(prediction, reference)\n        pre_s, pre_t = cls._boundary_recall(reference, prediction)\n        rec = rec_s / rec_t\n        pre = pre_s / pre_t\n        f = 2.0 * pre * rec / (pre + rec)\n        return pre, rec, f\n\n    def _estimate_segmentation_dir(self, segments, annotations):\n        \"\"\"Estimate whether the given compounds are under- or oversegmented.\n\n        The decision is based on the difference between boundary precision\n        and recall values for the given sample of segmented data.\n\n        Arguments:\n          segments: list of predicted segmentations\n          annotations: list of reference segmentations\n\n        Return 1 in the case of oversegmentation, -1 in the case of\n        undersegmentation, and 0 if no changes are required.\n\n        \"\"\"\n        pre, rec, f = self._bpr_evaluation([[x] for x in segments], annotations)\n        _logger.info(\"Boundary evaluation: precision %.4f; recall %.4f\", pre, rec)\n        if abs(pre - rec) < self.threshold:\n            return 0\n        elif rec > pre:\n            return 1\n        else:\n            return -1\n\n\nclass MorphLengthCorpusWeight(CorpusWeight):\n    def __init__(self, morph_length, threshold=0.01):\n        self.morph_length = morph_length\n        self.threshold = threshold\n\n    def update(self, model, epoch):\n        if epoch < 1:\n            return False\n        cur_length = self.calc_morph_length(model)\n\n        _logger.info(\"Current morph-length: %s\", cur_length)\n\n        if (abs(self.morph_length - cur_length) / self.morph_length >\n                self.threshold):\n            d = abs(self.morph_length - cur_length) / (self.morph_length\n                                                       - cur_length)\n            return self.move_direction(model, d, epoch)\n        return False\n\n    @classmethod\n    def calc_morph_length(cls, model):\n        total_constructions = 0\n        total_atoms = 0\n        for compound in model.get_compounds():\n            constructions = model.segment(compound)\n            for construction in constructions:\n                total_constructions += 1\n                total_atoms += len(construction)\n        if total_constructions > 0:\n            return float(total_atoms) / total_constructions\n        else:\n            return 0.0\n\n\nclass NumMorphCorpusWeight(CorpusWeight):\n    def __init__(self, num_morph_types, threshold=0.01):\n        self.num_morph_types = num_morph_types\n        self.threshold = threshold\n\n    def update(self, model, epoch):\n        if epoch < 1:\n            return False\n        cur_morph_types = model._lexicon_coding.boundaries\n\n        _logger.info(\"Number of 
morph types: %s\", cur_morph_types)\n\n\n        if (abs(self.num_morph_types - cur_morph_types) / self.num_morph_types\n                > self.threshold):\n            d = (abs(self.num_morph_types - cur_morph_types) /\n                 (self.num_morph_types - cur_morph_types))\n            return self.move_direction(model, d, epoch)\n        return False\n\nclass Encoding(object):\n    \"\"\"Base class for calculating the entropy (encoding length) of a corpus\n    or lexicon.\n\n    Commonly subclassed to redefine specific methods.\n\n    \"\"\"\n    def __init__(self, weight=1.0):\n        \"\"\"Initizalize class\n\n        Arguments:\n            weight: weight used for this encoding\n        \"\"\"\n        self.logtokensum = 0.0\n        self.tokens = 0\n        self.boundaries = 0\n        self.weight = weight\n\n    # constant used for speeding up logfactorial calculations with Stirling's\n    # approximation\n    _log2pi = math.log(2 * math.pi)\n\n    @property\n    def types(self):\n        \"\"\"Define number of types as 0. types is made a property method to\n        ensure easy redefinition in subclasses\n\n        \"\"\"\n        return 0\n\n    @classmethod\n    def _logfactorial(cls, n):\n        \"\"\"Calculate logarithm of n!.\n\n        For large n (n > 20), use Stirling's approximation.\n\n        \"\"\"\n        if n < 2:\n            return 0.0\n        if n < 20:\n            return math.log(math.factorial(n))\n        logn = math.log(n)\n        return n * logn - n + 0.5 * (logn + cls._log2pi)\n\n    def frequency_distribution_cost(self):\n        \"\"\"Calculate -log[(u - 1)! (v - u)! / (v - 1)!]\n\n        v is the number of tokens+boundaries and u the number of types\n\n        \"\"\"\n        if self.types < 2:\n            return 0.0\n        tokens = self.tokens + self.boundaries\n        return (self._logfactorial(tokens - 1) -\n                self._logfactorial(self.types - 1) -\n                self._logfactorial(tokens - self.types))\n\n    def permutations_cost(self):\n        \"\"\"The permutations cost for the encoding.\"\"\"\n        return -self._logfactorial(self.boundaries)\n\n    def update_count(self, construction, old_count, new_count):\n        \"\"\"Update the counts in the encoding.\"\"\"\n        self.tokens += new_count - old_count\n        if old_count > 1:\n            self.logtokensum -= old_count * math.log(old_count)\n        if new_count > 1:\n            self.logtokensum += new_count * math.log(new_count)\n\n    def get_cost(self):\n        \"\"\"Calculate the cost for encoding the corpus/lexicon\"\"\"\n        if self.boundaries == 0:\n            return 0.0\n\n        n = self.tokens + self.boundaries\n        return ((n * math.log(n)\n                 - self.boundaries * math.log(self.boundaries)\n                 - self.logtokensum\n                 + self.permutations_cost()) * self.weight\n                + self.frequency_distribution_cost())\n\n\nclass CorpusEncoding(Encoding):\n    \"\"\"Encoding the corpus class\n\n    The basic difference to a normal encoding is that the number of types is\n    not stored directly but fetched from the lexicon encoding. 
Also, the cost\n    function does not contain any permutation cost.\n    \"\"\"\n    def __init__(self, lexicon_encoding, weight=1.0):\n        super(CorpusEncoding, self).__init__(weight)\n        self.lexicon_encoding = lexicon_encoding\n\n    @property\n    def types(self):\n        \"\"\"Return the number of types of the corpus, which is the same as the\n         number of boundaries in the lexicon + 1\n\n        \"\"\"\n        return self.lexicon_encoding.boundaries + 1\n\n    def frequency_distribution_cost(self):\n        \"\"\"Calculate -log[(M - 1)! (N - M)! / (N - 1)!] for M types and N\n        tokens.\n\n        \"\"\"\n        if self.types < 2:\n            return 0.0\n        tokens = self.tokens\n        return (self._logfactorial(tokens - 1) -\n                self._logfactorial(self.types - 2) -\n                self._logfactorial(tokens - self.types + 1))\n\n    def get_cost(self):\n        \"\"\"Override for the Encoding get_cost function. A corpus does not\n        have a permutation cost\n\n        \"\"\"\n        if self.boundaries == 0:\n            return 0.0\n\n        n = self.tokens + self.boundaries\n        return ((n * math.log(n)\n                 - self.boundaries * math.log(self.boundaries)\n                 - self.logtokensum) * self.weight\n                + self.frequency_distribution_cost())\n\n\nclass AnnotatedCorpusEncoding(Encoding):\n    \"\"\"Encoding the cost of an Annotated Corpus.\n\n    In this encoding, constructions that are missing are penalized.\n\n    \"\"\"\n    def __init__(self, corpus_coding, weight=None, penalty=-9999.9):\n        \"\"\"\n        Initialize encoding with appropriate metadata\n\n        Arguments:\n            corpus_coding: CorpusEncoding instance used for retrieving the\n                             number of tokens and boundaries in the corpus\n            weight: The weight of this encoding. If the weight is None,\n                      it is updated automatically to be in balance with the\n                      corpus\n            penalty: log penalty used for missing constructions\n\n        \"\"\"\n        super(AnnotatedCorpusEncoding, self).__init__()\n        self.do_update_weight = True\n        self.weight = 1.0\n        if weight is not None:\n            self.do_update_weight = False\n            self.weight = weight\n        self.corpus_coding = corpus_coding\n        self.penalty = penalty\n        self.constructions = collections.Counter()\n\n    def set_constructions(self, constructions):\n        \"\"\"Method for re-initializing the constructions. The count of the\n        constructions must still be set with a call to set_count\n\n        \"\"\"\n        self.constructions = constructions\n        self.tokens = sum(constructions.values())\n        self.logtokensum = 0.0\n\n    def set_count(self, construction, count):\n        \"\"\"Set an initial count for each construction. 
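(The count given\n        should be the construction's current token count in the model.) 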
Missing constructions\n        are penalized.\n        \"\"\"\n        annot_count = self.constructions[construction]\n        if count > 0:\n            self.logtokensum += annot_count * math.log(count)\n        else:\n            self.logtokensum += annot_count * self.penalty\n\n    def update_count(self, construction, old_count, new_count):\n        \"\"\"Update the counts in the Encoding, setting (or removing) a penalty\n        for missing constructions\n\n        \"\"\"\n        if construction in self.constructions:\n            annot_count = self.constructions[construction]\n            if old_count > 0:\n                self.logtokensum -= annot_count * math.log(old_count)\n            else:\n                self.logtokensum -= annot_count * self.penalty\n            if new_count > 0:\n                self.logtokensum += annot_count * math.log(new_count)\n            else:\n                self.logtokensum += annot_count * self.penalty\n\n    def update_weight(self):\n        \"\"\"Update the weight of the Encoding by taking the ratio of the\n        corpus boundaries and annotated boundaries\n        \"\"\"\n        if not self.do_update_weight:\n            return\n        old = self.weight\n        self.weight = (self.corpus_coding.weight *\n                       float(self.corpus_coding.boundaries) / self.boundaries)\n        if self.weight != old:\n            _logger.info(\"Corpus weight of annotated data set to %s\", self.weight)\n\n    def get_cost(self):\n        \"\"\"Return the cost of the annotated corpus.\"\"\"\n        if self.boundaries == 0:\n            return 0.0\n        n = self.tokens + self.boundaries\n        return ((n * math.log(self.corpus_coding.tokens +\n                              self.corpus_coding.boundaries)\n                 - self.boundaries * math.log(self.corpus_coding.boundaries)\n                 - self.logtokensum) * self.weight)\n\n\nclass LexiconEncoding(Encoding):\n    \"\"\"Class for calculating the encoding cost for the lexicon\"\"\"\n\n    def __init__(self):\n        \"\"\"Initialize Lexicon Encoding\"\"\"\n        super(LexiconEncoding, self).__init__()\n        self.atoms = collections.Counter()\n\n    @property\n    def types(self):\n        \"\"\"Return the number of different atoms in the lexicon + 1 for the\n        compound-end-token\n\n        \"\"\"\n        return len(self.atoms) + 1\n\n    def add(self, construction):\n        \"\"\"Add a construction to the lexicon, automatically updating the\n        counts of its atoms\n\n        \"\"\"\n        self.boundaries += 1\n        for atom in construction:\n            c = self.atoms[atom]\n            self.atoms[atom] = c + 1\n            self.update_count(atom, c, c + 1)\n\n    def remove(self, construction):\n        \"\"\"Remove a construction from the lexicon, automatically updating the\n        counts of its atoms\n\n        \"\"\"\n        self.boundaries -= 1\n        for atom in construction:\n            c = self.atoms[atom]\n            self.atoms[atom] = c - 1\n            self.update_count(atom, c, c - 1)\n\n    def get_codelength(self, construction):\n        \"\"\"Return an approximate codelength for a new construction.\"\"\"\n        length = len(construction) + 1\n        cost = length * math.log(self.tokens + length)\n        cost -= math.log(self.boundaries + 1)\n        for atom in construction:\n            if atom in self.atoms:\n                c = self.atoms[atom]\n            else:\n                c = 1\n            cost -= math.log(c)\n        return cost\n"
  },
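  {
    "path": "examples/encoding_cost_sketch.py",
    "content": "# -*- coding: utf-8 -*-\n\"\"\"Editor's sketch, not part of the original Morfessor distribution.\n\nMinimal illustration of how the MDL-style encoding classes accumulate\ncosts. Assumes the classes live in morfessor.baseline, as the imports in\nmorfessor/cmd.py suggest; the construction strings are arbitrary toy data.\n\"\"\"\nfrom morfessor.baseline import CorpusEncoding, LexiconEncoding\n\nlexicon = LexiconEncoding()\ncorpus = CorpusEncoding(lexicon)\n\n# Adding a construction increments the lexicon boundary count and the\n# per-atom counts; the corpus type count is then lexicon.boundaries + 1.\nfor construction in ('kahvi', 'kuppi', 'nen'):\n    lexicon.add(construction)\n\nprint('lexicon cost:', lexicon.get_cost())\nprint('corpus types:', corpus.types)\n# Approximate codelength for a construction not yet in the lexicon\nprint('codelength:', lexicon.get_codelength('talo'))\n"
  },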
  {
    "path": "morfessor/cmd.py",
    "content": "# -*- coding: utf-8 -*-\nimport locale\nimport logging\nimport math\nimport random\nimport os.path\nimport sys\nimport time\nimport string\n\nfrom . import get_version\nfrom . import utils\nfrom .baseline import BaselineModel, AnnotationCorpusWeight, \\\n    MorphLengthCorpusWeight, NumMorphCorpusWeight, FixedCorpusWeight\nfrom .exception import ArgumentException\nfrom .io import MorfessorIO\nfrom .evaluation import MorfessorEvaluation, EvaluationConfig, \\\n    WilcoxonSignedRank, FORMAT_STRINGS\n\nPY3 = sys.version_info[0] == 3\n\n# _str is used to convert command line arguments to the right type (str for PY3, unicode for PY2\nif PY3:\n    _str = str\nelse:\n    _str = lambda x: unicode(x, encoding=locale.getpreferredencoding())\n\n_logger = logging.getLogger(__name__)\n\nLRU_MAX_SIZE = 1000000\n\n\ndef get_default_argparser():\n    import argparse\n\n    parser = argparse.ArgumentParser(\n        prog='morfessor.py',\n        description=\"\"\"\nMorfessor %s\n\nCopyright (c) 2012-2019, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions\nare met:\n\n1.  Redistributions of source code must retain the above copyright\n    notice, this list of conditions and the following disclaimer.\n\n2.  Redistributions in binary form must reproduce the above\n    copyright notice, this list of conditions and the following\n    disclaimer in the documentation and/or other materials provided\n    with the distribution.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n\"AS IS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\nLIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS\nFOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE\nCOPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,\nINCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,\nBUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;\nLOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER\nCAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT\nLIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN\nANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE\nPOSSIBILITY OF SUCH DAMAGE.\n\nCommand-line arguments:\n\"\"\" % get_version(),\n        epilog=\"\"\"\nSimple usage examples (training and testing):\n\n  %(prog)s -t training_corpus.txt -s model.pickled\n  %(prog)s -l model.pickled -T test_corpus.txt -o test_corpus.segmented\n\nInteractive use (read corpus from user):\n\n  %(prog)s -m online -v 2 -t -\n\n\"\"\",\n        formatter_class=argparse.RawDescriptionHelpFormatter,\n        add_help=False)\n\n    # Options for input data files\n    add_arg = parser.add_argument_group('input data files').add_argument\n    add_arg('-l', '--load', dest=\"loadfile\", default=None, metavar='<file>',\n            help=\"load existing model from file (pickled model object)\")\n    add_arg('-L', '--load-segmentation', dest=\"loadsegfile\", default=None,\n            metavar='<file>',\n            help=\"load existing model from segmentation \"\n                 \"file (Morfessor 1.0 format)\")\n    add_arg('-t', '--traindata', dest='trainfiles', action='append',\n            default=[], metavar='<file>',\n            help=\"input corpus file(s) for training (text or bz2/gzipped text;\"\n                 \" use '-' for standard input; add several times in order to \"\n                 \"append multiple files)\")\n    add_arg('-T', '--testdata', dest='testfiles', action='append',\n            default=[], metavar='<file>',\n            help=\"input corpus file(s) to analyze (text or bz2/gzipped text;  \"\n                 \"use '-' for standard input; add several times in order to \"\n                 \"append multiple files)\")\n\n    # Options for output data files\n    add_arg = parser.add_argument_group('output data files').add_argument\n    add_arg('-o', '--output', dest=\"outfile\", default='-', metavar='<file>',\n            help=\"output file for test data results (for standard output, \"\n                 \"use '-'; default '%(default)s')\")\n    add_arg('-s', '--save', dest=\"savefile\", default=None, metavar='<file>',\n            help=\"save final model to file (pickled model object)\")\n    add_arg('-S', '--save-segmentation', dest=\"savesegfile\", default=None,\n            metavar='<file>',\n            help=\"save model segmentations to file (Morfessor 1.0 format)\")\n    add_arg('--save-reduced', dest=\"savereduced\", default=None,\n            metavar='<file>',\n            help=\"save final model to file in reduced form (pickled model \"\n            \"object). 
A model in reduced form can only be used for \"\n            \"segmentation of new words.\")\n    add_arg('-x', '--lexicon', dest=\"lexfile\", default=None, metavar='<file>',\n            help=\"output final lexicon to given file\")\n    add_arg('--nbest', dest=\"nbest\", default=1, type=int, metavar='<int>',\n            help=\"output n-best viterbi results\")\n\n    # Options for data formats\n    add_arg = parser.add_argument_group(\n        'data format options').add_argument\n    add_arg('-e', '--encoding', dest='encoding', metavar='<encoding>',\n            help=\"encoding of input and output files (if none is given, \"\n                 \"both the local encoding and UTF-8 are tried)\")\n    add_arg('--lowercase', dest=\"lowercase\", default=False,\n            action='store_true',\n            help=\"lowercase input data\")\n    add_arg('--traindata-list', dest=\"list\", default=False,\n            action='store_true',\n            help=\"input file(s) for batch training are lists \"\n                 \"(one compound per line, optionally count as a prefix)\")\n    add_arg('--atom-separator', dest=\"separator\", type=_str, default=None,\n            metavar='<regexp>',\n            help=\"atom separator regexp (default %(default)s)\")\n    add_arg('--compound-separator', dest=\"cseparator\", type=_str, default=r'\\s+',\n            metavar='<regexp>',\n            help=\"compound separator regexp (default '%(default)s')\")\n    add_arg('--analysis-separator', dest='analysisseparator', type=_str,\n            default=',', metavar='<str>',\n            help=\"separator for different analyses in an annotation file. Use\"\n                 \"  NONE for only allowing one analysis per line\")\n    add_arg('--output-format', dest='outputformat', type=_str,\n            default=r'{analysis}\\n', metavar='<format>',\n            help=\"format string for --output file (default: '%(default)s'). \"\n            \"Valid keywords are: \"\n            \"{analysis} = constructions of the compound, \"\n            \"{compound} = compound string, \"\n            \"{count} = count of the compound (currently always 1), \"\n            \"{logprob} = log-probability of the analysis, and \"\n            \"{clogprob} = log-probability of the compound. 
Valid escape \"\n            \"sequences are '\\\\n' (newline) and '\\\\t' (tab)\")\n    add_arg('--output-format-separator', dest='outputformatseparator',\n            type=_str, default=' ', metavar='<str>',\n            help=\"construction separator for analysis in --output file \"\n            \"(default: '%(default)s')\")\n    add_arg('--output-newlines', dest='outputnewlines', default=False,\n            action='store_true',\n            help=\"for each newline in input, print newline in --output file \"\n            \"(default: '%(default)s')\")\n\n    # Options for model training\n    add_arg = parser.add_argument_group(\n        'training and segmentation options').add_argument\n    add_arg('-m', '--mode', dest=\"trainmode\", default='init+batch',\n            metavar='<mode>',\n            choices=['none', 'batch', 'init', 'init+batch', 'online',\n                     'online+batch'],\n            help=\"training mode ('none', 'init', 'batch', 'init+batch', \"\n                 \"'online', or 'online+batch'; default '%(default)s')\")\n    add_arg('-a', '--algorithm', dest=\"algorithm\", default='recursive',\n            metavar='<algorithm>', choices=['recursive', 'viterbi'],\n            help=\"algorithm type ('recursive', 'viterbi'; default \"\n                 \"'%(default)s')\")\n    add_arg('-d', '--dampening', dest=\"dampening\", type=_str, default='ones',\n            metavar='<type>', choices=['none', 'log', 'ones'],\n            help=\"frequency dampening for training data ('none', 'log', or \"\n                 \"'ones'; default '%(default)s')\")\n    add_arg('-f', '--forcesplit', dest=\"forcesplit\", type=list, default=['-'],\n            metavar='<list>',\n            help=\"force split on given atoms (default '-'). The list argument \"\n                 \"is a string of characters; use '' for no forced splits.\")\n    add_arg('-F', '--finish-threshold', dest='finish_threshold', type=float,\n            default=0.005, metavar='<float>',\n            help=\"Stopping threshold. 
Training stops when \"\n                 \"the improvement of the last iteration is \"\n                 \"smaller than finish_threshold * #boundaries \"\n                 \"(default '%(default)s')\")\n    add_arg('-r', '--randseed', dest=\"randseed\", default=None,\n            metavar='<seed>',\n            help=\"seed for random number generator\")\n    add_arg('-R', '--randsplit', dest=\"splitprob\", default=None, type=float,\n            metavar='<float>',\n            help=\"initialize new words by random splitting using the given \"\n                 \"split probability (default no splitting)\")\n    add_arg('--skips', dest=\"skips\", default=False, action='store_true',\n            help=\"use random skips for frequently seen compounds to speed up \"\n                 \"training\")\n    add_arg('--batch-minfreq', dest=\"freqthreshold\", type=int, default=1,\n            metavar='<int>',\n            help=\"compound frequency threshold for batch training (default \"\n                 \"%(default)s)\")\n    add_arg('--max-epochs', dest='maxepochs', type=int, default=None,\n            metavar='<int>',\n            help='hard maximum of epochs in training')\n    add_arg('--nosplit-re', dest=\"nosplit\", type=_str, default=None,\n            metavar='<regexp>',\n            help=\"if the expression matches the two surrounding characters, \"\n                 \"do not allow splitting (default %(default)s)\")\n    add_arg('--online-epochint', dest=\"epochinterval\", type=int,\n            default=10000, metavar='<int>',\n            help=\"epoch interval for online training (default %(default)s)\")\n    add_arg('--viterbi-smoothing', dest=\"viterbismooth\", default=0,\n            type=float, metavar='<float>',\n            help=\"additive smoothing parameter for Viterbi training \"\n                 \"and segmentation (default %(default)s)\")\n    add_arg('--viterbi-maxlen', dest=\"viterbimaxlen\", default=30,\n            type=int, metavar='<int>',\n            help=\"maximum construction length in Viterbi training \"\n                 \"and segmentation (default %(default)s)\")\n\n    # Options for corpusweight tuning\n    add_arg = parser.add_mutually_exclusive_group().add_argument\n    add_arg('-D', '--develset', dest=\"develfile\", default=None,\n            metavar='<file>',\n            help=\"load annotated data for tuning the corpus weight parameter\")\n    add_arg('--morph-length', dest='morphlength', default=None, type=float,\n            metavar='<float>',\n            help=\"tune the corpusweight to obtain the desired average morph \"\n                 \"length\")\n    add_arg('--num-morph-types', dest='morphtypes', default=None, type=float,\n            metavar='<float>',\n            help=\"tune the corpusweight to obtain the desired number of morph \"\n                 \"types\")\n\n    # Options for semi-supervised model training\n    add_arg = parser.add_argument_group(\n        'semi-supervised training options').add_argument\n    add_arg('-w', '--corpusweight', dest=\"corpusweight\", type=float,\n            default=1.0, metavar='<float>',\n            help=\"corpus weight parameter (default %(default)s); \"\n                 \"sets the initial value if other tuning options are used\")\n    add_arg('--weight-threshold', dest='threshold', default=0.01,\n            metavar='<float>', type=float,\n            help='percentage stopping threshold for corpusweight updaters')\n    add_arg('--full-retrain', dest='fullretrain', action='store_true',\n            
default=False,\n            help='do a full retrain after any weights have converged')\n    add_arg('-A', '--annotations', dest=\"annofile\", default=None,\n            metavar='<file>',\n            help=\"load annotated data for semi-supervised learning\")\n    add_arg('-W', '--annotationweight', dest=\"annotationweight\",\n            type=float, default=None, metavar='<float>',\n            help=\"corpus weight parameter for annotated data (if unset, the \"\n                 \"weight is set to balance the number of tokens in annotated \"\n                 \"and unannotated data sets)\")\n\n    # Options for evaluation\n    add_arg = parser.add_argument_group('Evaluation options').add_argument\n    add_arg('-G', '--goldstandard', dest='goldstandard', default=None,\n            metavar='<file>',\n            help='If provided, evaluate the model against the gold standard')\n\n    # Options for logging\n    add_arg = parser.add_argument_group('logging options').add_argument\n    add_arg('-v', '--verbose', dest=\"verbose\", type=int, default=1,\n            metavar='<int>',\n            help=\"verbose level; controls what is written to the standard \"\n                 \"error stream or log file (default %(default)s)\")\n    add_arg('--logfile', dest='log_file', metavar='<file>',\n            help=\"write log messages to file in addition to standard \"\n                 \"error stream\")\n    add_arg('--progressbar', dest='progress', default=False,\n            action='store_true',\n            help=\"Force the progressbar to be displayed (possibly lowers the \"\n                 \"log level for the standard error stream)\")\n\n    add_arg = parser.add_argument_group('other options').add_argument\n    add_arg('-h', '--help', action='help',\n            help=\"show this help message and exit\")\n    add_arg('--version', action='version',\n            version='%(prog)s ' + get_version(),\n            help=\"show version number and exit\")\n\n    return parser\n\n\ndef initialize_logging(args):\n    \"\"\"Initialize loggers based on command line args\"\"\"\n    if args.verbose >= 2:\n        loglevel = logging.DEBUG\n    elif args.verbose >= 1:\n        loglevel = logging.INFO\n    else:\n        loglevel = logging.WARNING\n\n    rootlogger = logging.getLogger()\n    rootlogger.setLevel(logging.DEBUG)\n\n    logfile_format = '%(asctime)s %(levelname)s:%(message)s'\n    date_format = '%Y-%m-%d %H:%M:%S'\n    console_format = '%(message)s'\n\n    console_level = loglevel\n    if args.log_file is not None or (hasattr(args, 'progress') and args.progress):\n        # If logging to a file or progress bar is forced, make INFO\n        # the highest level for the error stream\n        console_level = max(loglevel, logging.INFO)\n\n    # Console handler\n    ch = logging.StreamHandler()\n    ch.setLevel(console_level)\n    ch.setFormatter(logging.Formatter(console_format))\n    rootlogger.addHandler(ch)\n\n    # FileHandler for log_file\n    if args.log_file is not None:\n        fh = logging.FileHandler(args.log_file, 'w')\n        fh.setLevel(loglevel)\n        fh.setFormatter(logging.Formatter(logfile_format, date_format))\n        rootlogger.addHandler(fh)\n\n    return console_level\n\n\n@utils.lru_cache(maxsize=LRU_MAX_SIZE)\ndef _viterbi_segment(model, atoms, smooth, maxlen):\n    return model.viterbi_segment(atoms, smooth, maxlen)\n\n\n@utils.lru_cache(maxsize=LRU_MAX_SIZE)\ndef _viterbi_nbest(model, atoms, nbest, smooth, maxlen):\n    return model.viterbi_nbest(atoms, nbest, 
smooth, maxlen)\n\n\ndef main(args):\n\n    console_level = initialize_logging(args)\n\n    # Show the progress bar only if the console log level is exactly INFO\n    # (with debug or warning-only output it would interfere) and stderr\n    # is a tty (not a pipe or a file)\n    if (console_level != logging.INFO or\n            (hasattr(sys.stderr, 'isatty') and not sys.stderr.isatty())):\n        utils.show_progress_bar = False\n\n    # Force progress bar\n    if args.progress:\n        utils.show_progress_bar = True\n\n    if (args.loadfile is None and\n            args.loadsegfile is None and\n            len(args.trainfiles) == 0):\n        raise ArgumentException(\"either model file or training data should \"\n                                \"be defined\")\n\n    if args.randseed is not None:\n        random.seed(args.randseed)\n\n    io = MorfessorIO(encoding=args.encoding,\n                     compound_separator=args.cseparator,\n                     atom_separator=args.separator,\n                     lowercase=args.lowercase)\n\n    # Load existing model or create a new one\n    if args.loadfile is not None:\n        model = io.read_binary_model_file(args.loadfile)\n\n    else:\n        model = BaselineModel(forcesplit_list=args.forcesplit,\n                              corpusweight=args.corpusweight,\n                              use_skips=args.skips,\n                              nosplit_re=args.nosplit)\n\n    if args.loadsegfile is not None:\n        model.load_segmentations(io.read_segmentation_file(args.loadsegfile))\n\n    analysis_sep = (args.analysisseparator\n                    if args.analysisseparator != 'NONE' else None)\n\n    if args.annofile is not None:\n        annotations = io.read_annotations_file(args.annofile,\n                                               analysis_sep=analysis_sep)\n        model.set_annotations(annotations, args.annotationweight)\n\n    if args.develfile is not None:\n        develannots = io.read_annotations_file(args.develfile,\n                                               analysis_sep=analysis_sep)\n        updater = AnnotationCorpusWeight(develannots, args.threshold)\n        model.set_corpus_weight_updater(updater)\n\n    if args.morphlength is not None:\n        updater = MorphLengthCorpusWeight(args.morphlength, args.threshold)\n        model.set_corpus_weight_updater(updater)\n\n    if args.morphtypes is not None:\n        updater = NumMorphCorpusWeight(args.morphtypes, args.threshold)\n        model.set_corpus_weight_updater(updater)\n\n    start_corpus_weight = model.get_corpus_coding_weight()\n\n    # Set frequency dampening function\n    if args.dampening == 'none':\n        dampfunc = None\n    elif args.dampening == 'log':\n        dampfunc = lambda x: int(round(math.log(x + 1, 2)))\n    elif args.dampening == 'ones':\n        dampfunc = lambda x: 1\n    else:\n        raise ArgumentException(\"unknown dampening type '%s'\" % args.dampening)\n\n    # Set algorithm parameters\n    if args.algorithm == 'viterbi':\n        algparams = (args.viterbismooth, args.viterbimaxlen)\n    else:\n        algparams = ()\n\n    # Train model\n    if args.trainmode == 'none':\n        pass\n    elif args.trainmode == 'batch':\n        if len(model.get_compounds()) == 0:\n            _logger.warning(\"Model contains no compounds for batch training.\"\n                            \" Use 'init+batch' mode to add new data.\")\n        else:\n            if len(args.trainfiles) > 0:\n                _logger.warning(\"Training 
mode 'batch' ignores new data \"\n                                \"files. Use 'init+batch' or 'online' to \"\n                                \"add new compounds.\")\n            ts = time.time()\n            e, c = model.train_batch(args.algorithm, algparams,\n                                     args.finish_threshold, args.maxepochs)\n            te = time.time()\n            _logger.info(\"Epochs: %s\", e)\n            _logger.info(\"Final cost: %s\", c)\n            _logger.info(\"Training time: %.3fs\", (te - ts))\n    elif len(args.trainfiles) > 0:\n        ts = time.time()\n        if args.trainmode == 'init':\n            if args.list:\n                data = io.read_corpus_list_files(args.trainfiles)\n            else:\n                data = io.read_corpus_files(args.trainfiles)\n            c = model.load_data(data, args.freqthreshold, dampfunc,\n                                args.splitprob)\n        elif args.trainmode == 'init+batch':\n            if args.list:\n                data = io.read_corpus_list_files(args.trainfiles)\n            else:\n                data = io.read_corpus_files(args.trainfiles)\n            c = model.load_data(data, args.freqthreshold, dampfunc,\n                                args.splitprob)\n            e, c = model.train_batch(args.algorithm, algparams,\n                                     args.finish_threshold, args.maxepochs)\n            _logger.info(\"Epochs: %s\", e)\n            if args.fullretrain:\n                if abs(model.get_corpus_coding_weight() - start_corpus_weight) > 0.1:\n                    model.set_corpus_weight_updater(\n                        FixedCorpusWeight(model.get_corpus_coding_weight()))\n                    model.clear_segmentation()\n                    e, c = model.train_batch(args.algorithm, algparams,\n                                             args.finish_threshold,\n                                             args.maxepochs)\n                    _logger.info(\"Retrain Epochs: %s\", e)\n        elif args.trainmode == 'online':\n            data = io.read_corpus_files(args.trainfiles)\n            e, c = model.train_online(data, dampfunc, args.epochinterval,\n                                      args.algorithm, algparams,\n                                      args.splitprob, args.maxepochs)\n            _logger.info(\"Epochs: %s\", e)\n        elif args.trainmode == 'online+batch':\n            data = io.read_corpus_files(args.trainfiles)\n            e, c = model.train_online(data, dampfunc, args.epochinterval,\n                                      args.algorithm, algparams,\n                                      args.splitprob, args.maxepochs)\n            # Subtract the epochs used by online training from the hard\n            # maximum, but keep None (no limit) intact\n            remaining = (args.maxepochs - e if args.maxepochs is not None\n                         else None)\n            e, c = model.train_batch(args.algorithm, algparams,\n                                     args.finish_threshold, remaining)\n            _logger.info(\"Epochs: %s\", e)\n            if args.fullretrain:\n                if abs(model.get_corpus_coding_weight() - start_corpus_weight) > 0.1:\n                    model.clear_segmentation()\n                    e, c = model.train_batch(args.algorithm, algparams,\n                                             args.finish_threshold,\n                                             args.maxepochs)\n                    _logger.info(\"Retrain Epochs: %s\", e)\n        else:\n            raise ArgumentException(\"unknown training mode '%s'\" % args.trainmode)\n        te = time.time()\n        _logger.info(\"Final cost: %s\", c)\n        _logger.info(\"Training time: %.3fs\", (te - ts))\n    else:\n  
      _logger.warning(\"No training data files specified.\")\n\n    # Save model\n    if args.savefile is not None:\n        io.write_binary_model_file(args.savefile, model)\n\n    if args.savesegfile is not None:\n        io.write_segmentation_file(args.savesegfile, model.get_segmentations())\n\n    # Output lexicon\n    if args.lexfile is not None:\n        io.write_lexicon_file(args.lexfile, model.get_constructions())\n\n    if args.savereduced is not None:\n        model.make_segment_only()\n        io.write_binary_model_file(args.savereduced, model)\n\n    # Segment test data\n    if len(args.testfiles) > 0:\n        _logger.info(\"Segmenting test data...\")\n        outformat = args.outputformat\n        csep = args.outputformatseparator\n        outformat = outformat.replace(r\"\\n\", \"\\n\")\n        outformat = outformat.replace(r\"\\t\", \"\\t\")\n        keywords = [x[1] for x in string.Formatter().parse(outformat)]\n        with io._open_text_file_write(args.outfile) as fobj:\n            testdata = io.read_corpus_files(args.testfiles)\n            i = 0\n            for count, atoms in testdata:\n                if io.atom_separator is None:\n                    compound = \"\".join(atoms)\n                else:\n                    compound = io.atom_separator.join(atoms)\n                if len(atoms) == 0:\n                    # Newline in corpus\n                    if args.outputnewlines:\n                        fobj.write(\"\\n\")\n                    continue\n                if \"clogprob\" in keywords:\n                    clogprob = model.forward_logprob(atoms)\n                else:\n                    clogprob = 0\n                if args.nbest > 1:\n                    nbestlist = _viterbi_nbest(\n                        model, atoms, args.nbest, args.viterbismooth,\n                        args.viterbimaxlen)\n                    for constructions, logp in nbestlist:\n                        analysis = io.format_constructions(constructions,\n                                                           csep=csep)\n                        fobj.write(outformat.format(analysis=analysis,\n                                                    compound=compound,\n                                                    count=count, logprob=logp,\n                                                    clogprob=clogprob))\n                else:\n                    constructions, logp = _viterbi_segment(\n                        model, atoms, args.viterbismooth, args.viterbimaxlen)\n                    analysis = io.format_constructions(constructions, csep=csep)\n                    fobj.write(outformat.format(analysis=analysis,\n                                                compound=compound,\n                                                count=count, logprob=logp,\n                                                clogprob=clogprob))\n                i += 1\n                if i % 10000 == 0:\n                    sys.stderr.write(\".\")\n            sys.stderr.write(\"\\n\")\n        _logger.info(\"Done.\")\n\n    if args.goldstandard is not None:\n        _logger.info(\"Evaluating Model\")\n        e = MorfessorEvaluation(io.read_annotations_file(args.goldstandard))\n        result = e.evaluate_model(model, meta_data={'name': 'MODEL'})\n        print(result.format(FORMAT_STRINGS['default']))\n        _logger.info(\"Done\")\n\n\ndef get_evaluation_argparser():\n    import argparse\n    #TODO factor out redundancies with get_default_argparser()\n    standard_parser = 
get_default_argparser()\n    parser = argparse.ArgumentParser(\n        prog=\"morfessor-evaluate\",\n        epilog=\"\"\"Simple usage example:\n\n  %(prog)s gold_standard model1 model2\n\"\"\",\n        description=standard_parser.description,\n        formatter_class=argparse.RawDescriptionHelpFormatter,\n        add_help=False\n    )\n\n    add_arg = parser.add_argument_group('evaluation options').add_argument\n    add_arg('--num-samples', dest='numsamples', type=int, metavar='<int>',\n            default=10, help='number of samples to take for testing')\n    add_arg('--sample-size', dest='samplesize', type=int, metavar='<int>',\n            default=1000, help='size of each testing sample')\n\n    add_arg = parser.add_argument_group('formatting options').add_argument\n    add_arg('--format-string', dest='formatstring', metavar='<format>',\n            help='Python new style format string used to report evaluation '\n                 'results. The available variables consist of a value and an '\n                 'action separated with an underscore, e.g. fscore_avg for '\n                 'the average f-score. The available values are \"precision\", '\n                 '\"recall\", \"fscore\", \"samplesize\" and the available actions: '\n                 '\"avg\", \"max\", \"min\", \"values\", \"count\". A last meta-data '\n                 'variable (without action) is \"name\", the filename of the '\n                 'model. See also the format-template option for predefined '\n                 'strings')\n    add_arg('--format-template', dest='template', metavar='<template>',\n            default='default',\n            help='Use a template string for the format-string option. '\n                 'Available templates are: default, table and latex. 
'\n                 'If format-string is defined this option is ignored')\n\n    add_arg = parser.add_argument_group('file options').add_argument\n    add_arg('--construction-separator', dest=\"cseparator\", type=_str,\n            default=' ', metavar='<regexp>',\n            help=\"construction separator for test segmentation files\"\n                 \" (default '%(default)s')\")\n    add_arg('-e', '--encoding', dest='encoding', metavar='<encoding>',\n            help=\"encoding of input and output files (if none is given, \"\n                 \"both the local encoding and UTF-8 are tried)\")\n\n    add_arg = parser.add_argument_group('logging options').add_argument\n    add_arg('-v', '--verbose', dest=\"verbose\", type=int, default=1,\n            metavar='<int>',\n            help=\"verbose level; controls what is written to the standard \"\n                 \"error stream or log file (default %(default)s)\")\n    add_arg('--logfile', dest='log_file', metavar='<file>',\n            help=\"write log messages to file in addition to standard \"\n                 \"error stream\")\n\n    add_arg = parser.add_argument_group('other options').add_argument\n    add_arg('-h', '--help', action='help',\n            help=\"show this help message and exit\")\n    add_arg('--version', action='version',\n            version='%(prog)s ' + get_version(),\n            help=\"show version number and exit\")\n\n    add_arg = parser.add_argument\n    add_arg('goldstandard', metavar='<goldstandard>', nargs=1,\n            help='gold standard file in standard annotation format')\n    add_arg('models', metavar='<model>', nargs='+',\n            help='model files to segment (either binary or Morfessor 1.0 style'\n                 ' segmentation models).')\n    add_arg('-t', '--testsegmentation', dest='test_segmentations', default=[],\n            action='append',\n            help='Segmentation of the test set. Note that all words in the '\n                 'gold-standard must be segmented')\n\n    return parser\n\n\ndef main_evaluation(args):\n    \"\"\" Separate main for running evaluation and statistical significance\n    testing. 
Takes as argument the parsed arguments of a get_evaluation_argparser()\n    \"\"\"\n    initialize_logging(args)\n\n    io = MorfessorIO(encoding=args.encoding)\n\n    ev = MorfessorEvaluation(io.read_annotations_file(args.goldstandard[0]))\n\n    results = []\n\n    sample_size = args.samplesize\n    num_samples = args.numsamples\n\n    f_string = args.formatstring\n    if f_string is None:\n        f_string = FORMAT_STRINGS[args.template]\n\n    for f in args.models:\n        result = ev.evaluate_model(io.read_any_model(f),\n                                   configuration=EvaluationConfig(num_samples,\n                                                                  sample_size),\n                                   meta_data={'name': os.path.basename(f)})\n        results.append(result)\n        print(result.format(f_string))\n\n    io.construction_separator = args.cseparator\n    for f in args.test_segmentations:\n        segmentation = io.read_segmentation_file(f, False)\n        result = ev.evaluate_segmentation(segmentation,\n                                          configuration=\n                                          EvaluationConfig(num_samples,\n                                                           sample_size),\n                                          meta_data={'name':\n                                                     os.path.basename(f)})\n        results.append(result)\n        print(result.format(f_string))\n\n    if len(results) > 1 and num_samples > 1:\n        wsr = WilcoxonSignedRank()\n        r = wsr.significance_test(results)\n        WilcoxonSignedRank.print_table(r)\n"
  },
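  {
    "path": "examples/train_from_python.py",
    "content": "# -*- coding: utf-8 -*-\n\"\"\"Editor's sketch, not part of the original Morfessor distribution.\n\nDrives the command line interface from Python through the public\nget_default_argparser()/main() pair in morfessor.cmd. The file names\nbelow are placeholders.\n\"\"\"\nfrom morfessor.cmd import get_default_argparser, main\n\nparser = get_default_argparser()\n# Equivalent to: morfessor.py -t training_corpus.txt -S model.segm\nargs = parser.parse_args(['-t', 'training_corpus.txt',\n                          '-S', 'model.segm'])\nmain(args)\n"
  },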
  {
    "path": "morfessor/evaluation.py",
    "content": "from __future__ import print_function\n\nimport collections\nimport logging\nfrom itertools import chain, product\nimport math\nimport random\n\n_logger = logging.getLogger(__name__)\n\nEvaluationConfig = collections.namedtuple('EvaluationConfig',\n                                          ['num_samples', 'sample_size'])\n\nFORMAT_STRINGS = {\n    'default': \"\"\"Filename   : {name}\nNum samples: {samplesize_count}\nSample size: {samplesize_avg}\nF-score    : {fscore_avg:.3}\nPrecision  : {precision_avg:.3}\nRecall     : {recall_avg:.3}\"\"\",\n    'table': \"{name:10} {precision_avg:6.3} {recall_avg:6.3} {fscore_avg:6.3}\",\n    'latex': \"{name} & {precision_avg:.3} &\"\n             \" {recall_avg:.3} & {fscore_avg:.3} \\\\\\\\\"}\n\n\ndef _sample(compound_list, size, seed):\n    \"\"\"Create a specific size sample from the compound list using a specific\n    seed\"\"\"\n    return random.Random(seed).sample(compound_list, size)\n\n\nclass MorfessorEvaluationResult(object):\n    \"\"\"A MorfessorEvaluationResult is returned by a MorfessorEvaluation\n    object. It's purpose is to store the evaluation data and provide nice\n    formatting options.\n\n    Each MorfessorEvaluationResult contains the data of 1 evaluation\n    (which can have multiple samples).\n\n    \"\"\"\n\n    print_functions = {'avg': lambda x: sum(x) / len(x),\n                       'min': min,\n                       'max': max,\n                       'values': list,\n                       'count': len}\n    #TODO add maybe std as a print function?\n\n    def __init__(self, meta_data=None):\n        self.meta_data = meta_data\n\n        self.precision = []\n        self.recall = []\n        self.fscore = []\n        self.samplesize = []\n\n        self._cache = None\n\n    def __getitem__(self, item):\n        \"\"\"Provide dict style interface for all values (standard values and\n        metadata)\"\"\"\n        if self._cache is None:\n            self._fill_cache()\n\n        return self._cache[item]\n\n    def add_data_point(self, precision, recall, f_score, sample_size):\n        \"\"\"Method used by MorfessorEvaluation to add the results of a single\n        sample to the object\"\"\"\n        self.precision.append(precision)\n        self.recall.append(recall)\n        self.fscore.append(f_score)\n        self.samplesize.append(sample_size)\n\n        #clear cache\n        self._cache = None\n\n    def __str__(self):\n        \"\"\"Method for default visualization\"\"\"\n        return self.format(FORMAT_STRINGS['default'])\n\n    def _fill_cache(self):\n        \"\"\" Pre calculate all variable / function combinations and put them in\n        cache\"\"\"\n        self._cache = {'{}_{}'.format(val, func_name): func(getattr(self, val))\n                       for val in ('precision', 'recall', 'fscore',\n                                   'samplesize')\n                       for func_name, func in self.print_functions.items()}\n        self._cache.update(self.meta_data)\n\n    def _get_cache(self):\n        \"\"\" Fill the cache (if necessary) and return it\"\"\"\n        if self._cache is None:\n            self._fill_cache()\n        return self._cache\n\n    def format(self, format_string):\n        \"\"\" Format this object. The format string can contain all variables,\n        e.g. 
fscore_avg, precision_values or any item from metadata\"\"\"\n        return format_string.format(**self._get_cache())\n\n\nclass MorfessorEvaluation(object):\n    \"\"\"Do the evaluation of one model, on one testset. The basic procedure is\n    to create, in a stable manner, a number of samples and evaluate them\n    independently. The stable selection of samples makes it possible to use\n    the resulting values for pairwise statistical significance testing.\n\n    reference_annotations is a standard annotation dictionary:\n    {compound => ([annotation1], ...)}\n    \"\"\"\n    def __init__(self, reference_annotations):\n        self.reference = {}\n\n        for compound, analyses in reference_annotations.items():\n            self.reference[compound] = list(\n                tuple(self._segmentation_indices(a)) for a in analyses)\n\n        self._samples = {}\n\n    def _create_samples(self, configuration=EvaluationConfig(10, 1000)):\n        \"\"\"Create, in a stable manner, n testsets of size x as defined in\n        test_configuration\n        \"\"\"\n\n        #TODO: What is a reasonable limit to warn about a too small testset?\n        if len(self.reference) < (configuration.num_samples *\n                                  configuration.sample_size):\n            _logger.warning(\"The test set is too small for this sample size\")\n\n        compound_list = sorted(self.reference.keys())\n        self._samples[configuration] = [\n            _sample(compound_list, configuration.sample_size, i) for i in\n            range(configuration.num_samples)]\n\n    def get_samples(self, configuration=EvaluationConfig(10, 1000)):\n        \"\"\"Get a list of samples. A sample is a list of compounds.\n\n        This method is stable, so each time it is called with a specific\n        test_set and configuration it will return the same samples. 
Also, this\n        method caches the samples in the _samples variable.\n\n        \"\"\"\n        if configuration not in self._samples:\n            self._create_samples(configuration)\n        return self._samples[configuration]\n\n    def _evaluate(self, prediction):\n        \"\"\"Helper method to get the precision and recall of one sample\"\"\"\n        def calc_prop_distance(ref, pred):\n            if len(ref) == 0:\n                return 1.0\n            diff = len(set(ref) - set(pred))\n            return (len(ref) - diff) / float(len(ref))\n\n        wordlist = sorted(set(prediction.keys()) & set(self.reference.keys()))\n\n        recall_sum = 0.0\n        precis_sum = 0.0\n\n        for word in wordlist:\n            if len(word) < 2:\n                continue\n\n            recall_sum += max(calc_prop_distance(r, p)\n                              for p, r in product(prediction[word],\n                                                  self.reference[word]))\n\n            precis_sum += max(calc_prop_distance(p, r)\n                              for p, r in product(prediction[word],\n                                                  self.reference[word]))\n\n        precision = precis_sum / len(wordlist)\n        recall = recall_sum / len(wordlist)\n        f_score = 2.0 / (1.0 / precision + 1.0 / recall)\n\n        return precision, recall, f_score, len(wordlist)\n\n    @staticmethod\n    def _segmentation_indices(annotation):\n        \"\"\"Method to transform an annotation into a tuple of split indices\"\"\"\n        cur_len = 0\n        for a in annotation[:-1]:\n            cur_len += len(a)\n            yield cur_len\n\n    def evaluate_model(self, model, configuration=EvaluationConfig(10, 1000),\n                       meta_data=None):\n        \"\"\"Get the predictions for the test samples from the model and do the\n        evaluation\n\n        The meta_data object should preferably contain at least the key\n        'name'.\n\n        \"\"\"\n        if meta_data is None:\n            meta_data = {'name': 'UNKNOWN'}\n\n        mer = MorfessorEvaluationResult(meta_data)\n\n        for i, sample in enumerate(self.get_samples(configuration)):\n            _logger.debug(\"Evaluating sample %s\", i)\n            prediction = {}\n            for compound in sample:\n                prediction[compound] = [tuple(self._segmentation_indices(\n                    model.viterbi_segment(compound)[0]))]\n\n            mer.add_data_point(*self._evaluate(prediction))\n\n        return mer\n\n    def evaluate_segmentation(self, segmentation,\n                              configuration=EvaluationConfig(10, 1000),\n                              meta_data=None):\n        \"\"\"Method for evaluating an existing segmentation\"\"\"\n\n        def merge_constructions(constructions):\n            compound = constructions[0]\n            for i in range(1, len(constructions)):\n                compound = compound + constructions[i]\n            return compound\n\n        segmentation = {merge_constructions(x[1]):\n                        [tuple(self._segmentation_indices(x[1]))]\n                        for x in segmentation}\n\n        if meta_data is None:\n            meta_data = {'name': 'UNKNOWN'}\n\n        mer = MorfessorEvaluationResult(meta_data)\n\n        for i, sample in enumerate(self.get_samples(configuration)):\n            _logger.debug(\"Evaluating sample %s\", i)\n\n            prediction = {k: v for k, v in segmentation.items() if k in sample}\n            
mer.add_data_point(*self._evaluate(prediction))\n\n        return mer\n\n\nclass WilcoxonSignedRank(object):\n    \"\"\"Class for doing statistical significance testing with the Wilcoxon\n    Signed-Rank test\n\n    It implements the Pratt method for handling zero-differences and\n    applies a 0.5 continuity correction for the z-statistic.\n\n    \"\"\"\n\n    @staticmethod\n    def _wilcoxon(d, method='pratt', correction=True):\n        if method not in ('wilcox', 'pratt'):\n            raise ValueError(\"method must be 'wilcox' or 'pratt'\")\n        if method == 'wilcox':\n            d = list(filter(lambda a: a != 0, d))\n\n        count = len(d)\n\n        ranks = WilcoxonSignedRank._rankdata([abs(v) for v in d])\n        rank_sum_pos = sum(r for r, v in zip(ranks, d) if v > 0)\n        rank_sum_neg = sum(r for r, v in zip(ranks, d) if v < 0)\n\n        test = min(rank_sum_neg, rank_sum_pos)\n\n        mean = count * (count + 1) * 0.25\n        stdev = (count*(count + 1) * (2 * count + 1))\n        # compensate for duplicate ranks\n        no_zero_ranks = [r for i, r in enumerate(ranks) if d[i] != 0]\n        stdev -= 0.5 * sum(x * (x*x-1) for x in\n                           collections.Counter(no_zero_ranks).values())\n\n        stdev = math.sqrt(stdev / 24.0)\n\n        if correction:\n            correction = +0.5 if test > mean else -0.5\n        else:\n            correction = 0\n        z = (test - mean - correction) / stdev\n\n        return 2 * WilcoxonSignedRank._norm_cum_pdf(abs(z))\n\n    @staticmethod\n    def _rankdata(d):\n        od = collections.Counter()\n        for v in d:\n            od[v] += 1\n\n        rank_dict = {}\n        cur_rank = 1\n        for val, count in sorted(od.items(), key=lambda x: x[0]):\n            rank_dict[val] = (cur_rank + (cur_rank + count - 1)) / 2\n            cur_rank += count\n\n        return [rank_dict[v] for v in d]\n\n    @staticmethod\n    def _norm_cum_pdf(z):\n        \"\"\"Pure Python implementation of the standard normal upper-tail\n        probability (the complement of the cumulative distribution\n        function)\"\"\"\n        return 0.5 - 0.5 * math.erf(z / math.sqrt(2))\n\n    def significance_test(self, evaluations, val_property='fscore_values',\n                          name_property='name'):\n        \"\"\"Takes a set of evaluations (which should have the same\n        test-configuration) and calculates the p-value for the Wilcoxon signed\n        rank test\n\n        Returns a dictionary with (name1,name2) keys and p-values as values.\n        \"\"\"\n        results = {r[name_property]: r[val_property] for r in evaluations}\n        if any(len(x) < 10 for x in results.values()):\n            _logger.error(\"Too small number of samples for the Wilcoxon test\")\n            return {}\n        p = {}\n        for r1, r2 in product(results.keys(), results.keys()):\n            p[(r1, r2)] = self._wilcoxon([v1-v2\n                                          for v1, v2 in zip(results[r1],\n                                                            results[r2])])\n\n        return p\n\n    @staticmethod\n    def print_table(results):\n        \"\"\"Nicely format a results table as returned by significance_test\"\"\"\n        names = sorted(set(r[0] for r in results.keys()))\n\n        col_width = max(max(len(n) for n in names), 5)\n\n        for h in chain([\"\"], names):\n            print('{:{width}}'.format(h, width=col_width), end='|')\n        print()\n\n        for name in names:\n            print('{:{width}}'.format(name, width=col_width), end='|')\n\n            for name2 in names:\n                
print('{:{width}.5}'.format(results[(name, name2)],\n                                            width=col_width), end='|')\n            print()\n"
  },
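  {
    "path": "examples/evaluate_models.py",
    "content": "# -*- coding: utf-8 -*-\n\"\"\"Editor's sketch, not part of the original Morfessor distribution.\n\nShows the evaluation flow from morfessor.evaluation: evaluate two models\non the same stable samples, then test pairwise significance. The gold\nstandard and model file names are placeholders.\n\"\"\"\nfrom morfessor.evaluation import (EvaluationConfig, FORMAT_STRINGS,\n                                  MorfessorEvaluation, WilcoxonSignedRank)\nfrom morfessor.io import MorfessorIO\n\nio = MorfessorIO()\nev = MorfessorEvaluation(io.read_annotations_file('gold_standard.txt'))\nconfig = EvaluationConfig(num_samples=10, sample_size=1000)\n\nresults = []\nfor name in ('model1.bin', 'model2.bin'):\n    result = ev.evaluate_model(io.read_any_model(name),\n                               configuration=config,\n                               meta_data={'name': name})\n    print(result.format(FORMAT_STRINGS['table']))\n    results.append(result)\n\n# Pairwise p-values computed from the per-sample f-scores\nwsr = WilcoxonSignedRank()\nWilcoxonSignedRank.print_table(wsr.significance_test(results))\n"
  },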
  {
    "path": "morfessor/exception.py",
    "content": "from __future__ import unicode_literals\n\n\nclass MorfessorException(Exception):\n    \"\"\"Base class for exceptions in this module.\"\"\"\n    pass\n\n\nclass ArgumentException(Exception):\n    \"\"\"Exception in command line argument parsing.\"\"\"\n    pass\n\n\nclass InvalidCategoryError(MorfessorException):\n    \"\"\"Attempt to load data using a different categorization scheme.\"\"\"\n    def __init__(self, category):\n        super(InvalidCategoryError, self).__init__(\n            self, 'This model does not recognize the category {}'.format(\n                category))\n\n\nclass InvalidOperationError(MorfessorException):\n    def __init__(self, operation, function_name):\n        super(InvalidOperationError, self).__init__(\n            self, ('This model does not have a method {}, and therefore cannot'\n                   ' perform operation \"{}\"').format(function_name, operation))\n\n\nclass UnsupportedConfigurationError(MorfessorException):\n    def __init__(self, reason):\n        super(UnsupportedConfigurationError, self).__init__(\n            self, ('This operation is not supported in this program ' +\n                   'configuration. Reason: {}.').format(reason))\n\n\nclass SegmentOnlyModelException(MorfessorException):\n    def __init__(self):\n        super(SegmentOnlyModelException, self).__init__(\n            self, 'This model has been reduced to a segment-only model')\n"
  },
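  {
    "path": "examples/handle_argument_exception.py",
    "content": "# -*- coding: utf-8 -*-\n\"\"\"Editor's sketch, not part of the original Morfessor distribution.\n\nmorfessor.cmd.main() raises ArgumentException when neither a model nor\ntraining data is given; the exception can be caught like any other.\n\"\"\"\nfrom morfessor.cmd import get_default_argparser, main\nfrom morfessor.exception import ArgumentException\n\nargs = get_default_argparser().parse_args([])\ntry:\n    main(args)\nexcept ArgumentException as err:\n    print('invalid arguments: {}'.format(err))\n"
  },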
  {
    "path": "morfessor/io.py",
    "content": "import bz2\nimport codecs\nimport datetime\nimport gzip\nimport locale\nimport logging\nimport re\nimport sys\n\nfrom . import get_version\nfrom . import utils\n\ntry:\n    # In Python2 import cPickle for better performance\n    import cPickle as pickle\nexcept ImportError:\n    import pickle\n\nPY3 = sys.version_info[0] == 3\n\n_logger = logging.getLogger(__name__)\n\n\nclass MorfessorIO(object):\n    \"\"\"Definition for all input and output files. Also handles all\n    encoding issues.\n\n    The only state this class has is the separators used in the data.\n    Therefore, the same class instance can be used for initializing multiple\n    files.\n\n    \"\"\"\n\n    def __init__(self, encoding=None, construction_separator=' + ',\n                 comment_start='#', compound_separator=r'\\s+',\n                 atom_separator=None, lowercase=False):\n        self.encoding = encoding\n        self.construction_separator = construction_separator\n        self.comment_start = comment_start\n        self.compound_sep_re = re.compile(compound_separator, re.UNICODE)\n        self.atom_separator = atom_separator\n        if atom_separator is not None:\n            self._atom_sep_re = re.compile(atom_separator, re.UNICODE)\n        self.lowercase = lowercase\n        self._version = get_version()\n\n    def read_segmentation_file(self, file_name, has_counts=True, **kwargs):\n        \"\"\"Read segmentation file.\n\n        File format:\n        <count> <construction1><sep><construction2><sep>...<constructionN>\n\n        \"\"\"\n        _logger.info(\"Reading segmentations from '%s'...\", file_name)\n        for line in self._read_text_file(file_name):\n            if has_counts:\n                count, compound_str = line.split(' ', 1)\n            else:\n                count, compound_str = 1, line\n            constructions = tuple(\n                self._split_atoms(constr)\n                for constr in compound_str.split(self.construction_separator))\n            if self.atom_separator is None:\n                compound = \"\".join(constructions)\n            else:\n                compound = tuple(atom for constr in constructions\n                                 for atom in constr)\n            yield int(count), compound, constructions\n        _logger.info(\"Done.\")\n\n    def write_segmentation_file(self, file_name, segmentations, **kwargs):\n        \"\"\"Write segmentation file.\n\n        File format:\n        <count> <construction1><sep><construction2><sep>...<constructionN>\n\n        \"\"\"\n        _logger.info(\"Saving segmentations to '%s'...\", file_name)\n        with self._open_text_file_write(file_name) as file_obj:\n            d = datetime.datetime.now().replace(microsecond=0)\n            file_obj.write(\"# Output from Morfessor Baseline %s, %s\\n\" %\n                           (self._version, d))\n            for count, _, segmentation in segmentations:\n                if self.atom_separator is None:\n                    s = self.construction_separator.join(segmentation)\n                else:\n                    s = self.construction_separator.join(\n                        (self.atom_separator.join(constr)\n                         for constr in segmentation))\n                file_obj.write(\"%d %s\\n\" % (count, s))\n        _logger.info(\"Done.\")\n\n    def read_corpus_files(self, file_names):\n        \"\"\"Read one or more corpus files.\n\n        Yield for each compound found (1, compound_atoms).\n\n        \"\"\"\n        for file_name in 
file_names:\n            for item in self.read_corpus_file(file_name):\n                yield item\n\n    def read_corpus_list_files(self, file_names):\n        \"\"\"Read one or more corpus list files.\n\n        Yield for each compound found (count, compound_atoms).\n\n        \"\"\"\n        for file_name in file_names:\n            for item in self.read_corpus_list_file(file_name):\n                yield item\n\n    def read_corpus_file(self, file_name):\n        \"\"\"Read one corpus file.\n\n        For each compound, yield (1, compound_atoms).\n        After each line, yield (0, ()).\n\n        \"\"\"\n        _logger.info(\"Reading corpus from '%s'...\", file_name)\n        for line in self._read_text_file(file_name, raw=True):\n            for compound in self.compound_sep_re.split(line):\n                if len(compound) > 0:\n                    yield 1, self._split_atoms(compound)\n            yield 0, ()\n        _logger.info(\"Done.\")\n\n    def read_corpus_list_file(self, file_name):\n        \"\"\"Read a corpus list file.\n\n        Each line has the format:\n        <count> <compound>\n\n        Yield tuples (count, compound_atoms) for each compound.\n\n        \"\"\"\n        _logger.info(\"Reading corpus from list '%s'...\", file_name)\n        for line in self._read_text_file(file_name):\n            try:\n                count, compound = line.split(None, 1)\n                yield int(count), self._split_atoms(compound)\n            except ValueError:\n                yield 1, self._split_atoms(line)\n        _logger.info(\"Done.\")\n\n    def read_annotations_file(self, file_name, construction_separator=' ',\n                              analysis_sep=','):\n        \"\"\"Read an annotations file.\n\n        Each line has the format:\n        <compound> <constr1> <constr2>... 
<constrN>, <constr1>...<constrN>, ...\n\n        Return a dictionary {compound: list(analyses)}.\n\n        \"\"\"\n        annotations = {}\n        _logger.info(\"Reading annotations from '%s'...\", file_name)\n        for line in self._read_text_file(file_name):\n            compound, analyses_line = line.split(None, 1)\n\n            if compound not in annotations:\n                annotations[compound] = []\n\n            if analysis_sep is not None:\n                for analysis in analyses_line.split(analysis_sep):\n                    analysis = analysis.strip()\n                    annotations[compound].append(\n                        analysis.split(construction_separator))\n            else:\n                annotations[compound].append(\n                    analyses_line.split(construction_separator))\n\n        _logger.info(\"Done.\")\n        return annotations\n\n    def write_lexicon_file(self, file_name, lexicon):\n        \"\"\"Write all constructions and their counts to a lexicon file.\"\"\"\n        _logger.info(\"Saving model lexicon to '%s'...\", file_name)\n        with self._open_text_file_write(file_name) as file_obj:\n            for construction, count in lexicon:\n                file_obj.write(\"%d %s\\n\" % (count, construction))\n        _logger.info(\"Done.\")\n\n    def read_binary_model_file(self, file_name):\n        \"\"\"Read a pickled model from file.\"\"\"\n        _logger.info(\"Loading model from '%s'...\", file_name)\n        model = self.read_binary_file(file_name)\n        _logger.info(\"Done.\")\n        return model\n\n    @staticmethod\n    def read_binary_file(file_name):\n        \"\"\"Read a pickled object from a file.\"\"\"\n        with open(file_name, 'rb') as fobj:\n            obj = pickle.load(fobj)\n        return obj\n\n    def write_binary_model_file(self, file_name, model):\n        \"\"\"Pickle a model to a file.\"\"\"\n        _logger.info(\"Saving model to '%s'...\", file_name)\n        self.write_binary_file(file_name, model)\n        _logger.info(\"Done.\")\n\n    @staticmethod\n    def write_binary_file(file_name, obj):\n        \"\"\"Pickle an object into a file.\"\"\"\n        with open(file_name, 'wb') as fobj:\n            pickle.dump(obj, fobj, pickle.HIGHEST_PROTOCOL)\n\n    def write_parameter_file(self, file_name, params):\n        \"\"\"Write learned or estimated parameters to a file\"\"\"\n        with self._open_text_file_write(file_name) as file_obj:\n            d = datetime.datetime.now().replace(microsecond=0)\n            file_obj.write(\n                '# Parameters for Morfessor {}, {}\\n'.format(\n                    self._version, d))\n            for (key, val) in params.items():\n                file_obj.write('{}:\\t{}\\n'.format(key, val))\n\n    def read_parameter_file(self, file_name):\n        \"\"\"Read learned or estimated parameters from a file\"\"\"\n        params = {}\n        line_re = re.compile(r'^(.*)\\s*:\\s*(.*)$')\n        for line in self._read_text_file(file_name):\n            m = line_re.match(line.rstrip())\n            if m:\n                key = m.group(1)\n                val = m.group(2)\n                try:\n                    val = float(val)\n                except ValueError:\n                    pass\n                params[key] = val\n        return params\n\n    def read_any_model(self, file_name):\n        \"\"\"Read a file that is either a binary model or a Morfessor 1.0 style\n        model segmentation. 
This method cannot be used on standard input as\n        data might need to be read multiple times.\"\"\"\n        try:\n            model = self.read_binary_model_file(file_name)\n            _logger.info(\"%s was read as a binary model\", file_name)\n            return model\n        except BaseException:\n            pass\n\n        from morfessor import BaselineModel\n        model = BaselineModel()\n        model.load_segmentations(self.read_segmentation_file(file_name))\n        _logger.info(\"%s was read as a segmentation\", file_name)\n        return model\n\n    def format_constructions(self, constructions, csep=None, atom_sep=None):\n        \"\"\"Return a formatted string for a list of constructions.\"\"\"\n        if csep is None:\n            csep = self.construction_separator\n        if atom_sep is None:\n            atom_sep = self.atom_separator\n        if utils._is_string(constructions[0]):\n            # Constructions are strings\n            return csep.join(constructions)\n        else:\n            # Constructions are not strings (should be tuples of strings)\n            return csep.join(map(lambda x: atom_sep.join(x), constructions))\n\n    def _split_atoms(self, construction):\n        \"\"\"Split a construction into its atoms.\"\"\"\n        if self.atom_separator is None:\n            return construction\n        else:\n            return tuple(self._atom_sep_re.split(construction))\n\n    def _open_text_file_write(self, file_name):\n        \"\"\"Open a file for writing with the appropriate compression/encoding\"\"\"\n        if file_name == '-':\n            file_obj = sys.stdout\n            if PY3:\n                return file_obj\n        elif file_name.endswith('.gz'):\n            file_obj = gzip.open(file_name, 'wb')\n        elif file_name.endswith('.bz2'):\n            file_obj = bz2.BZ2File(file_name, 'wb')\n        else:\n            file_obj = open(file_name, 'wb')\n        if self.encoding is None:\n            # Take encoding from locale if not set so far\n            self.encoding = locale.getpreferredencoding()\n        return codecs.getwriter(self.encoding)(file_obj)\n\n    def _open_text_file_read(self, file_name):\n        \"\"\"Open a file for reading with the appropriate compression/encoding\"\"\"\n        if file_name == '-':\n            if PY3:\n                inp = sys.stdin\n            else:\n                class StdinUnicodeReader:\n                    def __init__(self, encoding):\n                        self.encoding = encoding\n                        if self.encoding is None:\n                            self.encoding = locale.getpreferredencoding()\n\n                    def __iter__(self):\n                        return self\n\n                    def next(self):\n                        l = sys.stdin.readline()\n                        if not l:\n                            raise StopIteration()\n                        return l.decode(self.encoding)\n\n                inp = StdinUnicodeReader(self.encoding)\n        else:\n            if file_name.endswith('.gz'):\n                file_obj = gzip.open(file_name, 'rb')\n            elif file_name.endswith('.bz2'):\n                file_obj = bz2.BZ2File(file_name, 'rb')\n            else:\n                file_obj = open(file_name, 'rb')\n            if self.encoding is None:\n                # Try to determine encoding if not set so far\n                self.encoding = self._find_encoding(file_name)\n            inp = codecs.getreader(self.encoding)(file_obj)\n        return 
inp\n\n    def _read_text_file(self, file_name, raw=False):\n        \"\"\"Read a text file with the appropriate compression and encoding.\n\n        Comments and empty lines are skipped unless raw is True.\n\n        \"\"\"\n        inp = self._open_text_file_read(file_name)\n        try:\n            for line in inp:\n                line = line.rstrip()\n                if not raw and \\\n                   (len(line) == 0 or line.startswith(self.comment_start)):\n                    continue\n                if self.lowercase:\n                    yield line.lower()\n                else:\n                    yield line\n        except KeyboardInterrupt:\n            if file_name == '-':\n                _logger.info(\"Finished reading from stdin\")\n                return\n            else:\n                raise\n\n    @staticmethod\n    def _find_encoding(*files):\n        \"\"\"Test default encodings on reading files.\n\n        If no encoding is given, this method can be used to test which\n        of the default encodings would work.\n\n        \"\"\"\n        test_encodings = ['utf-8', locale.getpreferredencoding()]\n        for encoding in test_encodings:\n            ok = True\n            for f in files:\n                if f == '-':\n                    continue\n                try:\n                    if f.endswith('.gz'):\n                        file_obj = gzip.open(f, 'rb')\n                    elif f.endswith('.bz2'):\n                        file_obj = bz2.BZ2File(f, 'rb')\n                    else:\n                        file_obj = open(f, 'rb')\n\n                    for _ in codecs.getreader(encoding)(file_obj):\n                        pass\n                except UnicodeDecodeError:\n                    ok = False\n                    break\n            if ok:\n                _logger.info(\"Detected %s encoding\", encoding)\n                return encoding\n\n        raise UnicodeError(\"Cannot determine encoding of input files\")\n"
  },
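  {
    "path": "docs/examples/io_formats.py",
    "content": "\"\"\"Illustrative usage sketch -- NOT part of the original Morfessor\ndistribution. It demonstrates the text formats documented in the\nMorfessorIO docstrings above. The file names are hypothetical, and it\nassumes MorfessorIO is exported at the package level.\n\"\"\"\n\nimport morfessor\n\nio = morfessor.MorfessorIO()\n\n# Corpus list format (read_corpus_list_file): '<count> <compound>' per\n# line; a line without a count defaults to count 1.\nwith open('corpus_list.txt', 'w') as fobj:\n    fobj.write('5 kahvikuppi\\n3 kahvi\\nkuppi\\n')\nfor count, atoms in io.read_corpus_list_file('corpus_list.txt'):\n    print(count, atoms)  # yields (5, 'kahvikuppi'), (3, 'kahvi'), (1, 'kuppi')\n\n# Annotation format (read_annotations_file): a compound followed by one\n# or more analyses, with alternative analyses separated by commas.\nwith open('annotations.txt', 'w') as fobj:\n    fobj.write('kahvikuppi kahvi kuppi, kahvikuppi\\n')\nannotations = io.read_annotations_file('annotations.txt')\nprint(annotations['kahvikuppi'])  # -> [['kahvi', 'kuppi'], ['kahvikuppi']]\n"
  },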
  {
    "path": "morfessor/test/__init__.py",
    "content": "__author__ = 'psmit'\n"
  },
  {
    "path": "morfessor/test/evaluation.py",
    "content": "import unittest\nimport itertools\n\nfrom morfessor.evaluation import WilcoxonSignedRank\n\nclass TestWilcoxon(unittest.TestCase):\n    def setUp(self):\n        self.obj = WilcoxonSignedRank()\n\n    def test_norm_cum_pdf(self):\n        self.assertAlmostEqual(self.obj._norm_cum_pdf(1.9599639845400), 0.025)\n\n    def test_accuracy_wilcoxon(self):\n        #Same tests as used for scipy.stats.morestats\n        freq = [1, 4, 16, 15, 8, 4, 5, 1, 2]\n        nums = range(-4, 5)\n        x = list(itertools.chain(*[[u] * v for u, v in zip(nums, freq)]))\n\n        self.assertEqual(len(x), 56)\n\n        p = self.obj._wilcoxon(x, correction=False)\n        self.assertAlmostEqual(p, 0.00197547303533107)\n\n        p = self.obj._wilcoxon(x, \"wilcox\", correction=False)\n        self.assertAlmostEqual(p, 0.00641346115861)\n\n        x = [120, 114, 181, 188, 180, 146, 121, 191, 132, 113, 127, 112]\n        y = [133, 143, 119, 189, 112, 199, 198, 113, 115, 121, 142, 187]\n\n        p = self.obj._wilcoxon([(a - b) for a, b in zip(x, y)])\n        self.assertAlmostEqual(p, 0.7240817)\n        p = self.obj._wilcoxon([(a - b) for a, b in zip(x, y)], correction=False)\n        self.assertAlmostEqual(p, 0.6948866)\n\n\n    def test_wilcoxon_tie(self):\n        #Same tests as used for scipy.stats.morestats\n\n        p = self.obj._wilcoxon([0.1] * 10, correction=False)\n        self.assertAlmostEqual(p, 0.001565402)\n\n        p = self.obj._wilcoxon([0.1] * 10)\n        self.assertAlmostEqual(p, 0.001904195)\n\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "morfessor/utils.py",
    "content": "\"\"\"Data structures and functions of general utility,\nshared between different modules and variants of the software.\n\"\"\"\n\nimport logging\nimport math\nimport random\nimport sys\nimport types\n\n\nPY3 = sys.version_info[0] == 3\n\n\ndef _dummy_lru_cache(*args, **kwargs):\n    return lambda func: func\n\n\nif PY3:\n    from functools import lru_cache\nelse:\n    try:\n        # Backport for lru_cache\n        from backports.functools_lru_cache import lru_cache\n    except ImportError:\n        logging.warning(\n            \"LRU cache disabled, install backports.functools_lru_cache to enable.\")\n        lru_cache = _dummy_lru_cache\n\n\nLOGPROB_ZERO = 1000000\n\n\n# Progress bar for generators (length unknown):\n# Print a dot for every GENERATOR_DOT_FREQ:th dot.\n# Set to <= 0 to disable progress bar.\nGENERATOR_DOT_FREQ = 500\n\n\nshow_progress_bar = True\n\n\ndef _progress(iter_func):\n    \"\"\"Decorator/function for displaying a progress bar when iterating\n    through a list.\n\n    iter_func can be both a function providing a iterator (for decorator\n    style use) or an iterator itself.\n\n    No progressbar is displayed when the show_progress_bar variable is set to\n     false.\n\n    If the progressbar module is available a fancy percentage style\n    progressbar is displayed. Otherwise 60 dots are printed as indicator.\n\n    \"\"\"\n\n    if not show_progress_bar:\n        return iter_func\n\n    #Try to see or the progressbar module is available, else fabricate our own\n    try:\n        from progressbar import ProgressBar\n    except ImportError:\n        class SimpleProgressBar:\n            \"\"\"Create a simple progress bar that prints 60 dots on a single\n            line, proportional to the progress \"\"\"\n            NUM_DOTS = 60\n\n            def __init__(self):\n                self.it = None\n                self.dotfreq = 100\n                self.i = 0\n\n            def __call__(self, it):\n                self.it = iter(it)\n                self.i = 0\n\n                # Dot frequency is determined as ceil(len(it) / NUM_DOTS)\n                self.dotfreq = (len(it) + self.NUM_DOTS - 1) // self.NUM_DOTS\n                if self.dotfreq < 1:\n                    self.dotfreq = 1\n\n                return self\n\n            def __iter__(self):\n                return self\n\n            def __next__(self):\n                self.i += 1\n                if self.i % self.dotfreq == 0:\n                    sys.stderr.write('.')\n                    sys.stderr.flush()\n                try:\n                    return next(self.it)\n                except StopIteration:\n                    sys.stderr.write('\\n')\n                    raise\n\n            #Needed to be compatible with both Python2 and 3\n            next = __next__\n\n        ProgressBar = SimpleProgressBar\n\n    # In case of a decorator (argument is a function),\n    # wrap the functions result in a ProgressBar and return the new function\n    if isinstance(iter_func, types.FunctionType):\n        def i(*args, **kwargs):\n            if logging.getLogger(__name__).isEnabledFor(logging.INFO):\n                return ProgressBar()(iter_func(*args, **kwargs))\n            else:\n                return iter_func(*args, **kwargs)\n        return i\n\n    #In case of an iterator, wrap it in a ProgressBar and return it.\n    elif hasattr(iter_func, '__iter__'):\n        return ProgressBar()(iter_func)\n\n    #If all else fails, just return the original.\n    return 
iter_func\n\n\nclass Sparse(dict):\n    \"\"\"A defaultdict-like data structure, which tries to remain as sparse\n    as possible. If a value becomes equal to the default value, both it and\n    the key associated with it are transparently removed.\n\n    Only supports immutable values, e.g. namedtuples.\n    \"\"\"\n\n    def __init__(self, *pargs, **kwargs):\n        \"\"\"Create a new Sparse data structure.\n        Keyword arguments:\n            default: Default value. Unlike defaultdict this should be a\n                       prototype immutable, not a factory.\n        \"\"\"\n\n        self._default = kwargs.pop('default')\n        dict.__init__(self, *pargs, **kwargs)\n\n    def __getitem__(self, key):\n        try:\n            return dict.__getitem__(self, key)\n        except KeyError:\n            return self._default\n\n    def __setitem__(self, key, value):\n        # attribute check is necessary for unpickling\n        if '_default' in self.__dict__ and value == self._default:\n            if key in self:\n                del self[key]\n        else:\n            dict.__setitem__(self, key, value)\n\n\ndef ngrams(sequence, n=2):\n    \"\"\"Yield all ngram tokens in an input sequence, for a specified n.\n    E.g. ngrams(['A', 'B', 'A', 'B', 'D'], n=2) yields\n    ('A', 'B'), ('B', 'A'), ('A', 'B'), ('B', 'D')\n    \"\"\"\n\n    window = []\n    for item in sequence:\n        window.append(item)\n        if len(window) > n:\n            # trim back to size\n            window = window[-n:]\n        if len(window) == n:\n            yield tuple(window)\n\n\ndef minargmin(sequence):\n    \"\"\"Returns the minimum value and the first index at which it can be\n    found in the input sequence.\"\"\"\n    best = (None, None)\n    for (i, value) in enumerate(sequence):\n        if best[0] is None or value < best[0]:\n            best = (value, i)\n    return best\n\n\ndef zlog(x):\n    \"\"\"Negative logarithm which uses the constant LOGPROB_ZERO for x == 0\n    instead of infinity\"\"\"\n    assert x >= 0.0\n    if x == 0:\n        return LOGPROB_ZERO\n    return -math.log(x)\n\n\ndef _nt_zeros(constructor, zero=0):\n    \"\"\"Convenience function to return a namedtuple initialized to zeros,\n    without needing to know the number of fields.\"\"\"\n    zeros = [zero] * len(constructor._fields)\n    return constructor(*zeros)\n\n\ndef weighted_sample(data, num_samples):\n    \"\"\"Samples with replacement from the data set so that the probability\n    of each data point being selected is proportional to the occurrence count.\n    Arguments:\n        data: A list of tuples (weight, ...)\n        num_samples: The number of samples to return\n    Returns:\n        a sorted list of indices to data\n    \"\"\"\n    tokens = sum(x[0] for x in data)\n    token_indices = sorted([random.randint(0, tokens - 1)\n                            for _ in range(num_samples)])\n\n    data_indices = []\n    d = enumerate(x[0] for x in data)\n    di = 0\n    ti = -1\n    for sample_token_index in token_indices:\n        while ti < sample_token_index:\n            (di, weight) = next(d)\n            ti += weight\n        data_indices.append(di)\n    return data_indices\n\n\ndef _generator_progress(generator):\n    \"\"\"Prints a progress bar for visualizing flow through a generator.\n    The length of a generator is not known in advance, so the bar has\n    no fixed length. 
GENERATOR_DOT_FREQ controls the frequency of dots.\n\n    This function wraps the argument generator, returning a new generator.\n    \"\"\"\n\n    if GENERATOR_DOT_FREQ <= 0:\n        return generator\n\n    def _progress_wrapper(generator):\n        for (i, x) in enumerate(generator):\n            if i % GENERATOR_DOT_FREQ == 0:\n                sys.stderr.write('.')\n                sys.stderr.flush()\n            yield x\n        sys.stderr.write('\\n')\n\n    return _progress_wrapper(generator)\n\n\ndef _is_string(obj):\n    try:\n        # Python 2\n        return isinstance(obj, basestring)\n    except NameError:\n        # Python 3\n        return isinstance(obj, str)\n"
  },
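  {
    "path": "docs/examples/utils_sketch.py",
    "content": "\"\"\"Illustrative sketch -- NOT part of the original Morfessor\ndistribution. Exercises the helpers defined in morfessor/utils.py above;\nall data values are made up for demonstration.\n\"\"\"\n\nfrom morfessor.utils import LOGPROB_ZERO, Sparse, minargmin, ngrams, zlog\n\n# ngrams() slides a window of size n over the sequence.\nassert list(ngrams(['A', 'B', 'A', 'B', 'D'], n=2)) == [('A', 'B'), ('B', 'A'), ('A', 'B'), ('B', 'D')]\n\n# minargmin() returns (minimum value, first index of that value).\nassert minargmin([3.0, 1.0, 2.0, 1.0]) == (1.0, 1)\n\n# zlog() is a negative logarithm with a finite stand-in for log(0).\nassert zlog(1.0) == 0.0\nassert zlog(0.0) == LOGPROB_ZERO\n\n# Sparse drops entries that are set (back) to the default value.\ncounts = Sparse(default=0)\ncounts['stm'] = 2\ncounts['stm'] = 0          # equal to the default -> removed again\nassert 'stm' not in counts\nassert counts['stm'] == 0  # missing keys read as the default\n"
  },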
  {
    "path": "scripts/morfessor",
    "content": "#!/usr/bin/env python\n\nimport sys\n\nimport morfessor\nfrom morfessor import _logger\n\n\ndef main(argv):\n    parser = morfessor.get_default_argparser()\n    try:\n        args = parser.parse_args(argv)\n        morfessor.main(args)\n    except morfessor.ArgumentException as e:\n        parser.error(e)\n    except Exception as e:\n        _logger.error(\"Fatal Error %s %s\" % (type(e), e))\n        raise\n\n\nif __name__ == \"__main__\":\n    main(sys.argv[1:])\n"
  },
  {
    "path": "scripts/morfessor-evaluate",
    "content": "#!/usr/bin/env python\n\nimport sys\n\nimport morfessor\nimport morfessor.evaluation\n\nfrom morfessor import _logger\n\n\ndef main(argv):\n    parser = morfessor.get_evaluation_argparser()\n    try:\n        args = parser.parse_args(argv)\n        morfessor.main_evaluation(args)\n    except morfessor.ArgumentException as e:\n        parser.error(e)\n    except Exception as e:\n        _logger.error(\"Fatal Error %s %s\" % (type(e), e))\n        raise\n\nif __name__ == \"__main__\":\n    main(sys.argv[1:])\n"
  },
  {
    "path": "scripts/morfessor-segment",
    "content": "#!/usr/bin/env python\n\nimport argparse\nimport sys\n\nimport morfessor\nfrom morfessor import _logger\n\n\ndef main(argv):\n    parser = morfessor.get_default_argparser()\n    parser.prog = \"morfessor-segment\"\n    parser.epilog = \"\"\"\nSimple usage example (load model.pickled and use it to segment test corpus):\n\n  %(prog)s -l model.pickled -o test_corpus.segmented test_corpus.txt\n\nInteractive use (read corpus from user):\n\n  %(prog)s -l model.pickled -\n\n\"\"\"\n\n    keep_options = ['encoding', 'loadfile', 'loadsegfile', 'outfile']\n                    # FIXME Disabled to work around an argparse bug\n                    #'help', 'version']\n    for action_group in parser._action_groups:\n        for arg in action_group._group_actions:\n            if arg.dest not in keep_options:\n                arg.help = argparse.SUPPRESS\n\n    parser.add_argument('testfiles', metavar='<file>', nargs='+',\n                        help='corpus files to segment')\n\n    try:\n        args = parser.parse_args(argv)\n        morfessor.main(args)\n    except morfessor.ArgumentException as e:\n        parser.error(e)\n    except Exception as e:\n        _logger.error(\"Fatal Error %s %s\" % (type(e), e))\n        raise\n\n\nif __name__ == \"__main__\":\n    main(sys.argv[1:])\n"
  },
  {
    "path": "scripts/morfessor-train",
    "content": "#!/usr/bin/env python\n\nimport argparse\nimport sys\n\nimport morfessor\nfrom morfessor import _logger\n\n\ndef main(argv):\n    parser = morfessor.get_default_argparser()\n    parser.prog = \"morfessor-train\"\n    parser.epilog = \"\"\"\nSimple usage example (train a model and save it to model.pickled):\n\n  %(prog)s -s model.pickled training_corpus.txt\n\nInteractive use (read corpus from user):\n\n  %(prog)s -m online -v 2 -\n\n\"\"\"\n\n    keep_options = ['savesegfile', 'savefile', 'trainmode', 'dampening',\n                    'encoding', 'list', 'skips', 'annofile', 'develfile',\n                    'fullretrain', 'threshold', 'morphtypes', 'morphlength',\n                    'corpusweight', 'annotationweight', 'help', 'version']\n    for action_group in parser._action_groups:\n        for arg in action_group._group_actions:\n            if arg.dest not in keep_options:\n                arg.help = argparse.SUPPRESS\n\n    parser.add_argument('trainfiles', metavar='<file>', nargs='+',\n                        help='training data files')\n\n    try:\n        args = parser.parse_args(argv)\n        morfessor.main(args)\n    except morfessor.ArgumentException as e:\n        parser.error(e)\n    except Exception as e:\n        _logger.error(\"Fatal Error {} {}\".format(type(e), e))\n        raise\n\n\nif __name__ == \"__main__\":\n    main(sys.argv[1:])\n"
  },
  {
    "path": "scripts/tools/morphlength_from_annotations.py",
    "content": "from __future__ import division\nimport fileinput\n\n\ndef main():\n    tot_morph_count = 0\n    tot_length = 0\n\n    for line in fileinput.input():\n        word, segm = line.strip().split(None, 1)\n        segmentations = segm.split(',')\n        num_morphs = [len([x for x in s.split(None) if x.strip().strip(\"~\") != \"\"]) for s in segmentations]\n\n        tot_morph_count += sum(num_morphs) / len(num_morphs)\n        tot_length += len(word)\n\n    print(tot_length / tot_morph_count)\n\n\nif __name__ == \"__main__\":\n    main()"
  },
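  {
    "path": "docs/examples/morphlength_sketch.py",
    "content": "\"\"\"Illustrative sketch -- NOT part of the original Morfessor\ndistribution. A worked example of the average-morph-length computation\nperformed by scripts/tools/morphlength_from_annotations.py above; the\nannotation data is made up.\n\"\"\"\n\nfrom __future__ import division\n\n# Annotation line: 'kahvikuppi kahvi kuppi, kahvikuppi'\n# The alternative analyses have 2 and 1 morphs, so this word contributes\n# an averaged morph count of (2 + 1) / 2 = 1.5 and 10 letters.\nword = 'kahvikuppi'\nsegmentations = ['kahvi kuppi', 'kahvikuppi']\nnum_morphs = [len(s.split()) for s in segmentations]\n\ntot_morph_count = sum(num_morphs) / len(num_morphs)  # 1.5\ntot_length = len(word)                               # 10\n\n# The script prints total letters / averaged morph count.\nprint(tot_length / tot_morph_count)  # -> 6.666...\n"
  },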
  {
    "path": "setup.py",
    "content": "#!/usr/bin/env python\n\nfrom codecs import open\nfrom ez_setup import use_setuptools\nuse_setuptools()\n\nfrom setuptools import setup\n\nimport re\nmain_py = open('morfessor/__init__.py', encoding='utf-8').read()\nmetadata = dict(re.findall(\"__([a-z]+)__ = '([^']+)'\", main_py))\n\nrequires = [\n    #    'progressbar',\n]\n\nsetup(name='Morfessor',\n      version=metadata['version'],\n      author=metadata['author'],\n      author_email='morpho@aalto.fi',\n      url='http://morpho.aalto.fi/projects/morpho/morfessor2.html',\n      description='A tool for unsupervised and semi-supervised morphological segmentation',\n      packages=['morfessor', 'morfessor.test'],\n      classifiers=[\n          'Development Status :: 4 - Beta',\n          'Intended Audience :: Science/Research',\n          'License :: OSI Approved :: BSD License',\n          'Operating System :: OS Independent',\n          'Programming Language :: Python',\n          'Topic :: Scientific/Engineering',\n      ],\n      license=\"BSD\",\n      scripts=['scripts/morfessor',\n               'scripts/morfessor-train',\n               'scripts/morfessor-segment',\n               'scripts/morfessor-evaluate',\n               ],\n      install_requires=requires,\n      extras_require={\n          'docs': [l.strip() for l in open('docs/build_requirements.txt')]\n      }\n      )\n"
  }
]