Repository: Kensuke-Mitsuzawa/JapaneseTokenizers
Branch: master
Commit: 3bdfb6be73de
Files: 51
Total size: 140.3 KB

Directory structure:
gitextract_jdhxzz3y/

├── .gitignore
├── .travis.yml
├── JapaneseTokenizer/
│   ├── __init__.py
│   ├── common/
│   │   ├── __init__.py
│   │   ├── juman_utils.py
│   │   ├── sever_handler.py
│   │   ├── text_preprocess.py
│   │   └── timeout_handler.py
│   ├── datamodels.py
│   ├── init_logger.py
│   ├── juman_wrapper/
│   │   ├── __init__.py
│   │   └── juman_wrapper.py
│   ├── jumanpp_wrapper/
│   │   ├── __init__.py
│   │   └── jumanpp_wrapper.py
│   ├── kytea_wrapper/
│   │   ├── __init__.py
│   │   └── kytea_wrapper.py
│   ├── mecab_wrapper/
│   │   ├── __init__.py
│   │   └── mecab_wrapper.py
│   └── object_models.py
├── LICENSE.txt
├── MANIFEST.in
├── Makefile
├── README.md
├── examples/
│   ├── examples.py
│   ├── userdict.csv
│   └── userdict.dict
├── install_tokenizers.sh
├── setup.py
├── test/
│   ├── Dockerfile
│   ├── Dockerfile-dev
│   ├── __init__.py
│   ├── common/
│   │   ├── __init__.py
│   │   └── test_server_handler.py
│   ├── docker-compose-dev.yml
│   ├── docker-compose.yml
│   ├── requirements_py2.txt
│   ├── requirements_py3.txt
│   ├── resources/
│   │   └── test/
│   │       ├── userdict.csv
│   │       └── userdict.dict
│   ├── test_all.py
│   ├── test_filter_python2.py
│   ├── test_filter_python3.py
│   ├── test_juman_wrapper_python2.py
│   ├── test_juman_wrapper_python3.py
│   ├── test_jumanpp_wrapper_python2.py
│   ├── test_jumanpp_wrapper_python3.py
│   ├── test_kytea_wrapper_python2.py
│   ├── test_kytea_wrapper_python3.py
│   ├── test_mecab_wrapper_python2.py
│   └── test_mecab_wrapper_python3.py
└── travis-mecab-install.sh

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
.idea/
JapaneseTokenizer.egg-info/
build/
dist/
*eggs/
pyknp.egg-info/
.python-version
*pyc
morphogySplitters/
Mykytea-python/
.DS_Store
*tox
.cache/
python/
python2/

================================================
FILE: .travis.yml
================================================
language: python
python:
  - 2.7
  - 3.5
addons:
  apt:
    packages:
    - git
    - make
    - curl
    - xz-utils
    - file
    - pandoc
    - libboost-all-dev
    - language-pack-ja-base
    - language-pack-ja
    - ibus-mozc
    - gcc-5
    - g++-5
    - build-essential
    - swig
    sources:
    - ubuntu-toolchain-r-test
before_install:
  - sudo apt-get update -qq
  - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1
  - sudo update-locale LANG=ja_JP.UTF-8 LANGUAGE="ja_JP:ja"
  - mkdir ./target
  - export CC="gcc-5"
  - export CXX="g++-5"
  - export CFLAGS=-std=c++11
  - export CXXFLAGS=-std=c++11
  - sudo bash travis-mecab-install.sh
  - which mecab-config
  - sudo make install
  - git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
  - cd mecab-ipadic-neologd && echo yes | sudo ./bin/install-mecab-ipadic-neologd && cd ../
  - sudo juman -S
install:
  - python --version
  - python setup.py install
  - pip install coveralls coverage nose
script:
  - coverage run --source=JapaneseTokenizer setup.py test
after_success:
  - coveralls
notifications:
  email:
    recipients:
      - kensuke.mit@gmail.com
    on_success: always
    on_failure: always

================================================
FILE: JapaneseTokenizer/__init__.py
================================================
from JapaneseTokenizer.mecab_wrapper import MecabWrapper
from JapaneseTokenizer.juman_wrapper import JumanWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence
from JapaneseTokenizer.datamodels import FilteredObject
from JapaneseTokenizer.kytea_wrapper import KyteaWrapper
from JapaneseTokenizer.jumanpp_wrapper import JumanppWrapper

================================================
FILE: JapaneseTokenizer/common/__init__.py
================================================
__author__ = 'kensuke-mi'


================================================
FILE: JapaneseTokenizer/common/juman_utils.py
================================================
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence
from typing import Tuple
import pyknp
from six import text_type

"""These functions are for utilization of Juman"""


def extract_morphological_information(mrph_object, is_feature, is_surface):
    # type: (pyknp.Morpheme, bool, bool) -> TokenizedResult
    """Build a TokenizedResult from one pyknp morpheme.

    * Parameters
    - mrph_object: pyknp.Morpheme to read from.
    - is_feature: stored on the result as ``is_feature``.
    - is_surface: stored on the result as ``is_surface``.

    * Returns
    - TokenizedResult whose surface is ``midasi``, whose stem is ``genkei``,
      whose POS tuple is ``(hinsi, bunrui)`` and whose ``misc_info`` holds
      ``katuyou1``, ``katuyou2``, ``imis`` and ``repname``.
    """
    assert isinstance(mrph_object, pyknp.Morpheme)
    assert isinstance(is_feature, bool)
    assert isinstance(is_surface, bool)

    # Juman has no MeCab node, hence node_obj stays None.
    return TokenizedResult(
        node_obj=None,
        tuple_pos=(mrph_object.hinsi, mrph_object.bunrui),
        word_stem=mrph_object.genkei,
        word_surface=mrph_object.midasi,
        is_feature=is_feature,
        is_surface=is_surface,
        misc_info={
            'katuyou1': mrph_object.katuyou1,
            'katuyou2': mrph_object.katuyou2,
            'imis': mrph_object.imis,
            'repname': mrph_object.repname,
        },
    )


def feature_parser(uni_feature, word_surface):
    # type: (text_type, text_type) -> Tuple[Tuple[text_type, text_type, text_type], text_type]
    """Parse a comma-separated, MeCab-style feature string.

    :param uni_feature: feature string such as "pos1,pos2,pos3,...".
    :param word_surface: surface form of the word; used as the stem whenever
        the feature string does not have exactly 9 fields.
    :return: ``((pos1, pos2, pos3), word_stem)``. When the feature string has
        no comma at all, the pair ``('*', '*')`` is returned instead.
    """
    feature_items = uni_feature.split(',')
    # The word has no feature at all.
    if len(feature_items) == 1:
        return '*', '*'

    # A short feature string (e.g. only two fields) used to raise IndexError;
    # missing POS levels are filled with the '*' placeholder instead.
    padded_items = feature_items + ['*'] * (3 - len(feature_items))
    tuple_pos = tuple(padded_items[:3])

    if len(feature_items) == 9:
        # Standard 9-field dictionary output: the 7th field is the stem.
        word_stem = feature_items[6]
    else:
        # Any other layout (e.g. user dictionary): fall back to the surface.
        word_stem = word_surface

    return tuple_pos, word_stem


================================================
FILE: JapaneseTokenizer/common/sever_handler.py
================================================
#! -*- coding: utf-8 -*-
import subprocess
from subprocess import Popen, PIPE, STDOUT
import multiprocessing
# socket object
import socket
# logger
from JapaneseTokenizer import init_logger
import logging
# typing
from typing import Union
# else
from six import text_type
import six
import pexpect
import shutil
import signal
import os
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))


class ProcessDownException(Exception):
    """Raised when a query to the wrapped process exceeds the timeout."""


class UnixProcessHandler(object):
    """Keep a command-line process alive and exchange lines with it via pexpect.

    The process is spawned once in the constructor; ``query()`` sends one line
    and collects the output up to the line equal to ``pattern``.
    """

    def __init__(self,
                 command,
                 option=None,
                 pattern='EOS',
                 timeout_second=10):
        # type: (text_type,text_type,text_type,int)->None
        """* Parameters
        - command: command to spawn.
        - option: option string appended to the command (separated by a space).
        - pattern: line which marks the end of one answer of the process.
        - timeout_second: seconds a single query may take before
          ProcessDownException is raised.
        """
        self.command = command
        self.timeout_second = timeout_second
        self.pattern = pattern
        self.option = option
        self.launch_process(command)

    def __del__(self):
        # The attribute is missing when launch_process() failed.
        if hasattr(self, "process_analyzer"):
            self.process_analyzer.kill(sig=9)

    def _command_with_option(self):
        # type: ()->text_type
        """Return the command line: the command plus the option, if any."""
        if self.option is None:
            return self.command
        return self.command + " " + self.option

    def launch_process(self, command):
        # type: (Union[bytes,text_type])->None
        """* What you can do
        - It starts process and keep it.

        * Raises
        - Exception: if the command is not available.
        """
        if six.PY3:
            is_available = shutil.which(command) is not None
        else:
            # NOTE(review): python2 has no shutil.which, so the command is
            # probed through a shell string; `command` must be trusted input.
            is_available = os.system("echo '' | {}".format(command)) == 0

        if not is_available:
            raise Exception("No command at {}".format(command))

        self.process_analyzer = pexpect.spawnu(self._command_with_option())
        self.process_id = self.process_analyzer.pid

    def restart_process(self):
        # type: ()->None
        """Kill the current process and spawn a fresh one with the same command."""
        self.process_analyzer.kill(sig=9)
        self.process_analyzer = pexpect.spawnu(self._command_with_option())
        self.process_id = self.process_analyzer.pid

    def stop_process(self):
        # type: ()->bool
        """* What you can do
        - You're able to stop the process which this instance has now.

        * Returns
        - Always True.
        """
        if hasattr(self, "process_analyzer"):
            self.process_analyzer.kill(sig=9)

        return True

    def __query(self, input_string):
        # type: (text_type)->text_type
        """* What you can do
        - It sends input_string to the process and returns its answer,
          including the terminating pattern line.
        - This function monitors time which takes for getting the result.

        * Raises
        - ProcessDownException: if no answer arrives within timeout_second.
        """
        signal.signal(signal.SIGALRM, self.__notify_handler)
        signal.alarm(self.timeout_second)
        try:
            self.process_analyzer.sendline(input_string)
            collected = ""
            while True:
                line_string = self.process_analyzer.readline()  # type: text_type
                stripped = line_string.strip()
                if stripped == input_string:
                    # Skip if process returns the same input string
                    continue
                collected += line_string
                if stripped == self.pattern:
                    return collected
        finally:
            # Always cancel the alarm: otherwise an error raised by
            # sendline()/readline() leaves a pending SIGALRM which fires
            # later, in unrelated code.
            signal.alarm(0)

    def __notify_handler(self, signum, frame):
        """SIGALRM handler: report that the query took too long."""
        raise ProcessDownException("""It takes longer time than {time} seconds. You're able to try, 
        1. Change your setting of 'timeout_second' parameter
        2. Run restart_process() method when the exception happens.""".format(**{"time": self.timeout_second}))

    def query(self, input_string):
        # type: (text_type)->text_type
        """Public entry point of __query(); see there for details."""
        return self.__query(input_string=input_string)


class JumanppHnadler(UnixProcessHandler):
    """UnixProcessHandler whose constructor names the command `jumanpp_command`."""

    def __init__(self,
                 jumanpp_command,
                 option = None,
                 pattern = 'EOS',
                 timeout_second = 10):
        # type: (text_type,text_type,text_type,int)->None
        """Forward every argument unchanged to UnixProcessHandler.__init__."""
        super(JumanppHnadler, self).__init__(
            jumanpp_command,
            option,
            pattern,
            timeout_second
        )

    def launch_jumanpp_process(self, command):
        # type: (text_type)->None
        """Alias of launch_process()."""
        return self.launch_process(command)


================================================
FILE: JapaneseTokenizer/common/text_preprocess.py
================================================
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from six import text_type
import jaconv
import six
import re
import unicodedata
from JapaneseTokenizer import init_logger
import logging
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
__author__ = 'kensuke-mi'

if six.PY2:
    def u(str):
        """Decode a utf-8 byte string into unicode (python2)."""
        return str.decode("utf-8")

    def b(str):
        """Return the argument unchanged (python2)."""
        return str
else: # python3
    def u(str):
        """Return the argument unchanged (python3)."""
        return str

    def b(str):
        """Encode a text string into utf-8 bytes (python3)."""
        return str.encode("utf-8")

# neologdn is optional: remember whether it is importable so that
# normalize_text() can refuse the 'neologd' mode when it is missing.
try:
    import neologdn
    is_neologdn_valid = True
except ImportError:
    # Only a failed import is expected here; a bare `except:` would also
    # swallow KeyboardInterrupt and SystemExit.
    logger.warning("neologdn package is not installed yet. You could not call neologd dictionary.")
    is_neologdn_valid = False

# Strings which denormalize_text() returns untouched.
STRING_EXCEPTION = {u('*')}


def denormalize_text(input_text):
    # type: (text_type)->text_type
    """* What you can do
    - It converts text with jaconv.z2h (kana=False, ascii=True, digit=True).

    * Note
    - A string listed in STRING_EXCEPTION is returned as it is.
    - kana conversion is switched off; ascii and digit conversion are on.
    """
    if input_text not in STRING_EXCEPTION:
        return jaconv.z2h(input_text, kana=False, ascii=True, digit=True)

    return input_text


def normalize_text(input_text,
                   dictionary_mode='ipadic',
                   new_line_replaced='。',
                   is_replace_eos=True,
                   is_kana=True,
                   is_ascii=True,
                   is_digit=True):
    # type: (text_type,text_type,text_type,bool,bool,bool,bool)->text_type
    """* What you can do
    - It converts input-text into normalized-text which is good for tokenizer input.

    * Params
    - input_text: text to normalize.
    - dictionary_mode: 'neologd' additionally runs neologdn.normalize on the
      result; any other value uses normalize_text_normal_ipadic only.
    - new_line_replaced: a string which replaces from \n string.
    - is_replace_eos: if False, newlines are left as they are.
    - is_kana, is_ascii, is_digit: passed to normalize_text_normal_ipadic
      (only when dictionary_mode is not 'neologd').

    * Raises
    - Exception: dictionary_mode is 'neologd' but neologdn is not installed.
    """
    if is_replace_eos:
        without_new_line = input_text.replace('\n', new_line_replaced)
    else:
        # Keep the text untouched. The previous code assigned
        # new_line_replaced here, which threw the whole input away.
        without_new_line = input_text

    if dictionary_mode == 'neologd':
        if not is_neologdn_valid:
            raise Exception("You could not call neologd dictionary bacause you do NOT install the package neologdn.")
        return neologdn.normalize(normalize_text_normal_ipadic(without_new_line))

    return normalize_text_normal_ipadic(without_new_line, kana=is_kana, ascii=is_ascii, digit=is_digit)


def normalize_text_normal_ipadic(input_text, kana=True, ascii=True, digit=True):
    # type: (text_type,bool,bool,bool)->text_type
    """Run jaconv.h2z on input_text.

    * Params
    - kana, ascii, digit: forwarded to jaconv.h2z under the same names;
      each one switches the conversion of that character class on or off.
    """
    converted_text = jaconv.h2z(
        input_text,
        kana=kana,
        ascii=ascii,
        digit=digit
    )
    return converted_text


================================================
FILE: JapaneseTokenizer/common/timeout_handler.py
================================================
#! -*- coding: utf-8 -*-
from functools import wraps


class TimeoutException(Exception):
    """Raised by the default handler when a decorated call exceeds its limit."""


def handler_func(msg):
    """Default timeout handler: raise TimeoutException carrying `msg`."""
    # The message used to be dropped, which made the exception uninformative.
    raise TimeoutException(msg)


def on_timeout(limit, handler=handler_func, hint=None):
    """Decorator factory which limits the running time of a function.

    If the decorated function does not finish within `limit` seconds,
    `handler` is called with a message built from `hint` and `limit`.

    @on_timeout(limit=3600, handler=notify_func, hint=u'long calculation')
    def long_time_function():

    * Parameters
    - limit: time limit in seconds, passed to signal.alarm.
    - handler: callable taking the message; the default raises TimeoutException.
    - hint: label of the function which is put into the message.
    """
    def notify_handler(signum, frame):
        handler("'%s' is not finished in %d second(s)." % (hint, limit))

    def __decorator(function):
        @wraps(function)
        def __wrapper(*args, **kwargs):
            import signal
            signal.signal(signal.SIGALRM, notify_handler)
            signal.alarm(limit)
            try:
                return function(*args, **kwargs)
            finally:
                # Cancel the alarm even when `function` raises; otherwise the
                # pending SIGALRM fires later in unrelated code.
                signal.alarm(0)
        return __wrapper
    return __decorator


================================================
FILE: JapaneseTokenizer/datamodels.py
================================================
#! -*- coding: utf-8 -*-
# normalize module #
from JapaneseTokenizer.common.text_preprocess import normalize_text, denormalize_text
# datamodels #
from MeCab import Node
# typing #
from typing import List, Union, Any, Tuple, Dict, Callable, Optional
from future.utils import text_type, string_types
import sys
import six
__author__ = 'kensuke-mi'

python_version = sys.version_info


def __is_sotpwords(token, stopwords):
    """Tell whether token is one of the stopwords.

    Returns True if token is in stopwords, False otherwise.
    """
    return token in stopwords


def __is_valid_pos(pos_tuple, valid_pos):
    # type: (Tuple[text_type,...],List[Tuple[text_type,...]])->bool
    """Check whether the POS of a token matches one of the allowed POS tuples.

    An allowed tuple matches when it equals the leading elements of pos_tuple,
    so ('a',) matches ('a', 'b'). Returns True if any allowed tuple matches,
    False otherwise (also when valid_pos is empty).
    """
    prefix_matches = [
        allowed == pos_tuple[:len(allowed)]
        for allowed in valid_pos
    ]
    return any(prefix_matches)


def filter_words(tokenized_obj, valid_pos, stopwords, check_field_name='stem'):
    # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type],text_type) -> FilteredObject
    """Keep only the tokens which pass the POS condition and the stopword list.

    * Input
    - tokenized_obj
        - TokenizedSenetence whose tokens are filtered.
    - valid_pos
        - List of Tuple which has POS element to keep.
        - Keep in your mind, each tokenizer has different POS structure.
         >>> [('名詞', '固有名詞'), ('動詞', )]
    - stopwords
        - List of str, which you'd like to remove
        >>> ['残念', '今日']
    - check_field_name
        - 'stem' compares the stem against stopwords; any other value
          compares the surface.

    * Note
    - When both valid_pos and stopwords are empty, no token is kept.
    """
    assert isinstance(tokenized_obj, TokenizedSenetence)
    assert isinstance(valid_pos, list)
    assert isinstance(stopwords, list)

    use_pos_filter = valid_pos != []
    use_stopword_filter = stopwords != []

    kept_tokens = []
    for token_obj in tokenized_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        if check_field_name == 'stem':
            checked_word = token_obj.word_stem
        else:
            checked_word = token_obj.word_surface
        is_stopword = __is_sotpwords(checked_word, stopwords)
        has_valid_pos = __is_valid_pos(token_obj.tuple_pos, valid_pos)

        if use_pos_filter and use_stopword_filter:
            # both conditions are ON
            is_kept = is_stopword is False and has_valid_pos
        elif use_pos_filter:
            # only pos filtering is ON
            is_kept = has_valid_pos
        elif use_stopword_filter:
            # only stopwords filtering is ON
            is_kept = is_stopword is False
        else:
            # no condition at all: nothing is kept
            is_kept = False

        if is_kept:
            kept_tokens.append(token_obj)

    return FilteredObject(
        sentence=tokenized_obj.sentence,
        tokenized_objects=kept_tokens,
        pos_condition=valid_pos,
        stopwords=stopwords
    )


class TokenizedResult(object):
    """Container of one token: surface, stem, POS tuple and extra information."""

    def __init__(self,
                 node_obj,
                 tuple_pos,
                 word_stem,
                 word_surface,
                 is_feature=True,
                 is_surface=False,
                 misc_info=None,
                 analyzed_line=None):
        # type: (Optional[Node], Tuple[text_type, ...], str, str, bool, bool, Optional[Dict[str, Any]], str)->None
        """* Parameters
        - node_obj: MeCab Node or None.
        - tuple_pos: tuple of POS elements; a plain string is replaced by ('*', ).
        - word_stem: stem form of the token.
        - word_surface: surface form of the token.
        - is_feature, is_surface, misc_info, analyzed_line: stored as given.
        """
        assert isinstance(node_obj, (Node, type(None)))
        assert isinstance(tuple_pos, (string_types, tuple))
        assert isinstance(word_stem, (string_types))
        assert isinstance(word_surface, text_type)
        assert isinstance(misc_info, (type(None), dict))

        if isinstance(tuple_pos, tuple):
            pos_as_tuple = tuple_pos
        elif isinstance(tuple_pos, string_types):
            # A bare string means that no POS tuple was available.
            pos_as_tuple = ('*', )
        else:
            raise Exception('Error while parsing feature object. {}'.format(tuple_pos))

        self.node_obj = node_obj
        self.tuple_pos = pos_as_tuple
        self.word_stem = word_stem
        self.word_surface = word_surface
        self.is_feature = is_feature
        self.is_surface = is_surface
        self.misc_info = misc_info
        self.analyzed_line = analyzed_line


class TokenizedSenetence(object):
    """A sentence together with the list of its TokenizedResult objects."""

    def __init__(self, sentence, tokenized_objects, string_encoding='utf-8'):
        # type: (text_type, List[TokenizedResult], text_type)->None
        """* Parameters
        - sentence: sentence
        - tokenized_objects: list of TokenizedResult object
        - string_encoding: Encoding type of string type. This option is used only under python2.x
        """
        assert isinstance(sentence, text_type)
        assert isinstance(tokenized_objects, list)

        self.sentence = sentence
        self.tokenized_objects = tokenized_objects
        self.string_encoding = string_encoding

    def __extend_token_object(self, token_object,
                              is_denormalize=True,
                              func_denormalizer=denormalize_text):
        # type: (TokenizedResult,bool,Callable[[str],str])->Tuple
        """Turn one token object into its output form.

        The word is the surface if token_object.is_surface is True, else the
        stem; it goes through func_denormalizer when is_denormalize is set.
        The result is (word, tuple_pos) if token_object.is_feature is True,
        else the word alone.
        """
        assert isinstance(token_object, TokenizedResult)

        if token_object.is_surface == True:
            word = token_object.word_surface
        else:
            word = token_object.word_stem

        if is_denormalize:
            word = func_denormalizer(word)

        if token_object.is_feature == True:
            return (word, token_object.tuple_pos)
        return word

    def convert_list_object(self,
                            is_denormalize=True,
                            func_denormalizer=denormalize_text):
        # type: (bool,Callable[[str],str])->List[Union[str, Tuple[str,...]]]
        """* What you can do
        - You extract string object from TokenizedResult object

        * Args
        - is_denormalize: boolean object. True; it makes denormalize string
        - func_denormalizer: callable object. de-normalization function.
        """
        sentence_in_list_obj = []
        for token_object in self.tokenized_objects:
            sentence_in_list_obj.append(
                self.__extend_token_object(token_object, is_denormalize, func_denormalizer)
            )

        return sentence_in_list_obj

    def __convert_string_type(self, p_c_tuple):
        # type: (Tuple[text_type,...])->Tuple[text_type]
        """* What you can do
        - Under python2.x it decodes str elements with self.string_encoding;
          every other element is kept as it is.

        * Raises
        - Exception: p_c_tuple is not a tuple, or an element has an
          unexpected type under python2.x.
        """
        if not isinstance(p_c_tuple, tuple):
            raise Exception('Pos condition expects tuple of string. However = {}'.format(p_c_tuple))

        unified = []
        for pos_element in p_c_tuple:
            if six.PY2 and isinstance(pos_element, str):
                # str into unicode if python2.x
                unified.append(pos_element.decode(self.string_encoding))
            elif (six.PY2 and isinstance(pos_element, text_type)) or six.PY3:
                unified.append(pos_element)
            else:
                raise Exception()

        return tuple(unified)

    def __check_pos_condition(self, pos_condistion):
        # type: (List[Tuple[text_type, ...]])->List[Tuple[text_type, ...]]
        """* What you can do
        - Check your pos condition
        - It converts character type into unicode if python version is 2.x
        """
        assert isinstance(pos_condistion, list)

        checked_condition = []
        for p_c_tuple in pos_condistion:
            checked_condition.append(self.__convert_string_type(p_c_tuple))
        return checked_condition

    def filter(self,
               pos_condition=None,
               stopwords=None,
               is_normalize=True,
               func_normalizer=normalize_text,
               check_field_name='stem'):
        # type: (List[Tuple[text_type,...]], List[text_type], bool, Callable[[text_type], text_type],text_type)->FilteredObject
        """* What you can do
        - It filters out token which does NOT meet the conditions (stopwords & part-of-speech tag)
        - Under python2.x, pos_condition & stopwords are converted into unicode type.

        * Parameters
        - pos_condition: list of part-of-speech(pos) condition. Each condition is a tuple of variable length.
        You can specify hierarchical structure of pos condition with variable tuple.
        The hierarchy of pos condition follows definition of dictionary.
            - For example, in mecab you can take words with 名詞 if ('名詞',)
            - For example, in mecab you can take words with 名詞-固有名詞 if ('名詞', '固有名詞')
        - stopwords: list of word which you would like to remove
        - is_normalize: Boolean flag for normalize stopwords.
        - func_normalizer: Function object for normalization. The function object must be the same one as when you use tokenize.
        - check_field_name: Put field name to check if stopword or NOT. Kytea does not have stem form of word, put 'surface' instead.

        * Example
        >>> pos_condition = [('名詞', '一般'), ('形容詞', '自立'), ('助詞', '格助詞', '一般')]
        >>> stopwords = ['これ', 'それ']
        """
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))

        if stopwords is None:
            s_words = []
        else:
            if six.PY2 and all((isinstance(s, str) for s in stopwords)):
                # under python2.x, from str into unicode
                candidate_words = [s.decode(self.string_encoding) for s in stopwords]
            else:
                candidate_words = stopwords

            if is_normalize:
                s_words = [func_normalizer(s) for s in candidate_words]
            else:
                s_words = candidate_words

        p_condition = [] if pos_condition is None else self.__check_pos_condition(pos_condition)

        filtered_object = filter_words(
            tokenized_obj=self,
            valid_pos=p_condition,
            stopwords=s_words,
            check_field_name=check_field_name
        )
        assert isinstance(filtered_object, FilteredObject)

        return filtered_object


class FilteredObject(TokenizedSenetence):
    """TokenizedSenetence which also records the conditions used for filtering."""

    def __init__(self, sentence, tokenized_objects, pos_condition, stopwords):
        # type: (str, List[TokenizedResult], List[Tuple[text_type, ...]], List[str])->None
        """* Parameters
        - sentence, tokenized_objects: passed to TokenizedSenetence.
        - pos_condition: POS condition, stored as it is.
        - stopwords: stopword list, stored as it is.
        """
        super(FilteredObject, self).__init__(sentence, tokenized_objects)
        self.stopwords = stopwords
        self.pos_condition = pos_condition


================================================
FILE: JapaneseTokenizer/init_logger.py
================================================
LOGGER_NAME = 'JapaneseTokenizer'

import logging
import sys
from logging import getLogger, Formatter, Logger, StreamHandler

# Formatter
custmoFormatter = Formatter(
    fmt='[%(asctime)s]%(levelname)s - %(filename)s#%(funcName)s:%(lineno)d: %(message)s',
    # '%Y' needs its percent sign; a bare 'Y' printed a literal "Y" instead
    # of the year.
    datefmt='%Y/%m/%d %H:%M:%S'
)

# StreamHandler: every record (DEBUG and above) goes to stderr.
STREAM_LEVEL = logging.DEBUG
STREAM_FORMATTER = custmoFormatter
STREAM = sys.stderr

st_handler = StreamHandler(stream=STREAM)
st_handler.setLevel(STREAM_LEVEL)
st_handler.setFormatter(STREAM_FORMATTER)


def init_logger(logger):
    # type: (logging.Logger) -> logging.Logger
    """Attach the module's stream handler to `logger` and return it.

    Propagation to ancestor loggers is switched off.
    """
    logger.propagate = False
    logger.addHandler(st_handler)
    return logger


================================================
FILE: JapaneseTokenizer/juman_wrapper/__init__.py
================================================
__author__ = 'kensuke-mi'
from .juman_wrapper import JumanWrapper


================================================
FILE: JapaneseTokenizer/juman_wrapper/juman_wrapper.py
================================================
# -*- coding: utf-8 -*-
# package module
from JapaneseTokenizer.object_models import WrapperBase
from JapaneseTokenizer.common import text_preprocess
from JapaneseTokenizer.datamodels import FilteredObject, TokenizedResult, TokenizedSenetence
from JapaneseTokenizer import init_logger
from JapaneseTokenizer.common.sever_handler import JumanppHnadler
# else
from typing import List, Union, Callable, Tuple
from six import text_type
from pyknp import MList
import logging
import sys
import os
import six

logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
__author__ = 'kensuke-mi'

python_version = sys.version_info

# pyknp is optional for this wrapper: only warn if the import fails.
try:
    import pyknp
except ImportError:
    logger.warning('pyknp is not ready to use. Install first if you would like to use pyknp wrapper.')

if six.PY3:
    import socket
    import re

    class MonkeyPatchSocket(object):
        """* Class for overwriting pyknp.Socket because it is only for python2.x"""
        def __init__(self, hostname, port, option=None):
            """Connect to the server, send `option` if given and wait for "OK".

            * Parameters
            - hostname, port: address of the server.
            - option: bytes sent right after connecting, or None.

            * Raises
            - ConnectionError: the server closed the connection before "OK".
            """
            # Set first, so that __del__ is safe even if socket() fails.
            self.sock = None
            self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self.sock.connect((hostname, port))
            if option is not None:
                # sendall: send() may transmit only a part of the data.
                self.sock.sendall(option)
            data = b""
            while b"OK" not in data:
                chunk = self.sock.recv(1024)
                if not chunk:
                    # recv() returns b"" once the peer has closed the
                    # connection; looping on would never terminate.
                    raise ConnectionError("Connection closed before the server answered OK.")
                data += chunk

        def __del__(self):
            sock = getattr(self, "sock", None)
            if sock:
                sock.close()

        def query(self, sentence, pattern):
            # type: (str,str)->str
            """Send one sentence and read the answer until `pattern` appears.

            * Parameters
            - sentence: text to analyze.
            - pattern: regular expression which marks the end of the answer.

            * Returns
            - The received data, stripped and decoded as utf-8.

            * Raises
            - ConnectionError: the server closed the connection before the
              pattern was received.
            """
            assert(isinstance(sentence, six.text_type))
            sentence_bytes = sentence.encode('utf-8').strip()
            end_of_answer = re.compile(pattern.encode('utf-8'))

            self.sock.sendall(sentence_bytes + b"\n")
            recv = b""
            while True:
                data = self.sock.recv(1024)
                if not data:
                    # b"" means the peer closed the connection.
                    raise ConnectionError("Connection closed before the end pattern was received.")
                recv = recv + data
                if end_of_answer.search(recv):
                    break
            return recv.strip().decode('utf-8')


class JumanWrapper(WrapperBase):
    """Tokenizer wrapper around Juman.

    One of three backends is selected in ``__init__``:

    - server mode (``server`` is given): ``pyknp.Juman`` connected to a server,
    - pyknp mode (``is_use_pyknp=True``): ``pyknp.Juman`` running a local command,
    - handler mode (default): the project's ``JumanppHnadler`` process handler.
    """

    def __init__(self,
                 command='juman',
                 server=None,
                 port=32000,
                 timeout=30,
                 rcfile=None,
                 option='-e2 -B',
                 pattern='EOS',
                 is_use_pyknp=False,
                 **args):
        # type: (text_type, text_type, int, int, text_type, Union[bytes, text_type], Union[bytes, text_type], bool, **str)->None
        """* Class to call Juman tokenizer

        * Parameters
        - command: juman command name or path
        - server: hostname of a juman server. Server mode is used when given.
        - port: port number of the juman server
        - timeout: timeout value handed to the backend
        - rcfile: path to a juman rc file. It must exist when given.
        - option: command line option handed to the backend
        - pattern: pattern which marks the end of a juman result
        - is_use_pyknp: True to use pyknp as the local backend. It is forced to
          True on Windows.
        - args: extra keyword arguments passed through to pyknp.Juman

        * Raises
        - FileExistsError: rcfile is given but does not exist
        """
        self.timeout = timeout
        self.pattern = pattern
        self.option = option
        self.command = command
        # Always stored, so the attribute exists on every platform and the
        # Windows override below really affects the backend selection.
        self.is_use_pyknp = is_use_pyknp
        if rcfile is not None and not os.path.exists(rcfile):
            # NOTE(review): FileNotFoundError would describe this better, and
            # FileExistsError is not defined on Python 2. Kept unchanged so the
            # exception type seen by existing callers does not change.
            raise FileExistsError('rcfile does not exist at {}'.format(rcfile))
        if server is not None:
            # It converts from str into bytes only for sever mode #
            self.option = self.option.encode('utf-8')  # type: Union[str,bytes]
            self.pattern = self.pattern.encode('utf-8')  # type: Union[str,bytes]

        # check os #
        if os.name == 'nt':
            if not self.is_use_pyknp:
                logger.warning(msg='It forces is_use_pyknp = True on Windows.')
            self.is_use_pyknp = True

        if server is not None or self.is_use_pyknp:
            # server mode, or unix process with pyknp. Both use the same call;
            # pyknp receives the original (not encoded) option and pattern.
            self.juman = pyknp.Juman(command=command, server=server, port=port,
                                     timeout=self.timeout, rcfile=rcfile, option=option,
                                     pattern=pattern, jumanpp=False, **args)
            if server is not None and six.PY3:
                # It overwrites juman_lines() method #
                self.juman.juman_lines = self.__monkey_patch_juman_lines
        else:
            # use unix process with pexpect(RECOMMENDED) #
            self.juman = JumanppHnadler(jumanpp_command=command,
                                        option=self.option,
                                        pattern=self.pattern,
                                        timeout_second=self.timeout)

    def __del__(self):
        # Only the process handler backend is stopped explicitly.
        if hasattr(self, "juman"):
            if isinstance(self.juman, JumanppHnadler):
                self.juman.stop_process()

    def __monkey_patch_juman_lines(self, input_str):
        # type: (text_type)->text_type
        """* What you can do
        - It overwrites juman_line() method because this method causes TypeError in python3

        * Note
        - The socket is opened lazily on the first call.
        - In server mode the fixed option b"RUN -e2\\n" is sent, not self.option.
        """
        assert isinstance(self.juman, pyknp.Juman)
        if not self.juman.socket and not self.juman.subprocess:
            if self.juman.server is not None:
                self.juman.socket = MonkeyPatchSocket(self.juman.server, self.juman.port, b"RUN -e2\n")
            else:
                command = "%s %s" % (self.juman.command, self.juman.option)
                if self.juman.rcfile:
                    command += " -r %s" % self.juman.rcfile
                self.juman.subprocess = pyknp.Subprocess(command)
        if self.juman.socket:
            return self.juman.socket.query(input_str, pattern=self.juman.pattern)
        return self.juman.subprocess.query(input_str, pattern=self.juman.pattern)

    def __extract_morphological_information(self, mrph_object, is_feature, is_surface):
        # type: (pyknp.Morpheme, bool, bool)->TokenizedResult
        """This method extracts morphlogical information from token object.

        The surface form, the stem and a (hinsi, bunrui) POS tuple are copied
        from the morpheme; the remaining attributes are stored in misc_info.
        """
        assert isinstance(mrph_object, pyknp.Morpheme)
        assert isinstance(is_feature, bool)
        assert isinstance(is_surface, bool)

        misc_info = {
            'katuyou1': mrph_object.katuyou1,
            'katuyou2': mrph_object.katuyou2,
            'imis': mrph_object.imis,
            'repname': mrph_object.repname
        }

        return TokenizedResult(
            node_obj=None,
            tuple_pos=(mrph_object.hinsi, mrph_object.bunrui),
            word_stem=mrph_object.genkei,
            word_surface=mrph_object.midasi,
            is_feature=is_feature,
            is_surface=is_surface,
            misc_info=misc_info
        )

    def call_juman_interface(self, input_str):
        # type: (text_type)->MList
        """Analyze input_str with the configured backend and return a pyknp MList.

        With the process handler backend, a UnicodeDecodeError triggers one
        restart of the process followed by one retry.
        """
        if isinstance(self.juman, pyknp.Juman):
            return self.juman.analysis(input_str)
        elif isinstance(self.juman, JumanppHnadler):
            try:
                result_analysis = self.juman.query(input_str)
            except UnicodeDecodeError:
                logger.warning(msg="Process is down by some reason. It restarts process automatically.")
                self.juman.restart_process()
                result_analysis = self.juman.query(input_string=input_str)
            return MList(result_analysis)
        else:
            raise Exception('Not defined.')

    def tokenize(self,
                 sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[text_type], text_type])->Union[List[text_type], TokenizedSenetence]
        """This method returns tokenized result.
        If return_list==True, this method returns the result of TokenizedSenetence.convert_list_object().
        If return_list==False(default), this method returns TokenizedSenetence object.

        * Parameters
        - sentence: text to tokenize
        - normalize: If True, func_normalizer is applied to the sentence before tokenizing.
        - is_feature, is_surface: flags stored in every TokenizedResult
        - func_normalizer: function used for normalization
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, text_type)
        # The normalize flag used to be ignored; it is honored now, in the
        # same way as JumanppWrapper.tokenize does.
        if normalize:
            normalized_sentence = func_normalizer(sentence)
        else:
            normalized_sentence = sentence
        result = self.call_juman_interface(normalized_sentence)

        token_objects = [
            self.__extract_morphological_information(
                mrph_object=morph_object,
                is_surface=is_surface,
                is_feature=is_feature
            )
            for morph_object in result]

        # The original (not normalized) sentence is kept in the result object.
        tokenized_objects = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=token_objects)
        if return_list:
            return tokenized_objects.convert_list_object()
        return tokenized_objects

    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
        # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type])->FilteredObject
        """Delegate to TokenizedSenetence.filter() after checking argument types."""
        assert isinstance(parsed_sentence, TokenizedSenetence)
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))

        return parsed_sentence.filter(pos_condition, stopwords)


================================================
FILE: JapaneseTokenizer/jumanpp_wrapper/__init__.py
================================================
from .jumanpp_wrapper import JumanppWrapper

================================================
FILE: JapaneseTokenizer/jumanpp_wrapper/jumanpp_wrapper.py
================================================
#! -*- coding: utf-8 -*-
from pyknp import Juman
from pyknp import MList
# modules
from JapaneseTokenizer.object_models import WrapperBase
from JapaneseTokenizer.common import text_preprocess, juman_utils
from JapaneseTokenizer.common.sever_handler import JumanppHnadler, ProcessDownException
from JapaneseTokenizer import init_logger
from JapaneseTokenizer.datamodels import FilteredObject, TokenizedSenetence
from typing import List, Dict, Tuple, Union, TypeVar, Any, Callable
# timeout
from JapaneseTokenizer.common.timeout_handler import on_timeout
from six import text_type
import logging
import sys
import socket
import six
import re
import os
__author__ = 'kensuke-mi'

# Module logger, obtained through the project's init_logger module.
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
# Interpreter version captured once at import time.
python_version = sys.version_info
# NOTE(review): the TypeVar is named 'T' while it is bound to ContentsTypes;
# type checkers usually expect both names to match.
ContentsTypes = TypeVar('T')

# Soft dependency check: log a warning instead of failing when pyknp is missing.
# NOTE(review): `from pyknp import Juman` above is unconditional, so a missing
# pyknp presumably raises before this guard is ever reached -- confirm.
try:
    import pyknp
except ImportError:
    logger.warning(msg='pyknp is not ready to use. Install first if you would like to use pyknp wrapper.')


if six.PY2:
    ConnectionRefusedError = Exception 
    class JumanppClient(object):
        """Class for receiving data as client (Python 2 variant)."""
        def __init__(self, hostname, port, timeout=50, option=None):
            # type: (text_type, int, int, Union[bytes, None])->None
            """Connect to a jumanpp server.

            :param hostname: host where the jumanpp server runs
            :param port: port number; a text value is converted with int()
            :param timeout: socket timeout in seconds, set after connecting
            :param option: data sent right after connecting; nothing is sent if None
            :raises Exception: the socket could not be created or connected
            """
            try:
                self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                if isinstance(port, text_type):
                    port = int(port)
                self.sock.connect((hostname, port))
            except Exception:
                # `except Exception` (not a bare except) so that
                # KeyboardInterrupt and SystemExit are not rewritten.
                raise Exception("There is no jumanpp server hostname={}, port={}".format(hostname, port))
            if option is not None:
                # sendall() keeps writing until the whole option is transmitted.
                self.sock.sendall(option)
            self.sock.settimeout(timeout)

        def __del__(self):
            # getattr: `sock` is missing when socket creation itself failed.
            sock = getattr(self, "sock", None)
            if sock is not None:
                sock.close()

        def query(self, sentence, pattern):
            # type: (text_type, bytes) -> text_type
            """Send one sentence and return the decoded server reply.

            :param sentence: text to analyze; must be a text (unicode) string
            :param pattern: regular expression marking the end of the reply
            :return: the reply, stripped and decoded as UTF-8
            :raises socket.error: the server closed the connection before the
                pattern was received
            """
            assert (isinstance(sentence, six.text_type))
            self.sock.sendall("%s\n" % sentence.encode('utf-8').strip())
            recv = self.sock.recv(1024)
            assert isinstance(recv, bytes)
            while not re.search(pattern, recv):
                data = self.sock.recv(1024)
                if not data:
                    # recv() returns an empty string once the peer has closed
                    # the connection; without this check the loop never ends.
                    raise socket.error("Connection closed before the end pattern was received")
                recv += data
            return recv.strip().decode('utf-8')

else:
    class JumanppClient(object):
        """Class for receiving data as client (Python 3 variant)."""
        def __init__(self, hostname, port, timeout=50, option=None):
            # type: (text_type, int, int, Union[bytes, None])->None
            """Connect to a jumanpp server.

            :param hostname: host where the jumanpp server runs
            :param port: port number; a str value is converted with int()
            :param timeout: socket timeout in seconds, set after connecting
            :param option: bytes sent right after connecting; nothing is sent if None
            :raises Exception: the server refused the connection
            """
            self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            if isinstance(port, str):
                port = int(port)
            try:
                self.sock.connect((hostname, port))
            except ConnectionRefusedError:
                # Release the descriptor right away; do not wait for __del__.
                self.sock.close()
                raise Exception("There is no jumanpp server hostname={}, port={}".format(hostname, port))
            if option is not None:
                # sendall() keeps writing until the whole option is transmitted.
                self.sock.sendall(option)
            self.sock.settimeout(timeout)

        def __del__(self):
            # getattr: `sock` is missing when socket creation itself failed.
            sock = getattr(self, "sock", None)
            if sock is not None:
                sock.close()

        def query(self, sentence, pattern):
            # type: (str, Union[str,bytes]) -> str
            """Send one sentence and return the decoded server reply.

            :param sentence: text to analyze; must be ``str``
            :param pattern: regular expression marking the end of the reply;
                a str pattern is encoded as UTF-8
            :return: the reply, stripped and decoded as UTF-8
            :raises ConnectionError: the server closed the connection before the
                pattern was received
            """
            # This variant only exists on Python 3, where six.text_type is str.
            assert isinstance(sentence, str)
            if isinstance(pattern, str):
                pattern = pattern.encode('utf-8')
            self.sock.sendall(sentence.encode('utf-8').strip() + b"\n")
            recv = self.sock.recv(1024)
            assert isinstance(recv, bytes)
            while not re.search(pattern, recv):
                data = self.sock.recv(1024)
                if not data:
                    # recv() returns b"" once the peer has closed the
                    # connection; without this check the loop never ends.
                    raise ConnectionError("Connection closed before the end pattern was received")
                recv += data
            return recv.strip().decode('utf-8')


class JumanppWrapper(WrapperBase):
    """Tokenizer wrapper around Juman++."""

    def __init__(self,
                 command='jumanpp',
                 timeout=30,
                 pattern=r'EOS',
                 server=None,
                 port=12000,
                 is_use_pyknp=False,
                 **args):
        # type: (text_type, int, text_type, text_type, int, bool, **Any) -> None
        """* What you can do
        - You choose the backend which runs jumanpp.
            - jumanpp-server: used when `server` is given. It talks to a jumanpp server through JumanppClient.
            - jumanpp-pyknp: used when `server` is None and `is_use_pyknp` is True. It calls pyknp.Juman.
            - jumanpp-pexpect: used otherwise. It runs a local jumanpp through JumanppHnadler.

        * Parameters
        - command: jumanpp command for the local backends
        - timeout: Time to wait from jumanpp process.
        - pattern: pattern which marks the end of a jumanpp result
        - server: hostname where jumanpp is running
        - port: port number where jumanpp is running
        - is_use_pyknp: True to use pyknp, False to use pexpect. It is forced to True on Windows.
        - args: extra keyword arguments handed to pyknp.Juman (pyknp backend only)
        """
        self.eos_pattern = pattern
        self.is_use_pyknp = is_use_pyknp

        # Warm-up sentence for the pexpect backend, stored as a text string.
        dummy_literal = 'これはダミーテキストです'
        if six.PY2:
            self.dummy_text = dummy_literal.decode('utf-8')
        elif six.PY3:
            self.dummy_text = dummy_literal

        if server is not None:
            pattern = pattern.encode('utf-8')

        if os.name == 'nt':
            # It forces to use pyknp if it runs on Windows.
            if not self.is_use_pyknp:
                logger.warning(msg="You're not able to use pexpect in Windows. It forced to set is_use_pyknp = True")
            self.is_use_pyknp = True

        if server is not None:
            # jumanpp-server #
            self.jumanpp_obj = JumanppClient(hostname=server, port=port, timeout=timeout)
        elif self.is_use_pyknp:
            # jumanpp-pyknp #
            logger.debug('jumanpp wrapper is initialized with pyknp package')
            self.jumanpp_obj = Juman(command=command, timeout=timeout, pattern=pattern, jumanpp=True, **args)
        else:
            # jumanpp-pexpect #
            logger.debug('jumanpp wrapper is initialized with pexpect unix handler')
            self.jumanpp_obj = JumanppHnadler(jumanpp_command=command,
                                              timeout_second=timeout,
                                              pattern=pattern)
            # put dummy sentence to avoid exception just after command initialization #
            self.jumanpp_obj.query(self.dummy_text)

    def __del__(self):
        # Nothing to release when __init__ failed before creating a backend.
        if not hasattr(self, "jumanpp_obj"):
            return
        if isinstance(self.jumanpp_obj, JumanppClient):
            self.jumanpp_obj.sock.close()
        elif isinstance(self.jumanpp_obj, JumanppHnadler):
            self.jumanpp_obj.stop_process()
        else:
            del self.jumanpp_obj

    def call_juman_interface(self, input_str):
        # type: (text_type) -> MList
        """* What you can do
        - You call the Juman++ backend selected in __init__.
        - With the pexpect backend, the process is restarted and the query is
          repeated once after ProcessDownException or UnicodeDecodeError.

        * Output
        - pyknp.MList
        """
        if isinstance(self.jumanpp_obj, Juman):
            return self.jumanpp_obj.analysis(input_str=input_str)

        if isinstance(self.jumanpp_obj, JumanppHnadler):
            try:
                raw_result = self.jumanpp_obj.query(input_string=input_str)
            except ProcessDownException:
                # Unix process is down by any reason.
                logger.warning("Re-starting unix process because it takes longer time than {} seconds...".format(self.jumanpp_obj.timeout_second))
                self.jumanpp_obj.restart_process()
                self.jumanpp_obj.query(self.dummy_text)
                return MList(self.jumanpp_obj.query(input_string=input_str))
            except UnicodeDecodeError:
                logger.warning(msg="Process is down by some reason. It restarts process automatically.")
                self.jumanpp_obj.restart_process()
                self.jumanpp_obj.query(self.dummy_text)
                return MList(self.jumanpp_obj.query(input_string=input_str))
            return MList(raw_result)

        if isinstance(self.jumanpp_obj, JumanppClient):
            return MList(self.jumanpp_obj.query(sentence=input_str, pattern=self.eos_pattern))

        raise Exception('Not defined')

    @on_timeout(limit=60)
    def tokenize(self, sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[text_type], text_type]) -> Union[TokenizedSenetence, List[text_type]]
        """* What you can do
        - You tokenize a sentence with Juman++.

        * Parameters
        - sentence: text to tokenize
        - normalize: If True, func_normalizer is applied to the sentence first.
        - is_feature, is_surface: flags handed to juman_utils.extract_morphological_information
        - return_list: If True, the result of convert_list_object() is returned.
        - func_normalizer: function used for normalization

        * Output
        - TokenizedSenetence object, or its list form if return_list is True
        """
        target_text = func_normalizer(sentence) if normalize else sentence
        morphemes = self.call_juman_interface(target_text)

        token_objects = []
        for morpheme in morphemes:
            token_objects.append(
                juman_utils.extract_morphological_information(
                    mrph_object=morpheme,
                    is_surface=is_surface,
                    is_feature=is_feature))

        # The original (not normalized) sentence is kept in the result object.
        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=token_objects)
        if return_list:
            return tokenized_sentence.convert_list_object()
        return tokenized_sentence

    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
        # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type]) -> FilteredObject
        """Check the argument types, then delegate to TokenizedSenetence.filter()."""
        assert isinstance(parsed_sentence, TokenizedSenetence)
        optional_list = (type(None), list)
        assert isinstance(pos_condition, optional_list)
        assert isinstance(stopwords, optional_list)

        return parsed_sentence.filter(pos_condition, stopwords)


================================================
FILE: JapaneseTokenizer/kytea_wrapper/__init__.py
================================================
__author__ = 'kensuke-mi'
from .kytea_wrapper import KyteaWrapper

================================================
FILE: JapaneseTokenizer/kytea_wrapper/kytea_wrapper.py
================================================
# -*- coding: utf-8 -*-
from JapaneseTokenizer.object_models import WrapperBase
from JapaneseTokenizer.common import text_preprocess
from JapaneseTokenizer.datamodels import FilteredObject, TokenizedResult, TokenizedSenetence
from JapaneseTokenizer import init_logger
from typing import List, Tuple, Any, Union, Callable
from six import text_type, string_types
import logging
import sys
import six

# Module logger, obtained through the project's init_logger module.
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
# Interpreter version captured once at import time.
python_version = sys.version_info


# Soft dependency check: the module still imports when Mykytea is missing and
# only logs a warning. The Mykytea name stays undefined in that case.
try:
    import Mykytea
except ImportError:
    logger.warning(msg='Mykytea is not ready to use yet. Install first if you would like to use kytea wrapper.')

__author__ = 'kensuke-mi'


class KyteaWrapper(WrapperBase):
    """Tokenizer wrapper around Kytea (through the Mykytea binding)."""

    def __init__(self,
                 option_string='-deftag UNKNOWN!!'):
        # type: (string_types)->None
        """Create the Mykytea object.

        :param option_string: argument string handed to Mykytea.Mykytea
        """
        # option string is argument of Kytea.
        assert isinstance(option_string, string_types)
        self.kytea = Mykytea.Mykytea(option_string)

    def __list_tags(self, t):
        """Convert the result of getTags() into plain python objects.

        Each word becomes a tuple of (surface, tags). Tags is a list of lists
        whose elements are the first two items of each tag candidate.
        """
        return [
            (word.surface, [[(t2[0], t2[1]) for t2 in t1] for t1 in word.tag])
            for word in t]

    def __check_char_set(self, input_char):
        # type: (text_type) -> text_type
        """Return input_char as a text string.

        Python 2 byte strings are decoded as UTF-8.

        :raises Exception: input_char is neither a byte string nor a text string
        """
        if six.PY2 and isinstance(input_char, str):
            return input_char.decode('utf-8')
        elif isinstance(input_char, text_type):
            return input_char
        else:
            raise Exception('nor unicode, str')

    def __extract_morphological_information(self, kytea_tags_tuple, is_feature):
        # type: (Tuple[text_type,List[Any]], bool) -> TokenizedResult
        """This method extracts morphlogical information from token object.

        The first tag list is read as POS and the second one as yomi. Only the
        first candidate of each list is used. is_surface is always True in the
        returned object because kytea does not give a word stem.
        """
        assert isinstance(kytea_tags_tuple, tuple)
        assert isinstance(is_feature, bool)

        surface = self.__check_char_set(kytea_tags_tuple[0])
        # NOTE: kytea does NOT show word stem. Put blank string instead.
        if six.PY2:
            word_stem = ''.decode('utf-8')
        else:
            word_stem = ''

        pos_tuple = kytea_tags_tuple[1][0]
        pos = self.__check_char_set(pos_tuple[0][0])
        pos_score = float(pos_tuple[0][1])

        yomi_tuple = kytea_tags_tuple[1][1]
        yomi = self.__check_char_set(yomi_tuple[0][0])
        yomi_score = float(yomi_tuple[0][1])

        misc_info = {
            'pos_score': pos_score,
            'pos': pos,
            'yomi': yomi,
            'yomi_score': yomi_score
        }

        return TokenizedResult(
            node_obj=None,
            tuple_pos=(pos, ),
            word_stem=word_stem,
            word_surface=surface,
            is_feature=is_feature,
            is_surface=True,
            misc_info=misc_info
        )

    def call_kytea_tokenize_api(self, sentence):
        """Return the result of Mykytea getTagsToString() for the sentence.

        The result is asserted to be a text string.
        """
        result = self.kytea.getTagsToString(sentence)
        assert isinstance(result, text_type)

        return result

    def tokenize(self, sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[str],str]) -> Union[List[str], TokenizedSenetence]
        """This method returns tokenized result.
        If return_list==True, this method returns the result of TokenizedSenetence.convert_list_object().
        If return_list==False(default), this method returns TokenizedSenetence object.

        :param sentence: text to tokenize
        :param normalize: if True, func_normalizer is applied to the sentence first
        :param is_feature: flag stored in every TokenizedResult
        :param is_surface: not used; every token is created with is_surface=True
        :param func_normalizer: function used for normalization
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, text_type)
        # The normalize flag used to be ignored; it is honored now, in the
        # same way as JumanppWrapper.tokenize does.
        if normalize:
            normalized_sentence = func_normalizer(sentence)
        else:
            normalized_sentence = sentence
        if six.PY2:
            # On Python 2 the text is handed to Mykytea as UTF-8 bytes.
            normalized_sentence = normalized_sentence.encode('utf-8')

        result = self.__list_tags(self.kytea.getTags(normalized_sentence))

        token_objects = [
            self.__extract_morphological_information(
                kytea_tags_tuple=kytea_tags,
                is_feature=is_feature
            )
            for kytea_tags in result]

        # The original (not normalized) sentence is kept in the result object.
        tokenized_objects = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=token_objects)
        if return_list:
            return tokenized_objects.convert_list_object()
        return tokenized_objects

    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
        # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type]) -> FilteredObject
        """Delegate to TokenizedSenetence.filter() with check_field_name='surface'."""
        assert isinstance(parsed_sentence, TokenizedSenetence)
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))

        return parsed_sentence.filter(pos_condition, stopwords, check_field_name='surface')


================================================
FILE: JapaneseTokenizer/mecab_wrapper/__init__.py
================================================
__author__ = 'kensuke-mi'
from .mecab_wrapper import MecabWrapper


================================================
FILE: JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py
================================================
#! -*- coding: utf-8 -*-
# core module
from JapaneseTokenizer.object_models import WrapperBase
from JapaneseTokenizer.common.text_preprocess import normalize_text
from JapaneseTokenizer import init_logger
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
import MeCab
# else
import sys
import os
import logging
import subprocess
import six
from six import text_type
# typing
from typing import List, Tuple, Union, TypeVar, Callable
# NOTE(review): the TypeVar is named 'T' while it is bound to ContentsTypes;
# type checkers usually expect both names to match.
ContentsTypes = TypeVar('T')

__author__ = 'kensuke-mi'

# Module logger, obtained through the project's init_logger module.
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
# Interpreter version captured once at import time.
python_version = sys.version_info

# Soft dependency check: is_neologdn_valid records whether neologdn imported.
# Only ImportError is handled, so KeyboardInterrupt and SystemExit raised
# during the import are no longer swallowed.
try:
    import neologdn
    is_neologdn_valid = True
except ImportError:
    logger.warning("neologdn package is not installed yet. You could not call neologd dictionary.")
    is_neologdn_valid = False


class MecabWrapper(WrapperBase):
    """Wrapper class of the MeCab tokenizer.

    The constructor builds a ``MeCab.Tagger`` with the requested dictionary
    (and, optionally, a user dictionary compiled from a csv file).
    ``tokenize`` returns the tokenized sentence and ``filter`` filters it.
    """

    def __init__(self,
                 dictType,
                 pathUserDictCsv=None,
                 path_mecab_config=None,
                 path_dictionary=None,
                 string_encoding='utf-8'):
        # type: (text_type, text_type, text_type, text_type, text_type)->None
        """

        :param dictType: a dictionary type called by mecab.
            "neologd", "ipadic" ("ipaddic") and "jumandic" select a dictionary under the mecab dictionary directory.
            Any other accepted value ("", None, "unidic") initializes mecab without "-d" option.
            "all" and "user" are deprecated and raise Exception.
        :param pathUserDictCsv: path to your original dictionary file
        :param path_mecab_config: path to 'mecab_config' command. It's automatically detected if not give
        :param path_dictionary: path to a dictionary which you want to use. If not given, it's automatically detected
        :param string_encoding: encoding option to parse command line result. This is mainly used for python2.x
        :raises AssertionError: if dictType is unknown, or a given path does not exist
        :raises Exception: if a deprecated dictType is given
        """
        # Validate all arguments before any external command runs,
        # so that an invalid input never triggers a dictionary compilation or a mecab initialization.
        assert dictType in ["neologd", "all", "ipadic", "ipaddic", "user", "", "jumandic", "unidic", None], \
            'Dictionary Type Error. Your dict = {} is NOT available.'.format(dictType)
        if dictType == 'all':
            logger.error('dictionary type "all" is deprecated from version1.6')
            raise Exception('dictionary type "all" is deprecated from version1.6')
        if dictType == 'user':
            logger.error('dictionary type "user" is deprecated from version1.6. You just give path to dictionary csv.')
            raise Exception('dictionary type "user" is deprecated from version1.6. You just give path to dictionary csv.')

        if pathUserDictCsv is not None and isinstance(pathUserDictCsv, text_type) and pathUserDictCsv != '':
            assert os.path.exists(pathUserDictCsv), \
                'Your user dictionary does NOT exist. Path={}'.format(pathUserDictCsv)

        self.string_encoding = string_encoding
        self._dictType = dictType
        self._pathUserDictCsv = pathUserDictCsv
        self._path_dictionary = path_dictionary
        if path_mecab_config is None:
            self._path_mecab_config = self.__get_path_to_mecab_config()
        else:
            self._path_mecab_config = path_mecab_config

        if self._path_dictionary is not None:
            assert os.path.exists(self._path_dictionary), 'Path dictionary is NOT exist.'
            # an explicitly given dictionary has priority, the system dictionary directory is not looked up
            self._mecab_dictionary_path = None
        else:
            self._mecab_dictionary_path = self.__check_mecab_dict_path()

        logger.info("mecab dictionary path is detected under {}".format(self._mecab_dictionary_path))
        self.mecabObj = self.__CallMecab()

    def __get_path_to_mecab_config(self):
        """You get path into mecab-config

        :return: the directory where `which mecab-config` finds the command
        """
        path_mecab_config = subprocess.check_output(['which', 'mecab-config'])
        if not six.PY2:
            # check_output returns bytes in python3
            path_mecab_config = path_mecab_config.decode(self.string_encoding)
        # take the parent directory instead of replacing text,
        # a directory whose name contains "/mecab-config" must stay as it is
        path_mecab_config_dir = os.path.dirname(path_mecab_config.strip())

        logger.info(msg='mecab-config is detected at {}'.format(path_mecab_config_dir))
        return path_mecab_config_dir

    def __check_shell_output(self, command, failure_message):
        """Run a command through the shell and return its output as text without line breaks at both ends.

        :param command: a command line string executed with shell=True
        :param failure_message: a message put into CalledProcessError when the command fails
        :raises subprocess.CalledProcessError: if the shell reports a failure
        """
        try:
            command_output = subprocess.check_output(command, shell=True)
        except subprocess.CalledProcessError:
            logger.error("{}".format(command))
            raise subprocess.CalledProcessError(returncode=-1, cmd=failure_message)

        if not six.PY2:
            # check_output returns bytes in python3
            command_output = command_output.decode(self.string_encoding)
        return command_output.strip('\n')

    def __check_mecab_dict_path(self):
        """check path to dict of Mecab in system environment

        :return: output of `mecab-config --dicdir`
        :raises SystemError: if the command prints nothing
        """
        mecab_dic_cmd = "echo `{} --dicdir`".format(os.path.join(self._path_mecab_config, 'mecab-config'))
        path_mecab_dict = self.__check_shell_output(mecab_dic_cmd, "Failed to execute mecab-config command")
        if path_mecab_dict == '':
            raise SystemError("""mecab dictionary path is not found with following command: {} 
            You are not able to use additional dictionary. 
            Still you are able to call mecab default dictionary""".format(mecab_dic_cmd))

        return path_mecab_dict

    def __check_mecab_libexe(self):
        """check path to libexec directory of Mecab in system environment

        :return: output of `mecab-config --libexecdir`
        :raises SystemError: if the command prints nothing
        """
        mecab_libexe_cmd = "echo `{} --libexecdir`".format(os.path.join(self._path_mecab_config, 'mecab-config'))
        path_mecab_libexe = self.__check_shell_output(mecab_libexe_cmd, "Failed to execute mecab-config --libexecdir")
        if path_mecab_libexe == '':
            raise SystemError("""Mecab config is not callable with following command: {} 
            You are not able to compile your user dictionary. 
            Still, you are able to use default mecab dictionary.""".format(mecab_libexe_cmd))

        return path_mecab_libexe

    def __CallMecab(self):
        """Initialize MeCab.Tagger with the options decided by the constructor arguments.

        :return: MeCab.Tagger object
        :raises subprocess.CalledProcessError: if MeCab.Tagger fails to initialize
        :raises Exception: if jumandic is requested in python2.x
        """
        if self._path_dictionary is not None and self._mecab_dictionary_path is None:
            logger.debug('Use dictionary you specified.')
            cmMecabInitialize = '-d {}'.format(self._path_dictionary)
        elif self._dictType == 'neologd':
            # use neologd
            logger.debug('Use neologd additional dictionary')
            cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "mecab-ipadic-neologd"))
        elif self._dictType == 'ipadic' or self._dictType == 'ipaddic':
            # use ipadic
            logger.debug('Use ipadic dictionary')
            cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "ipadic"))
        elif self._dictType == 'jumandic':
            # use jumandic. This is impossible to call in Python2.x
            if six.PY2:
                raise Exception('In python2.x, impossible to call jumandic.')
            logger.debug('Use jumandic dictionary')
            cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "jumandic"))
        else:
            logger.debug('Use no default dictionary')
            cmMecabInitialize = ''

        # execute compile if user dictionary is given
        if self._pathUserDictCsv is not None:
            logger.debug('Use User dictionary')
            pathUserDict = self.__CompileUserdict()
            cmMecabInitialize += ' -u {}'.format(pathUserDict)

        # in python2.x nodes are read one by one, the tagger is initialized with chasen output format
        if six.PY2:
            cmMecabCall = "-Ochasen {}".format(cmMecabInitialize)
        else:
            cmMecabCall = "{}".format(cmMecabInitialize)
        logger.debug(msg="mecab initialized with {}".format(cmMecabCall))

        try:
            mecabObj = MeCab.Tagger(cmMecabCall)
        except Exception as e:
            logger.error(e.args)
            logger.error("Possibly Path to userdict is invalid. Check the path")
            raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to initialize Mecab object")

        return mecabObj

    def __CompileUserdict(self):
        """Compile the user dictionary csv with mecab-dict-index.

        The compiled dictionary is written next to the csv file, with ".dict" extension.

        :return: path to the compiled user dictionary
        """
        path_mecab_dict = self.__check_mecab_dict_path()
        path_mecab_libexe = self.__check_mecab_libexe()
        # Swap only the file extension. Replacing the text "csv" in the whole path breaks
        # a path such as "/data/csv/userdict.csv", and leaves a path without "csv" unchanged,
        # which makes the compiled dictionary overwrite its own source.
        path_compiled_userdict = os.path.splitext(self._pathUserDictCsv)[0] + '.dict'

        cmCompileDict = u'{0}/mecab-dict-index -d {1}/ipadic -u {2} -f utf-8 -t utf-8 {3} > /dev/null'.format(
            path_mecab_libexe,
            path_mecab_dict,
            path_compiled_userdict,
            self._pathUserDictCsv)
        logger.debug(msg="compiling mecab user dictionary with: {}".format(cmCompileDict))
        try:
            return_code = subprocess.call(cmCompileDict, shell=True)
        except OSError as e:
            logger.error('type:' + str(type(e)))
            logger.error('args:' + str(e.args))
            sys.exit('Failed to compile mecab userdict. System ends')

        if return_code != 0:
            # keep going as before, initializing MeCab.Tagger reports the failure if the dictionary is unusable
            logger.error('mecab-dict-index finished with exit status {}'.format(return_code))

        return path_compiled_userdict

    def __feature_parser(self, uni_feature, word_surface):
        """
        Parse the POS feature output by Mecab
        :param uni_feature unicode: comma separated features of a word
        :param word_surface unicode: surface of the word. It is used as stem if features do not have 9 items
        :return ( (pos1, pos2, pos3), word_stem ):
        """
        list_feature_items = uni_feature.split(',')
        # if word has no feature at all. Note that both values are plain strings here, not tuples
        if len(list_feature_items) == 1:
            return '*', '*'

        # pad with '*' when only 2 features exist, so that 3 POS items are always available
        pos1, pos2, pos3 = (list_feature_items + ['*'])[:3]
        tuple_pos = (pos1, pos2, pos3)

        # if without constraint(output is normal mecab dictionary like)
        if len(list_feature_items) == 9:
            word_stem = list_feature_items[6]
        # if with constraint(output format depends on Usedict.txt)
        else:
            word_stem = word_surface

        return tuple_pos, word_stem

    def __postprocess_analyzed_result(self, string_mecab_parsed_result, is_feature, is_surface):
        # type: (text_type,bool,bool)->List[TokenizedResult]
        """Extract surface word and feature from analyzed lines.
        Extracted results are returned with list, whose elements are TokenizedResult class
        [TokenizedResult]

        Lines which are 'EOS' or which do not have a tab are skipped.
        """
        assert isinstance(string_mecab_parsed_result, str)

        tokenized_objects = [
            self.__result_parser(analyzed_line=analyzed_line,
                                 is_feature=is_feature,
                                 is_surface=is_surface)
            for analyzed_line in string_mecab_parsed_result.split('\n')
            if analyzed_line != 'EOS' and '\t' in analyzed_line
        ]

        assert isinstance(tokenized_objects, list)
        return tokenized_objects

    def __result_parser(self, analyzed_line, is_feature, is_surface):
        # type: (text_type,bool,bool)->TokenizedResult
        """Extract surface word and feature from analyzed line.
        Extracted elements are returned with TokenizedResult class
        """
        assert isinstance(analyzed_line, str)
        assert isinstance(is_feature, bool)
        assert isinstance(is_surface, bool)

        # a line is "surface<TAB>features", split at the first tab only
        surface, features = analyzed_line.split('\t', 1)
        tuple_pos, word_stem = self.__feature_parser(features, surface)
        tokenized_obj = TokenizedResult(
            node_obj=None,
            analyzed_line=analyzed_line,
            tuple_pos=tuple_pos,
            word_stem=word_stem,
            word_surface=surface,
            is_feature=is_feature,
            is_surface=is_surface
        )
        return tokenized_obj

    def tokenize(self, sentence,
                 normalized=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[str], str])->Union[List[str], TokenizedSenetence]
        """* What you can do
        - Call mecab tokenizer, and return tokenized objects

        :param sentence: input sentence. A byte string is decoded with string_encoding in python2.x
        :param normalized: this argument is not referred in this method. Normalization depends on func_normalizer
        :param is_feature: passed to TokenizedResult
        :param is_surface: passed to TokenizedResult
        :param return_list: if True, the result of convert_list_object() is returned instead of TokenizedSenetence
        :param func_normalizer: a function to normalize the sentence before tokenization.
            With None, neologdn.normalize is called if dictType is "neologd", otherwise no normalization is done.
        :raises Exception: if func_normalizer is None, dictType is "neologd" and neologdn package is not installed
        """
        if six.PY2 and isinstance(sentence, str):
            sentence = sentence.decode(self.string_encoding)

        # decide normalization function depending on dictType
        if func_normalizer is None and self._dictType == 'neologd':
            if not is_neologdn_valid:
                raise Exception("You could not call neologd dictionary bacause you do NOT install the package neologdn.")
            normalized_sentence = neologdn.normalize(sentence)
        elif func_normalizer == normalize_text:
            normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)
        elif func_normalizer is None:
            normalized_sentence = sentence
        else:
            normalized_sentence = func_normalizer(sentence)

        # don't delete this variable. The variable "encoded_text" protects sentence from deleting
        if six.PY2:
            encoded_text = normalized_sentence.encode(self.string_encoding)
        else:
            encoded_text = normalized_sentence

        if six.PY2:
            tokenized_objects = []
            node = self.mecabObj.parseToNode(encoded_text)
            # the first node is skipped, and the loop ends at the last node without processing it
            node = node.next
            while node.next is not None:
                word_surface = node.surface.decode(self.string_encoding)
                tuple_pos, word_stem = self.__feature_parser(node.feature.decode(self.string_encoding), word_surface)

                tokenized_objects.append(TokenizedResult(
                    node_obj=node,
                    tuple_pos=tuple_pos,
                    word_stem=word_stem,
                    word_surface=word_surface,
                    is_feature=is_feature,
                    is_surface=is_surface
                ))
                node = node.next
        else:
            parsed_result = self.mecabObj.parse(encoded_text)
            tokenized_objects = self.__postprocess_analyzed_result(
                string_mecab_parsed_result=parsed_result,
                is_feature=is_feature,
                is_surface=is_surface
            )

        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=tokenized_objects
        )  # type: TokenizedSenetence

        if return_list:
            return tokenized_sentence.convert_list_object()
        else:
            return tokenized_sentence

    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
        # type: (TokenizedSenetence, List[Tuple[str,...]], List[str]) -> FilteredObject
        """Filter tokens of a tokenized sentence. The work is delegated to parsed_sentence.filter

        :param parsed_sentence: TokenizedSenetence object
        :param pos_condition: list of part-of-speech conditions, or None
        :param stopwords: list of stopwords, or None
        """
        assert isinstance(parsed_sentence, TokenizedSenetence)
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))
        return parsed_sentence.filter(pos_condition, stopwords)


================================================
FILE: JapaneseTokenizer/object_models.py
================================================
#! -*- coding: utf-8 -*-
from typing import Callable
from six import text_type

class WrapperBase(object):
    """Base class of the tokenizer wrappers.

    It only declares the common interface. Both methods must be overridden by a subclass.
    """
    def tokenize(self,
                 sentence,
                 normalize,
                 is_feature,
                 is_surface,
                 return_list,
                 func_normalizer=None):
        # type: (text_type, bool, bool, bool, bool, Callable[[text_type], text_type])->None
        """Tokenize a sentence. A subclass implements this method.

        :raises NotImplementedError: always, in this base class
        """
        # NotImplemented is a constant for comparison methods, it is not an exception.
        # Raising it results in TypeError, so NotImplementedError is raised instead.
        raise NotImplementedError()

    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
        """Filter a tokenized sentence. A subclass implements this method.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError()


================================================
FILE: LICENSE.txt
================================================
Copyright 2017 Kensuke Mitsuzawa

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

================================================
FILE: MANIFEST.in
================================================
include README.md
include README_JP.md
include examples
include test
include install_tokenizers.sh
include LICENSE.txt
include Makefile


================================================
FILE: Makefile
================================================
# Run the tokenizer installation script (install_tokenizers.sh) with bash.
install:
	bash install_tokenizers.sh

# Download the mecab-ipadic-neologd tarball from GitHub, extract it,
# and run its installer script, answering "yes" to its prompt.
install_neologd:
	## run the installation of mecab-neologd
	wget --no-check-certificate https://github.com/neologd/mecab-ipadic-neologd/tarball/master -O mecab-ipadic-neologd.tar
	tar -xvf mecab-ipadic-neologd.tar
	mv neologd-mecab-ipadic-neologd-* neologd-mecab-ipadic-neologd && cd neologd-mecab-ipadic-neologd && ( echo yes | ./bin/install-mecab-ipadic-neologd )

================================================
FILE: README.md
================================================
[![MIT License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)](LICENSE)[![Build Status](https://travis-ci.org/Kensuke-Mitsuzawa/JapaneseTokenizers.svg?branch=master)](https://travis-ci.org/Kensuke-Mitsuzawa/JapaneseTokenizers)


# What's this?

This is a simple Python wrapper for Japanese Tokenizers (a.k.a. Tokenizer).

This project aims to call tokenizers and split a sentence into tokens as easily as possible.

This project also provides a common interface to various tokenization tools. Thus, it's easy to compare the output of various tokenizers.

This project is available also in [Github](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers).  

If you find any bugs, please report them in GitHub issues. Pull requests are also welcome!

# Requirements

- Python 2.7
- Python 3.x
    - checked in 3.5, 3.6, 3.7  


# Features

* simple/common interface among various tokenizers
* simple/common interface for filtering with stopwords or Part-of-Speech condition 
* simple interface to add user-dictionary(mecab only)

## Supported Tokenizers

### Mecab

[Mecab](http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html?sess=3f6a4f9896295ef2480fa2482de521f6) is an open-source tokenizer system for various languages (if you have a dictionary for it).

See [english documentation](https://github.com/jordwest/mecab-docs-en) for detail

### Juman

[Juman](http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?JUMAN) is a tokenizer system developed by Kurohashi laboratory, Kyoto University, Japan.

Juman handles ambiguous writing styles in Japanese well, and it handles newly coined words well thanks to its huge Web-based dictionary.
 
And, Juman tells you semantic meaning of words.

### Juman++

[Juman++](http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?JUMAN++) is a tokenizer system developed by Kurohashi laboratory, Kyoto University, Japan.

Juman++ is the successor of Juman. It adopts an RNN model for tokenization.

Juman++ handles ambiguous writing styles in Japanese well, and it handles newly coined words well thanks to its huge Web-based dictionary.
 
And, Juman tells you semantic meaning of words.

Note: New Juman++ dev-version(later than 2.x) is available at [Github](https://github.com/ku-nlp/jumanpp)


### Kytea

[Kytea](http://www.phontron.com/kytea/) is a tokenizer tool developed by Graham Neubig.

Kytea uses a different algorithm from those of Mecab and Juman.

 
# Setting up

## Tokenizers auto-install

```
make install
```

### mecab-neologd dictionary auto-install

```
make install_neologd
```

## Tokenizers manual-install

### MeCab

See [here](https://github.com/jordwest/mecab-docs-en) to install MeCab system.

### Mecab Neologd dictionary

Mecab-neologd dictionary is a dictionary-extension based on ipadic-dictionary, which is basic dictionary of Mecab.

With the Mecab-neologd dictionary, you're able to parse a newly coined word as one token.

Here, newly coined words are, for example, names of movie actors or companies.

See [here](https://github.com/neologd/mecab-ipadic-neologd) and install mecab-neologd dictionary.

### Juman

```
wget -O juman7.0.1.tar.bz2 "http://nlp.ist.i.kyoto-u.ac.jp/DLcounter/lime.cgi?down=http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2&name=juman-7.01.tar.bz2"
bzip2 -dc juman7.0.1.tar.bz2  | tar xvf -
cd juman-7.01
./configure
make   
[sudo] make install
```    
    

## Juman++

* GCC version must be >= 5

```
wget http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz
tar xJvf jumanpp-1.02.tar.xz
cd jumanpp-1.02/
./configure
make
[sudo] make install
```
    
## Kytea

Install Kytea system

```
wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz
tar -xvf kytea-0.4.7.tar.gz
cd kytea-0.4.7
./configure
make
make install
```    


Kytea has [python wrapper](https://github.com/chezou/Mykytea-python) thanks to michiaki ariga.
Install Kytea-python wrapper

```
pip install kytea
```
    

## install

```
[sudo] python setup.py install
```

### Note

During installation, you may see a warning message when it fails to install `pyknp` or `kytea`.

If you see these messages, try to re-install these packages manually.

# Usage

Tokenization example (for Python 3.x; to see example code for Python 2.x, please see [here](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers/blob/master/examples/examples.py))

```
import JapaneseTokenizer
input_sentence = '10日放送の「中居正広のミになる図書館」（テレビ朝日系）で、SMAPの中居正広が、篠原信一の過去の勘違いを明かす一幕があった。'
# ipadic is well-maintained dictionary #
mecab_wrapper = JapaneseTokenizer.MecabWrapper(dictType='ipadic')
print(mecab_wrapper.tokenize(input_sentence).convert_list_object())

# neologd is automatically-generated dictionary from huge web-corpus #
mecab_neologd_wrapper = JapaneseTokenizer.MecabWrapper(dictType='neologd')
print(mecab_neologd_wrapper.tokenize(input_sentence).convert_list_object())
```


## Filtering example

```
import JapaneseTokenizer
# with word filtering by stopword & part-of-speech condition #
print(mecab_wrapper.tokenize(input_sentence).filter(stopwords=['テレビ朝日'], pos_condition=[('名詞', '固有名詞')]).convert_list_object())
```


## Part-of-speech structure

Mecab, Juman, Kytea have different system of Part-of-Speech(POS).

You can check tables of Part-of-Speech(POS) [here](http://www.unixuser.org/~euske/doc/postag/)


# Similar Package


## natto-py

natto-py is sophisticated package for tokenization. It supports following features

* easy interface for tokenization
* importing additional dictionary
* partial parsing mode

# LICENSE

MIT license

# For developers

You could build an environment which has dependencies to test this package.

Simply, you build docker image and run docker container.

## Dev environment

Develop environment is defined with `test/docker-compose-dev.yml`.

With the docker-compose.yml file, you could call python2.7 or python3.7

If you're using Pycharm Professional edition, you could set docker-compose.yml as remote interpreter.

To call python2.7, set `/opt/conda/envs/p27/bin/python2.7`

To call python3.7, set `/opt/conda/envs/p37/bin/python3.7`

## Test environment

These commands check the whole procedure, from installing the package to running its tests.

```bash
$ docker-compose build
$ docker-compose up
```


================================================
FILE: examples/examples.py
================================================
#! -*- coding: utf-8 -*-
import sys
import os
from JapaneseTokenizer import JumanWrapper
from JapaneseTokenizer import JumanppWrapper
from JapaneseTokenizer import MecabWrapper
from JapaneseTokenizer import KyteaWrapper
from JapaneseTokenizer.datamodels import TokenizedResult
from JapaneseTokenizer import init_logger
import logging
import socket
import six
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
__author__ = 'kensuke-mi'
logger.setLevel(logging.DEBUG)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# examples (each function handles both python2.x and python3.x)

def basic_example():
    """Tokenize one sentence with Mecab, Juman and Juman++, and log the tokens."""
    # ========================================================
    # TOKENIZE
    # ========================================================
    if not (six.PY2 or six.PY3):
        raise Exception()
    # The "u" prefix gives `unicode` type in python2.x, and it is a normal `str` in python3.x
    sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # make wrapper objects.
    # For MecabWrapper, dictType selects a dictionary such as "neologd" or "ipadic".
    # KyteaWrapper is also available with the same interface, it is not called in this example.
    mecab_wrapper = MecabWrapper(dictType="neologd")
    juman_wrapper = JumanWrapper()
    jumanpp_wrapper = JumanppWrapper()

    # tokenize sentence into list of token.
    # with is_feature=True, you get part-of-speech tag also. in this case, you get tuple ( token, (part-of-speech-tags) )
    # with is_surface=True, you get surface form of token (in other words, not normalized token)
    token_sequences = [
        wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).convert_list_object()
        for wrapper in (mecab_wrapper, juman_wrapper, jumanpp_wrapper)
    ]

    # log the result of mecab, juman and juman++ in this order
    for seq_tokens in token_sequences:
        logger.debug(seq_tokens)

def filtering_example():
    """Tokenize one sentence, filter the tokens by stopwords and POS conditions, and log the rest."""
    if not (six.PY2 or six.PY3):
        raise Exception()
    # The "u" prefix gives `unicode` type in python2.x, and it is a normal `str` in python3.x
    sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
    stopwords = [u'テヘラン']
    pos_condition_ipadic = [(u'名詞', u'固有名詞'), (u'名詞', u'一般')]
    pos_condition_juman = [(u'名詞', u'固有名詞'), (u'名詞', u'普通名詞')]
    # a condition for KyteaWrapper. Kytea is not called in this example
    pos_condition_kytea = [(u'名詞',)]

    # ========================================================
    # FILTERING
    # ========================================================
    # you can filter tokens by stopwords or POS conditions
    # stopwords is a list object

    mecab_wrapper = MecabWrapper(dictType="neologd")
    juman_wrapper = JumanWrapper()
    jumanpp_wrapper = JumanppWrapper()

    tokenized_mecab = mecab_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False)
    filtered_mecab = tokenized_mecab.filter(pos_condition=pos_condition_ipadic, stopwords=stopwords)
    seq_tokens_mecab = filtered_mecab.convert_list_object()

    tokenized_juman = juman_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False)
    filtered_juman = tokenized_juman.filter(pos_condition=pos_condition_juman, stopwords=stopwords)
    seq_tokens_juman = filtered_juman.convert_list_object()

    tokenized_jumanpp = jumanpp_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False)
    filtered_jumanpp = tokenized_jumanpp.filter(pos_condition=pos_condition_juman, stopwords=stopwords)
    seq_tokens_jumanpp = filtered_jumanpp.convert_list_object()

    for seq_tokens in (seq_tokens_mecab, seq_tokens_juman, seq_tokens_jumanpp):
        logger.debug(seq_tokens)


def advanced_example_mecab():
    """Tokenize one sentence with Mecab and a user dictionary, and log attributes of the tokens."""
    if not (six.PY2 or six.PY3):
        raise Exception()
    # The "u" prefix gives `unicode` type in python2.x, and it is a normal `str` in python3.x
    sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # ========================================================
    # USE YOUR OWN DICTIONARY
    # with your own dictionary, you can force Mecab to make some word into one token
    # ========================================================
    # make your own "user dictionary" with CSV file
    # To know more about this file, see this page(sorry, Japanese only) https://mecab.googlecode.com/svn/trunk/mecab/doc/dic.html
    # The csv file is looked up next to this script, so that the example runs from any working directory
    example_user_dict = os.path.join(os.path.dirname(os.path.abspath(__file__)), "userdict.csv")

    # dictType='user' and dictType='all' are deprecated, MecabWrapper raises Exception for them.
    # You just give pathUserDictCsv together with a normal dictionary type.
    tokenized_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=example_user_dict).tokenize(sentence)

    for token_obj in tokenized_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        if token_obj.word_stem == u'ペルシア語':
            logger.debug(token_obj.word_stem)

        ## TokenizedResult class has attributes of tokenized result ##
        logger.debug((token_obj.analyzed_line,
                      token_obj.word_surface,
                      token_obj.word_stem,
                      token_obj.tuple_pos))


def advanced_example_juman():
    """Tokenize a sample sentence through a JUMAN server, if one is reachable.

    JUMAN must already be running in server mode on ``localhost:32000``.
    The function first probes that address with a plain TCP connection; when
    the connection fails it logs a message and returns without tokenizing.

    Only connection errors are treated as "server not running". Errors raised
    by the tokenizer itself are no longer swallowed, so real failures stay
    visible.
    """
    if six.PY2:
        sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
        pos_condition = [(u'名詞',)]
    elif six.PY3:
        sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
        pos_condition = [('名詞',)]
    else:
        raise Exception()

    ### You can call juman with server mode. You must start JUMAN as server mode beforehand ###
    HOST = 'localhost'
    PORT = 32000

    # Probe the server. The socket is closed in every case so that a failed
    # connection attempt does not leak the file descriptor.
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.connect((HOST, PORT))
    except socket.error:
        # socket.error is an alias of OSError on Python 3 and works on Python 2.
        logger.info(msg='Juman server is not running. Skip it.')
        return
    finally:
        s.close()

    juman_wrapper = JumanWrapper(server=HOST, port=PORT)
    tokens_list = juman_wrapper.tokenize(sentence, return_list=False).filter(pos_condition=pos_condition).convert_list_object()
    assert isinstance(tokens_list, list)


if __name__ == "__main__":
    # Run every example, in the same order as they are defined above.
    for run_example in (basic_example,
                        filtering_example,
                        advanced_example_mecab,
                        advanced_example_juman):
        run_example()

================================================
FILE: examples/userdict.csv
================================================
ペルシア語,-1,-1,-400,名詞,一般,*,*,*,*,ぺるしあご,*,*,*

================================================
FILE: install_tokenizers.sh
================================================
#!/bin/bash
# Detect the platform and choose the JUMAN helper directory.
# Only macOS (Darwin) and Linux are supported; anything else aborts the script.
os_type=$(uname)
echo "os-type is ${os_type}"
case "${os_type}" in
    Darwin)
        # macOS: use the path under /usr/local/opt if it exists,
        # otherwise fall back to the default /usr/local location.
        juman_utils_bin="/usr/local/opt/juman/libexec/juman/"
        if [ ! -e "${juman_utils_bin}" ]; then
            juman_utils_bin="/usr/local/libexec/juman/"
        fi
        ;;
    Linux)
        juman_utils_bin="/usr/local/libexec/juman/"
        ;;
    *)
        echo "Your platform ($(uname -a)) is not supported."
        exit 1
        ;;
esac

# Remember where the script was started. Archives are downloaded here and
# the cleanup at the end of the script expects to run from this directory.
WORK_DIR=`pwd`

# Probe for mecab. Exit status 127 means the shell could not find the command.
echo 'これはテスト' | mecab
is_mecab_install=$?

if [ $is_mecab_install -eq 127 ]; then
    ## mecab
    wget -O mecab-0.996.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE"
    tar zxvf mecab-0.996.tar.gz
    cd mecab-0.996 && ./configure && make && make install
    cd "$WORK_DIR"

    ### run ldconfig after installing mecab
    ldconfig

    ## mecab ipadic
    wget -O mecab-ipadic-2.7.0-20070801.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM"
    tar zxvf mecab-ipadic-2.7.0-20070801.tar.gz
    cd mecab-ipadic-2.7.0-20070801 && ./configure --with-charset=utf8 && make && make install
    # Go back to the start directory; otherwise the following installs and the
    # final cleanup would run inside the ipadic source tree.
    cd "$WORK_DIR"
    # smoke test after installation
    echo 'インストール後のテスト' | mecab
fi

# Probe for juman. Exit status 127 means the shell could not find the command.
echo 'これはテスト' | juman
is_juman_install=$?

if [ $is_juman_install -eq 127 ]; then
    # Download and build from the start directory, wherever earlier steps ended up.
    cd "$WORK_DIR"

    ## juman
    wget -O juman7.0.1.tar.bz2 "http://nlp.ist.i.kyoto-u.ac.jp/DLcounter/lime.cgi?down=http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2&name=juman-7.01.tar.bz2"
    bzip2 -dc juman7.0.1.tar.bz2  | tar xvf -
    cd juman-7.01 && ./configure && make && make install
    # Leave the source tree so later steps and the cleanup see the start directory.
    cd "$WORK_DIR"

    # run ldconfig after installation
    ldconfig
    # smoke test after installation
    echo 'インストール後のテスト' | juman
fi

# Probe for jumanpp. Exit status 127 means the shell could not find the command.
echo 'これはテスト' | jumanpp
is_jumanpp_install=$?

if [ $is_jumanpp_install -eq 127 ]; then
    # Download and build from the start directory, wherever earlier steps ended up.
    cd "$WORK_DIR"

    # jumanpp
    wget http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.01.tar.xz
    tar xJvf jumanpp-1.01.tar.xz
    cd jumanpp-1.01/ && ./configure && make && make install
    # Leave the source tree so later steps and the cleanup see the start directory.
    cd "$WORK_DIR"
    # TODO: run the jumanpp server start-up script

    # run ldconfig after installation
    ldconfig
    # smoke test after installation
    echo 'インストール後のテスト' | jumanpp
fi


# Probe for kytea. Exit status 127 means the shell could not find the command.
echo 'これはテスト' | kytea
is_kytea_install=$?

if [ $is_kytea_install -eq 127 ]; then
    # Download and build from the start directory, wherever earlier steps ended up.
    cd "$WORK_DIR"

    # kytea
    wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz -O kytea-0.4.7.tar.gz
    tar -xvf kytea-0.4.7.tar.gz
    cd kytea-0.4.7 && ./configure && make && make install
    # Leave the source tree so the cleanup sees the start directory.
    cd "$WORK_DIR"
    # run ldconfig after installation
    ldconfig
    # smoke test after installation
    echo 'インストール後のテスト' | kytea
fi


# ---------------------------------------------------------------------------
# Cleanup: remove downloaded archives and extracted source trees.
#
# All downloads land in the start directory, so return there first.
# `rm -f` / `rm -rf` succeed silently when nothing matches, so no existence
# test is needed. A `[ -f glob ]` test breaks when the glob expands to more
# than one path (e.g. the mecab and mecab-ipadic archives together).
# ---------------------------------------------------------------------------
cd "$WORK_DIR" || exit 1

# downloaded archives
rm -f ./juman7.0.1.tar.bz2
# matches both mecab-0.996.tar.gz and mecab-ipadic-*.tar.gz
rm -f ./mecab-*.tar.gz
rm -f ./jumanpp-1.01.tar.xz
# the archive is downloaded as .tar.gz, not .tar
rm -f ./kytea-0.4.7.tar.gz

# extracted source trees
rm -rf ./juman-7*
rm -rf ./mecab-0*
rm -rf ./mecab-ipadic-*
rm -rf ./jumanpp-1.01
rm -rf ./kytea-0.4.7

================================================
FILE: setup.py
================================================
#! -*- coding: utf-8 -*-
from setuptools import setup, find_packages
import sys
import logging
import codecs
logger = logging.getLogger(__file__)

python_version = sys.version_info

# --------------------------------------------------------------------------------------------------------
# Try to install kytea automatically, because installing it as a regular
# dependency often fails. A failure here is only logged; setup continues.
try:
    import Mykytea
except ImportError:
    try:
        import subprocess
        # Use the running interpreter's pip so the package lands in the same environment.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kytea'])
        import Mykytea
    except Exception as e:
        logger.error('We failed to install mykytea automatically. Try installing kytea manually.')
        logger.error(e)

# --------------------------------------------------------------------------------------------------------
# Same best-effort installation for neologdn.
try:
    import neologdn
except ImportError:
    try:
        import subprocess
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'neologdn'])
        import neologdn
    except Exception as e:
        # The message names neologdn, the package that actually failed to install.
        logger.error('We failed to install neologdn automatically because of some issues in the package. Try installing neologdn manually.')
        logger.error(e)

# --------------------------------------------------------------------------------------------------------

# Runtime dependencies shared by every supported interpreter.
common_packages = ['pypandoc', 'future', 'six', 'jaconv>=0.2', 'pip>=8.1.0', 'pexpect', 'pyknp>=0.4.1']
if python_version >= (3, 0, 0):
    # `typing` joined the standard library in Python 3.5; older 3.x needs the backport.
    if python_version < (3, 5):
        common_packages.append('typing')
    # Every Python 3 interpreter needs the MeCab binding. Previously
    # Python 3.0-3.4 got `typing` but no MeCab binding at all.
    common_packages.append('mecab-python3')
elif python_version <= (2, 9, 9):
    # Python 2: typing backport plus the Python 2 MeCab binding.
    common_packages.append('typing')
    common_packages.append('mecab-python')
else:
    raise NotImplementedError()

version = '1.6'
name = 'JapaneseTokenizer'
short_description = '`JapaneseTokenizer` is a package for easy Japanese Tokenization'

# Build the long description from README.md. Convert it to reStructuredText
# when pypandoc (and pandoc) are usable, otherwise fall back to the raw file.
try:
    import pypandoc
    # Newer pypandoc releases dropped the deprecated `convert`; prefer
    # `convert_file` and fall back to `convert` for old releases.
    _convert_readme = getattr(pypandoc, 'convert_file', None) or pypandoc.convert
    long_description = _convert_readme('README.md', 'rst')
except (IOError, OSError, ImportError, AttributeError, RuntimeError):
    # Close the file deterministically instead of leaving it to the GC.
    with codecs.open('README.md', 'r', 'utf-8') as readme_file:
        long_description = readme_file.read()

# Trove classifiers for the package.
classifiers = [
        "Development Status :: 5 - Production/Stable",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python",
        "Natural Language :: Japanese",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
        "Programming Language :: Python :: 2.7",
        "Programming Language :: Python :: 3.5"
        ]

setup(
    author='Kensuke Mitsuzawa',
    author_email='kensuke.mit@gmail.com',
    name = name,
    version=version,
    # `description` is the keyword setuptools uses for the one-line summary;
    # `short_description` is not a recognized option and was ignored.
    description=short_description,
    long_description=long_description,
    keywords=['MeCab', '和布蕪', 'Juman',
                'Japanese morphological analyzer', 'NLP', '形態素解析', '自然言語処理'],
    license="MIT",
    url = "https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers",
    test_suite='test.test_all.suite',
    install_requires=common_packages,
    tests_require=common_packages,
    packages=find_packages()
)


================================================
FILE: test/Dockerfile
================================================
FROM frolvlad/alpine-glibc:alpine-3.6
MAINTAINER kensuke-mi <kensuke.mit@gmail.com>

# Mecab install
ENV MECAB_VERSION 0.996
ENV IPADIC_VERSION 2.7.0-20070801
ENV mecab_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE
ENV ipadic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM
ENV build_deps 'curl git bash file sudo openssh gcc make build-base'
ENV dependencies 'openssl'

ENV PATH=/opt/conda/bin:$PATH \
    LANG=C.UTF-8 \
    MINICONDA=Miniconda3-latest-Linux-x86_64.sh
# apk update
RUN apk update

# mecab
# Builds MeCab, the IPA dictionary and the NEologd dictionary from source,
# then removes the downloaded archives and source trees.
RUN apk add --update --no-cache ${build_deps} \
  # Install dependencies
  && apk add --update --no-cache ${dependencies} \
  # Install MeCab
  && curl -SL -o mecab-${MECAB_VERSION}.tar.gz ${mecab_url} \
  && tar zxf mecab-${MECAB_VERSION}.tar.gz \
  && cd mecab-${MECAB_VERSION} \
  && ./configure --enable-utf8-only --with-charset=utf8 \
  && make \
  && make install \
  && cd \
  # Install IPA dic
  && curl -SL -o mecab-ipadic-${IPADIC_VERSION}.tar.gz ${ipadic_url} \
  && tar zxf mecab-ipadic-${IPADIC_VERSION}.tar.gz \
  && cd mecab-ipadic-${IPADIC_VERSION} \
  && ./configure --with-charset=utf8 \
  && make \
  && make install \
  && cd \
  # Install Neologd
  && git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git \
  && mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y \
  # Clean up. The IPA dic files are named mecab-ipadic-<version>, so the
  # pattern needs the "ipadic-" part to match them.
  && rm -rf \
    mecab-${MECAB_VERSION}* \
    mecab-ipadic-${IPADIC_VERSION}* \
    mecab-ipadic-neologd

# general
RUN apk --no-cache add vim \
wget \
lsof \
curl \
bash \
swig \
gcc \
build-base \
make \
python-dev \
py-pip \
jpeg-dev \
zlib-dev \
git \
linux-headers
ENV LIBRARY_PATH=/lib:/usr/lib

ENV PLANTUML_VERSION 1.2017.18
ENV PLANTUML_DOWNLOAD_URL https://sourceforge.net/projects/plantuml/files/plantuml.$PLANTUML_VERSION.jar/download
ENV PANDOC_VERSION 1.19.2.4
ENV PANDOC_DOWNLOAD_URL https://hackage.haskell.org/package/pandoc-$PANDOC_VERSION/pandoc-$PANDOC_VERSION.tar.gz
ENV PANDOC_ROOT /usr/local/pandoc

ENV PATH $PATH:$PANDOC_ROOT/bin

# Create Pandoc build space
RUN mkdir -p /pandoc-build
WORKDIR /pandoc-build

# Install/Build Packages
# Downloads the PlantUML jar, builds pandoc from its Hackage source tarball
# with cabal into $PANDOC_ROOT, then creates the www-data user/group and /var/docs.
# NOTE(review): $BUILD_DEPS, $PERSISTENT_DEPS and $EDGE_DEPS are not set by any
# ENV in this Dockerfile (only lowercase build_deps/dependencies are), so unless
# the base image defines them they expand to nothing here -- confirm which
# packages were intended (presumably the ones providing cabal/ghc).
# NOTE(review): the `;` after `set -x` ends the `&&` chain, so addgroup/adduser
# and the commands after them run even when an earlier step failed, and the
# RUN status is decided by the last list only -- confirm this is intended.
# NOTE(review): `rmdir /pandoc-build` removes the directory set as WORKDIR above.
RUN apk upgrade --update && \
    apk add --no-cache --virtual .build-deps $BUILD_DEPS && \
    apk add --no-cache --virtual .persistent-deps $PERSISTENT_DEPS && \
    curl -fsSL "$PLANTUML_DOWNLOAD_URL" -o /usr/local/plantuml.jar && \
    apk add --no-cache --virtual .edge-deps $EDGE_DEPS -X http://dl-cdn.alpinelinux.org/alpine/edge/community && \
    curl -fsSL "$PANDOC_DOWNLOAD_URL" | tar -xzf - && \
        ( cd pandoc-$PANDOC_VERSION && cabal update && cabal install --only-dependencies && \
        cabal configure --prefix=$PANDOC_ROOT && \
        cabal build && \
        cabal copy && \
        cd .. ) && \
    rm -Rf pandoc-$PANDOC_VERSION/ && \
    rm -Rf /root/.cabal/ /root/.ghc/ && \
    rmdir /pandoc-build && \
    set -x; \
    addgroup -g 82 -S www-data; \
    adduser -u 82 -D -S -G www-data www-data && \
    mkdir -p /var/docs && \
    apk del .build-deps .edge-deps

# Juman
RUN wget http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2 \
    && tar xvf juman-7.01.tar.bz2 \
    && cd juman-7.01 \
    && ./configure \
    && make \
    && make install \
    && cd .. \
    && rm -rf juman-7.01 \
    && rm juman-7.01.tar.bz2

# Juman++
RUN apk add --update --no-cache --virtual=build-deps \
    boost-dev g++ make \
    && wget -q http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz \
    && tar Jxfv jumanpp-1.02.tar.xz \
    && cd jumanpp-1.02/ \
    && ./configure \
    && make \
    && make install \
    && cd .. \
    && rm jumanpp-1.02.tar.xz \
    && rm -rf /var/cache/* \
    && apk del build-deps \
    && apk add --update --no-cache boost

# kytea
RUN wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz \
    && tar -xvf kytea-0.4.7.tar.gz \
    && cd kytea-0.4.7 \
    && ./configure \
    && make \
    && make install

# Python
RUN apk add --no-cache bash wget && \
    wget -q --no-check-certificate https://repo.continuum.io/miniconda/$MINICONDA && \
    bash $MINICONDA -b -p /opt/conda && \
    ln -s /opt/conda/bin/* /usr/local/bin/ && \
    rm -rf /root/.[acpw]* $MINICONDA /opt/conda/pkgs/*

RUN conda config --add channels conda-forge --system
RUN conda create -y -n p27 python=2.7
RUN conda create -y -n p36 python=3.6
RUN conda create -y -n p37 python=3.7

#RUN source activate p27
#RUN source deactivate

CMD ["/bin/bash"]

================================================
FILE: test/Dockerfile-dev
================================================
FROM frolvlad/alpine-glibc:alpine-3.6
MAINTAINER kensuke-mi <kensuke.mit@gmail.com>

# Mecab install
ENV MECAB_VERSION 0.996
ENV IPADIC_VERSION 2.7.0-20070801
ENV mecab_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE
ENV ipadic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM
ENV jumandic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM
ENV unidic_url https://unidic.ninjal.ac.jp/unidic_archive/cwj/2.3.0/unidic-cwj-2.3.0.zip
ENV build_deps 'curl git bash file sudo openssh gcc make build-base'
ENV dependencies 'openssl'

ENV PATH=/opt/conda/bin:$PATH \
    LANG=C.UTF-8 \
    MINICONDA=Miniconda3-latest-Linux-x86_64.sh
# apk update
RUN apk update

# mecab
# Builds MeCab, the IPA dictionary, the NEologd dictionary and the JUMAN
# dictionary from source, then removes the downloaded archives and source trees.
RUN apk add --update --no-cache ${build_deps} \
  # Install dependencies
  && apk add --update --no-cache ${dependencies} \
  # Install MeCab
  && curl -SL -o mecab-${MECAB_VERSION}.tar.gz ${mecab_url} \
  && tar zxf mecab-${MECAB_VERSION}.tar.gz \
  && cd mecab-${MECAB_VERSION} \
  && ./configure --enable-utf8-only --with-charset=utf8 \
  && make \
  && make install \
  && cd \
  # Install IPA dic
  && curl -SL -o mecab-ipadic-${IPADIC_VERSION}.tar.gz ${ipadic_url} \
  && tar zxf mecab-ipadic-${IPADIC_VERSION}.tar.gz \
  && cd mecab-ipadic-${IPADIC_VERSION} \
  && ./configure --with-charset=utf8 \
  && make \
  && make install \
  && cd \
  # Install Neologd
  && git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git \
  && mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y \
  # Install jumandic
  && curl -SL -o jumandic.tar.gz ${jumandic_url} \
  && tar zxf jumandic.tar.gz \
  && cd mecab-jumandic-7.0-20130310 \
  && ./configure --with-charset=utf8 \
  && make \
  && make install \
  # Delete archives and source trees. The IPA dic files are named
  # mecab-ipadic-<version>, so the pattern needs the "ipadic-" part to match;
  # the jumandic archive is removed as well.
  && cd \
  && rm -rf \
    mecab-${MECAB_VERSION}* \
    mecab-ipadic-${IPADIC_VERSION}* \
    mecab-ipadic-neologd \
    jumandic.tar.gz \
    mecab-jumandic-7.0-20130310

# general
RUN apk --no-cache add vim \
wget \
lsof \
curl \
bash \
swig \
gcc \
build-base \
make \
python-dev \
py-pip \
jpeg-dev \
zlib-dev \
git \
linux-headers
ENV LIBRARY_PATH=/lib:/usr/lib

ENV PLANTUML_VERSION 1.2017.18
ENV PLANTUML_DOWNLOAD_URL https://sourceforge.net/projects/plantuml/files/plantuml.$PLANTUML_VERSION.jar/download
ENV PANDOC_VERSION 1.19.2.4
ENV PANDOC_DOWNLOAD_URL https://hackage.haskell.org/package/pandoc-$PANDOC_VERSION/pandoc-$PANDOC_VERSION.tar.gz
ENV PANDOC_ROOT /usr/local/pandoc

ENV PATH $PATH:$PANDOC_ROOT/bin

# Create Pandoc build space
RUN mkdir -p /pandoc-build
WORKDIR /pandoc-build

# Install/Build Packages
# Downloads the PlantUML jar, builds pandoc from its Hackage source tarball
# with cabal into $PANDOC_ROOT, then creates the www-data user/group and /var/docs.
# NOTE(review): $BUILD_DEPS, $PERSISTENT_DEPS and $EDGE_DEPS are not set by any
# ENV in this Dockerfile (only lowercase build_deps/dependencies are), so unless
# the base image defines them they expand to nothing here -- confirm which
# packages were intended (presumably the ones providing cabal/ghc).
# NOTE(review): the `;` after `set -x` ends the `&&` chain, so addgroup/adduser
# and the commands after them run even when an earlier step failed, and the
# RUN status is decided by the last list only -- confirm this is intended.
# NOTE(review): `rmdir /pandoc-build` removes the directory set as WORKDIR above.
RUN apk upgrade --update && \
    apk add --no-cache --virtual .build-deps $BUILD_DEPS && \
    apk add --no-cache --virtual .persistent-deps $PERSISTENT_DEPS && \
    curl -fsSL "$PLANTUML_DOWNLOAD_URL" -o /usr/local/plantuml.jar && \
    apk add --no-cache --virtual .edge-deps $EDGE_DEPS -X http://dl-cdn.alpinelinux.org/alpine/edge/community && \
    curl -fsSL "$PANDOC_DOWNLOAD_URL" | tar -xzf - && \
        ( cd pandoc-$PANDOC_VERSION && cabal update && cabal install --only-dependencies && \
        cabal configure --prefix=$PANDOC_ROOT && \
        cabal build && \
        cabal copy && \
        cd .. ) && \
    rm -Rf pandoc-$PANDOC_VERSION/ && \
    rm -Rf /root/.cabal/ /root/.ghc/ && \
    rmdir /pandoc-build && \
    set -x; \
    addgroup -g 82 -S www-data; \
    adduser -u 82 -D -S -G www-data www-data && \
    mkdir -p /var/docs && \
    apk del .build-deps .edge-deps

# Juman
RUN wget http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2 \
    && tar xvf juman-7.01.tar.bz2 \
    && cd juman-7.01 \
    && ./configure \
    && make \
    && make install \
    && cd .. \
    && rm -rf juman-7.01 \
    && rm juman-7.01.tar.bz2

# Juman++
RUN apk add --update --no-cache --virtual=build-deps \
    boost-dev g++ make \
    && wget -q http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz \
    && tar Jxfv jumanpp-1.02.tar.xz \
    && cd jumanpp-1.02/ \
    && ./configure \
    && make \
    && make install \
    && cd .. \
    && rm jumanpp-1.02.tar.xz \
    && rm -rf /var/cache/* \
    && apk del build-deps \
    && apk add --update --no-cache boost

# kytea
RUN wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz \
    && tar -xvf kytea-0.4.7.tar.gz \
    && cd kytea-0.4.7 \
    && ./configure \
    && make \
    && make install

# Python
RUN apk add --no-cache bash wget && \
    wget -q --no-check-certificate https://repo.continuum.io/miniconda/$MINICONDA && \
    bash $MINICONDA -b -p /opt/conda && \
    ln -s /opt/conda/bin/* /usr/local/bin/ && \
    rm -rf /root/.[acpw]* $MINICONDA /opt/conda/pkgs/*

RUN conda config --add channels conda-forge --system
RUN conda create -y -n p27 python=2.7
RUN conda create -y -n p37 python=3.7

RUN mkdir /code
RUN mkdir /code/dev
COPY requirements_py2.txt /code/dev/requirements_py2.txt
COPY requirements_py3.txt /code/dev/requirements_py3.txt

RUN source activate p27 && pip install -r /code/dev/requirements_py2.txt
RUN source deactivate

RUN source activate p37 && pip install -r /code/dev/requirements_py3.txt
RUN source deactivate

CMD ["/bin/bash"]

================================================
FILE: test/__init__.py
================================================
__author__ = 'kensuke-mi'


================================================
FILE: test/common/__init__.py
================================================


================================================
FILE: test/common/test_server_handler.py
================================================
#! -*- coding: utf-8 -*-
# test module
from JapaneseTokenizer.common import sever_handler
# client module
import six
if six.PY2:
    from JapaneseTokenizer.jumanpp_wrapper.__jumanpp_wrapper_python2 import JumanppWrapper
else:
    from JapaneseTokenizer.jumanpp_wrapper.__jumanpp_wrapper_python3 import JumanppWrapper
# else
import sys
import unittest
import os
import time

__author__ = 'kensuke-mi'


class TestServerHandler(unittest.TestCase):
    """Tests for the Juman++ process handler in ``sever_handler``.

    Every test that starts a handler stops it in a ``finally`` block, so the
    process is shut down even when an assertion fails.
    """

    @classmethod
    def setUpClass(cls):
        # Use a text literal of the interpreter's native unicode type.
        if six.PY3:
            cls.test_senetence = '紗倉 まな（さくらまな、1993年3月23日 - ）は、日本のAV女優。'
        else:
            cls.test_senetence = u'紗倉 まな（さくらまな、1993年3月23日 - ）は、日本のAV女優。'

        cls.jumanpp_command = "/usr/local/bin/jumanpp"

    def test_jumanpp_process_hanlder_normal(self):
        """It tests jumanpp process handler"""
        jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command)
        try:
            result_jumanpp_analysis = jumanpp_process_handler.query(input_string=self.test_senetence)
            self.assertTrue(isinstance(result_jumanpp_analysis, six.text_type))
        finally:
            ## stop process ##
            jumanpp_process_handler.stop_process()

    def test_jumanpp_process_handler_timeout_exception(self):
        """It tests the case which causes timeout exception"""
        # Build the handler outside assertRaises: only the query is expected
        # to fail, and the handler must exist for the cleanup below.
        jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command,
                                                               timeout_second=1)
        try:
            with self.assertRaises(Exception):
                jumanpp_process_handler.query(input_string=self.test_senetence*100)
        finally:
            jumanpp_process_handler.stop_process()

    def test_jumanpp_process_handler_init_exception(self):
        """It tests that an invalid command makes the constructor raise"""
        with self.assertRaises(Exception):
            sever_handler.JumanppHnadler(jumanpp_command='hoge',
                                         timeout_second=1)

    def test_jumanpp_process_handler_huge_request(self):
        """It tests the case where a user sends too much request"""
        input_huge_request = [self.test_senetence] * 100
        jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command)
        try:
            seq_result_jumanpp_analysis = [jumanpp_process_handler.query(input_string=sentence)
                                           for sentence in input_huge_request]
            self.assertTrue(isinstance(seq_result_jumanpp_analysis, list))
        finally:
            # Previously the process was left running after this test.
            jumanpp_process_handler.stop_process()


if __name__ == '__main__':
    unittest.main()

================================================
FILE: test/docker-compose-dev.yml
================================================
# 開発/test環境としてまとめてdocker環境を整えるためのcompose
version: '3'
services:
  dev_env_py2:
    build:
      context: ./
      dockerfile: Dockerfile-dev
    volumes:
    - ..:/codes/
    stdin_open: true
    tty: true
    command: bash -c "source /opt/conda/bin/activate p27 && pip install -r requirements_py2.txt"
  dev_env_py3:
    build:
      context: ./
      dockerfile: Dockerfile
    volumes:
    - ..:/codes/
    stdin_open: true
    tty: true
    command: bash -c "source /opt/conda/bin/activate p37 && pip install -r requirements_py3.txt"

================================================
FILE: test/docker-compose.yml
================================================
# 開発/test環境としてまとめてdocker環境を整えるためのcompose
version: '3'
services:
  test_env:
    build:
      context: ./
      dockerfile: Dockerfile
    volumes:
      - ..:/codes/
    stdin_open: true
    tty: true
    command: bash -c "juman -S && source /opt/conda/bin/activate p37 && cd /codes/ && python setup.py test && source deactivate && echo 'Python3 test done' && source /opt/conda/bin/activate p27 && cd /codes/ && python setup.py test && echo 'Python2 test done'"

================================================
FILE: test/requirements_py2.txt
================================================
pypandoc
future
six
jaconv>=0.2
pip>=8.1.0
pexpect
pyknp>=0.4.1
mecab-python
typing
neologdn
kytea

================================================
FILE: test/requirements_py3.txt
================================================
pypandoc
future
six
jaconv>=0.2
pip>=8.1.0
pexpect
pyknp
mecab-python3
neologdn
kytea

================================================
FILE: test/resources/test/userdict.csv
================================================
さくらまな,-1,-1,-400,名詞,一般,*,*,*,*,さくらまな,*,*,*

================================================
FILE: test/test_all.py
================================================
__author__ = 'kensuke-mi'

import sys
import unittest
import six
python_version = sys.version_info


def suite():
    """Build the default test suite for the running Python major version.

    The suite holds the filter, KyTea, MeCab and JUMAN wrapper tests. The
    test modules are imported lazily because the Python 2 and Python 3
    variants live in separate modules.

    Returns:
        unittest.TestSuite: the assembled suite (empty when the interpreter
        is neither Python 2 nor Python 3).
    """
    # unittest.makeSuite is deprecated and removed in Python 3.13; the default
    # loader collects the same test methods and also works on Python 2.7.
    load_tests_from = unittest.defaultTestLoader.loadTestsFromTestCase

    if six.PY3:
        from .test_filter_python3 import TestFilter
        from .test_mecab_wrapper_python3 import TestMecabWrapperPython3
        from .test_kytea_wrapper_python3 import TestKyteaWrapperPython3
        from .test_juman_wrapper_python3 import TestJumanWrapperPython3
        test_cases = (TestFilter,
                      TestKyteaWrapperPython3,
                      TestMecabWrapperPython3,
                      TestJumanWrapperPython3)
    elif six.PY2:
        from .test_filter_python2 import TestFilter
        from .test_mecab_wrapper_python2 import TestMecabWrapperPython2
        from .test_juman_wrapper_python2 import TestJumanWrapperPython2
        from .test_kytea_wrapper_python2 import TestKyteaWrapperPython2
        test_cases = (TestFilter,
                      TestKyteaWrapperPython2,
                      TestMecabWrapperPython2,
                      TestJumanWrapperPython2)
    else:
        test_cases = ()

    test_suite = unittest.TestSuite()
    for test_case in test_cases:
        test_suite.addTest(load_tests_from(test_case))

    return test_suite


def suite_with_jumanpp():
    """Build the default suite and add the Juman++ wrapper tests to it.

    Returns:
        unittest.TestSuite: the result of ``suite()`` extended with the
        Juman++ test case for the running Python major version.
    """
    load_tests_from = unittest.defaultTestLoader.loadTestsFromTestCase

    suite_obj = suite()
    # addTest returns None, so it must be called exactly once per test case;
    # the former nested addTest(addTest(...)) passed None to the outer call,
    # which raises TypeError.
    if six.PY3:
        from .test_jumanpp_wrapper_python3 import TestJumanppWrapperPython3
        suite_obj.addTest(load_tests_from(TestJumanppWrapperPython3))
    elif six.PY2:
        from .test_jumanpp_wrapper_python2 import TestJumanppWrapperPython2
        suite_obj.addTest(load_tests_from(TestJumanppWrapperPython2))

    return suite_obj

================================================
FILE: test/test_filter_python2.py
================================================
#! -*- coding: utf-8 -*-
import sys
import unittest
from JapaneseTokenizer.mecab_wrapper import MecabWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence, FilteredObject, TokenizedResult
import os
__author__ = 'kensuke-mi'


class TestFilter(unittest.TestCase):
    def setUp(self):
        """Prepare the sample sentence, stopwords and POS conditions."""
        self.test_senetence = u'紗倉 まなは、日本のAV女優みたいで、うつくしい。\nそこで、ぼくはその１枚のはなやかな作品を見たいと思った。'
        self.stopword = ['AV']
        self.pos_condition = [('名詞', '一般',), ('名詞', '固有名詞'), ('形容詞', '自立',), ('助詞', '格助詞', '引用')]
        self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')

    def test_filtering(self):
        # Tokenize with the ipadic dictionary, then filter by POS and stopwords.
        tokenizer = MecabWrapper(dictType='ipadic')
        tokenized = tokenizer.tokenize(sentence=self.test_senetence, is_feature=True)
        tokenized_sentence = tokenized.filter(pos_condition=self.pos_condition, stopwords=self.stopword)
        assert isinstance(tokenized_sentence, TokenizedSenetence)

        # POS patterns that must not survive the filter.
        seq_except_pos = [(u'動詞',), (u'名詞', u'代名詞'), (u'名詞', u'接尾')]
        # POS patterns of which at least one must match every surviving token.
        seq_match_pos = [(u'名詞',), (u'名詞', u'固有名詞',), (u'形容詞',), (u'形容詞', u'自立'),(u'助詞', u'格助詞', u'引用')]

        for token in tokenized_sentence.tokenized_objects:
            assert isinstance(token, TokenizedResult)

            pos_set = set(token.tuple_pos)
            for excluded_pos in seq_except_pos:
                self.assertFalse(set(excluded_pos).issubset(pos_set))
            self.assertTrue(any(set(expected_pos).issubset(pos_set) for expected_pos in seq_match_pos))


if __name__ == '__main__':
    unittest.main()

================================================
FILE: test/test_filter_python3.py
================================================
#! -*- coding: utf-8 -*-
import sys
import unittest
from JapaneseTokenizer.mecab_wrapper import MecabWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence, FilteredObject, TokenizedResult
import os
__author__ = 'kensuke-mi'


class TestFilter(unittest.TestCase):
    def setUp(self):
        """Prepare the sample sentence, stopwords and POS conditions."""
        self.test_senetence = '紗倉 まなは、日本のAV女優みたいで、うつくしい。そこで、ぼくはその１枚のはなやかな作品を見たいと思った。'
        self.stopword = ['AV', '女優']
        self.pos_condition = [('名詞', '一般',), ('名詞', '固有名詞'), ('形容詞', '自立',), ('助詞', '格助詞', '引用')]
        self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')

    def test_filtering(self):
        # Tokenize with the ipadic dictionary, then filter by POS and stopwords.
        tokenizer = MecabWrapper(dictType='ipadic')
        tokenized = tokenizer.tokenize(sentence=self.test_senetence, is_feature=True)
        tokenized_sentence = tokenized.filter(pos_condition=self.pos_condition, stopwords=self.stopword)
        assert isinstance(tokenized_sentence, TokenizedSenetence)

        # POS patterns that must not survive the filter.
        seq_except_pos = [('動詞',), ('名詞', '代名詞'), ('名詞', '接尾')]
        # POS patterns of which at least one must match every surviving token.
        seq_match_pos = [('名詞',), ('名詞', '固有名詞',), ('形容詞',), ('形容詞', '自立'),('助詞', '格助詞', '引用')]

        for token in tokenized_sentence.tokenized_objects:
            assert isinstance(token, TokenizedResult)

            pos_set = set(token.tuple_pos)
            for excluded_pos in seq_except_pos:
                self.assertFalse(set(excluded_pos).issubset(pos_set))
            self.assertTrue(any(set(expected_pos).issubset(pos_set) for expected_pos in seq_match_pos))

            # No stopword may remain after filtering.
            self.assertTrue(token.word_stem not in self.stopword)


if __name__ == '__main__':
    unittest.main()

================================================
FILE: test/test_juman_wrapper_python2.py
================================================
#-*- encoding: utf-8 -*-
# this test file does not work under pycharm
# do your test with command line
from __future__ import absolute_import
from __future__ import division
from future.utils import string_types, text_type
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
from JapaneseTokenizer.juman_wrapper import JumanWrapper
import pyknp
import unittest
import sys
import codecs
import logging
sys.stdin = codecs.getreader('utf_8')(sys.stdin)
sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
logger = logging.getLogger(__file__)
logger.level = logging.INFO


class TestJumanWrapperPython2(unittest.TestCase):
    """Tests for ``JumanWrapper`` on Python 2.

    Covers plain tokenization, POS filtering, stopword filtering and server
    mode. ``unittest`` assertion methods are used instead of bare ``assert``
    so the checks are not stripped by ``python -O`` and report the offending
    value when they fail.
    """

    def setUp(self):
        # Sentence shared by the tokenizer tests of this class.
        self.test_sentence = u"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。"

    def _check_token_objects(self, token_objects):
        """Log every ``TokenizedResult`` in *token_objects* and assert its attribute types."""
        for t_obj in token_objects:
            self.assertIsInstance(t_obj, TokenizedResult)
            logger.debug(u"word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format(
                t_obj.word_surface,
                t_obj.word_stem,
                ' '.join(t_obj.tuple_pos),
                t_obj.misc_info
            ))
            self.assertIsInstance(t_obj.word_surface, string_types)
            self.assertIsInstance(t_obj.word_stem, string_types)
            self.assertIsInstance(t_obj.tuple_pos, tuple)
            self.assertIsInstance(t_obj.misc_info, dict)

    def _check_stem_pos_pairs(self, stem_pos_pairs):
        """Assert that every item of *stem_pos_pairs* is a ``(stem, pos_tuple)`` tuple."""
        for stem_posTuple in stem_pos_pairs:
            self.assertIsInstance(stem_posTuple, tuple)
            word_stem = stem_posTuple[0]
            word_posTuple = stem_posTuple[1]
            self.assertIsInstance(word_stem, string_types)
            self.assertIsInstance(word_posTuple, tuple)

            logger.debug(u'word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))

    def test_juman_wrapper(self):
        """Call pyknp's ``Juman`` directly and check that it yields ``pyknp.Morpheme`` objects."""
        try:
            from pyknp import Juman

            juman = Juman(command='juman', jumanpp=False)
            result = juman.analysis(u"これはペンです。")
            logger.debug(','.join(mrph.midasi for mrph in result))

            for mrph in result.mrph_list():
                self.assertIsInstance(mrph, pyknp.Morpheme)
                logger.debug(
                    u"見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s",
                    mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui,
                    mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname)
        except ImportError:
            # Report the missing dependency as a skip rather than a silent pass.
            self.skipTest('skip test_juman_wrapper')

    def test_tokenize(self):
        """Check ``JumanWrapper.tokenize`` and the list conversion of its result."""
        logger.debug(u'Tokenize Test')
        juman_wrapper = JumanWrapper()
        token_objects = juman_wrapper.tokenize(sentence=self.test_sentence,
                                               return_list=False,
                                               is_feature=True)

        self.assertIsInstance(token_objects, TokenizedSenetence)
        self._check_token_objects(token_objects.tokenized_objects)

        token_objects_list = token_objects.convert_list_object()
        self.assertIsInstance(token_objects_list, list)
        logger.debug('-'*30)
        self._check_stem_pos_pairs(token_objects_list)

    def test_filter_pos(self):
        """Check that filtering with the POS condition ``名詞`` keeps only nouns."""
        logger.debug(u'Filtering Test. POS condition is only 名詞')
        juman_wrapper = JumanWrapper()
        token_objects = juman_wrapper.tokenize(sentence=self.test_sentence,
                                               return_list=False,
                                               is_feature=True)
        pos_condition = [(u'名詞', )]
        filtered_result = juman_wrapper.filter(
            parsed_sentence=token_objects,
            pos_condition=pos_condition
        )

        self.assertIsInstance(filtered_result, FilteredObject)
        self._check_token_objects(filtered_result.tokenized_objects)
        for t_obj in filtered_result.tokenized_objects:
            self.assertEqual(t_obj.tuple_pos[0], u'名詞')

        logger.debug('-'*30)
        self._check_stem_pos_pairs(filtered_result.convert_list_object())

    def test_stopwords(self):
        """Check that no stem listed as a stopword survives the filter."""
        stopword = [u'ＡＶ', u'女優']
        logger.debug(u'Stopwords Filtering Test. Stopwords is {}'.format(u','.join(stopword)))
        juman_wrapper = JumanWrapper()
        token_objects = juman_wrapper.tokenize(sentence=self.test_sentence,
                                               return_list=False,
                                               is_feature=True)
        filtered_result = juman_wrapper.filter(
            parsed_sentence=token_objects,
            stopwords=stopword
        )

        stem_pos_pairs = filtered_result.convert_list_object()
        self._check_stem_pos_pairs(stem_pos_pairs)
        for stem_posTuple in stem_pos_pairs:
            self.assertNotIn(stem_posTuple[0], stopword)

    def test_juman_server_mode(self):
        """Tokenize through a JUMAN server listening on localhost:32000.

        The test is skipped when nothing accepts connections on that port,
        instead of failing with a connection error.
        """
        # Function-scope import: this module does not import socket at file level.
        import socket

        host, port = 'localhost', 32000
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            probe.connect((host, port))
        except socket.error:
            self.skipTest('Skip server mode test because JUMAN server is not working.')
        finally:
            # Always release the probe socket, also when connect() fails.
            probe.close()

        juman_wrapper = JumanWrapper(server=host, port=port)
        token_objects = juman_wrapper.tokenize(sentence=self.test_sentence,
                                               return_list=False,
                                               is_feature=True)
        self.assertIsInstance(token_objects, TokenizedSenetence)

        list_tokens = juman_wrapper.tokenize(sentence=self.test_sentence,
                                             return_list=True,
                                             is_feature=True)
        self.assertIsInstance(list_tokens, list)


# Run the tests of this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

================================================
FILE: test/test_juman_wrapper_python3.py
================================================
#-*- encoding: utf-8 -*-
# this test file does not work under pycharm
# do your test with command line
from pyknp import Juman
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
from JapaneseTokenizer.juman_wrapper import JumanWrapper
import pyknp
import unittest
import os
import logging
import socket
# Module-level logger; DEBUG records emitted by the tests are dropped at INFO.
logger = logging.getLogger(__file__)
# Use the public setter rather than assigning ``logger.level`` directly, so the
# level is validated and the logging module's cached state is refreshed.
logger.setLevel(logging.INFO)


class TestJumanWrapperPython3(unittest.TestCase):
    """Tests for ``JumanWrapper`` on Python 3.

    Covers plain tokenization, POS filtering, stopword filtering and server
    mode. ``unittest`` assertion methods are used instead of bare ``assert``
    so the checks are not stripped by ``python -O`` and report the offending
    value when they fail.
    """

    def setUp(self):
        # Prefer the absolute path (this is under MacOSX10); otherwise fall
        # back to the bare command name.
        local_install = '/usr/local/bin/juman'
        self.path_to_juman_command = local_install if os.path.exists(local_install) else 'juman'
        # Sentence shared by the tokenizer tests of this class.
        self.test_sentence = "紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。"

    def _check_token_objects(self, token_objects):
        """Log every ``TokenizedResult`` in *token_objects* and assert its attribute types."""
        for t_obj in token_objects:
            self.assertIsInstance(t_obj, TokenizedResult)
            logger.debug("word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format(
                t_obj.word_surface,
                t_obj.word_stem,
                ' '.join(t_obj.tuple_pos),
                t_obj.misc_info
            ))
            self.assertIsInstance(t_obj.word_surface, str)
            self.assertIsInstance(t_obj.word_stem, str)
            self.assertIsInstance(t_obj.tuple_pos, tuple)
            self.assertIsInstance(t_obj.misc_info, dict)

    def _check_stem_pos_pairs(self, stem_pos_pairs):
        """Assert that every item of *stem_pos_pairs* is a ``(stem, pos_tuple)`` tuple."""
        for stem_posTuple in stem_pos_pairs:
            self.assertIsInstance(stem_posTuple, tuple)
            word_stem = stem_posTuple[0]
            word_posTuple = stem_posTuple[1]
            self.assertIsInstance(word_stem, str)
            self.assertIsInstance(word_posTuple, tuple)

            logger.debug('word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))

    def test_juman_wrapper(self):
        """Call pyknp's ``Juman`` directly and check that it yields ``pyknp.Morpheme`` objects."""
        try:
            juman = Juman(command=self.path_to_juman_command)
            result = juman.analysis("これはペンです。")
            logger.debug(','.join(mrph.midasi for mrph in result))

            for mrph in result.mrph_list():
                self.assertIsInstance(mrph, pyknp.Morpheme)
                logger.debug(
                    "見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s",
                    mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui,
                    mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname)
        except ImportError:
            # Report the missing dependency as a skip rather than a silent pass.
            self.skipTest('skip test_juman_wrapper')

    def test_tokenize(self):
        """Check ``JumanWrapper.tokenize`` and the list conversion of its result."""
        logger.debug('Tokenize Test')
        juman_wrapper = JumanWrapper(command=self.path_to_juman_command)
        token_objects = juman_wrapper.tokenize(sentence=self.test_sentence,
                                               return_list=False,
                                               is_feature=True)

        self.assertIsInstance(token_objects, TokenizedSenetence)
        self._check_token_objects(token_objects.tokenized_objects)

        token_objects_list = token_objects.convert_list_object()
        self.assertIsInstance(token_objects_list, list)
        logger.debug('-'*30)
        self._check_stem_pos_pairs(token_objects_list)

    def test_filter_pos(self):
        """Test POS filtering: with the condition ``名詞`` only nouns may remain."""
        logger.debug('Filtering Test. POS condition is only 名詞')
        juman_wrapper = JumanWrapper(command=self.path_to_juman_command)
        token_objects = juman_wrapper.tokenize(sentence=self.test_sentence,
                                               return_list=False,
                                               is_feature=True)
        pos_condition = [('名詞', )]
        filtered_result = juman_wrapper.filter(
            parsed_sentence=token_objects,
            pos_condition=pos_condition
        )

        self.assertIsInstance(filtered_result, FilteredObject)
        self._check_token_objects(filtered_result.tokenized_objects)
        for t_obj in filtered_result.tokenized_objects:
            self.assertEqual(t_obj.tuple_pos[0], '名詞')

        logger.debug('-'*30)
        self._check_stem_pos_pairs(filtered_result.convert_list_object())

    def test_stopwords(self):
        """Test stopword removal: no stem listed as a stopword may survive the filter."""
        stopword = ['ＡＶ', '女優']
        logger.debug('Stopwords Filtering Test. Stopwords is {}'.format(','.join(stopword)))
        juman_wrapper = JumanWrapper(command=self.path_to_juman_command)
        token_objects = juman_wrapper.tokenize(sentence=self.test_sentence,
                                               return_list=False,
                                               is_feature=True)
        filtered_result = juman_wrapper.filter(
            parsed_sentence=token_objects,
            stopwords=stopword
        )

        stem_pos_pairs = filtered_result.convert_list_object()
        self._check_stem_pos_pairs(stem_pos_pairs)
        for stem_posTuple in stem_pos_pairs:
            self.assertNotIn(stem_posTuple[0], stopword)

    def test_juman_severmode(self):
        """* What you can do
        - Run the test of juman server mode

        The test is skipped when nothing accepts connections on localhost:32000.
        """
        logger.debug('Tokenize test with server mode')
        HOST = 'localhost'
        PORT = 32000
        # check socket
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            probe.connect((HOST, PORT))
        except socket.error:
            self.skipTest("SKip server mode test because server is not working.")
        finally:
            # Always release the probe socket, also when connect() fails.
            probe.close()

        juman_wrapper = JumanWrapper(command=self.path_to_juman_command, server=HOST, port=PORT)
        token_objects = juman_wrapper.tokenize(sentence=self.test_sentence,
                                               return_list=False,
                                               is_feature=True)
        self.assertIsInstance(token_objects, TokenizedSenetence)

        # Second round with a fresh wrapper and a sentence mixing several scripts.
        test_sentence = "ペルシア語（ペルシアご、ペルシア語: فارسی‌‎, پارسی‌; Fārsī, Pārsī）は、イランを中心とする中東地域で話される言語。"
        juman_wrapper = JumanWrapper(command=self.path_to_juman_command, server=HOST, port=PORT)
        list_token = juman_wrapper.tokenize(sentence=test_sentence,
                                            return_list=True,
                                            is_feature=True)
        self.assertIsInstance(list_token, list)


# Run the tests of this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

================================================
FILE: test/test_jumanpp_wrapper_python2.py
================================================
#-*- encoding: utf-8 -*-
# this test file does not work under pycharm
# do your test with command line
from pyknp import Juman
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
from JapaneseTokenizer.jumanpp_wrapper.jumanpp_wrapper import JumanppWrapper, JumanppClient
from JapaneseTokenizer.common.sever_handler import JumanppHnadler
import pyknp
import socket
import unittest
import os
import logging
# Module-level logger for this test module, set to INFO.
logger = logging.getLogger(__file__)
# Use the public setter rather than assigning ``logger.level`` directly, so the
# level is validated and the logging module's cached state is refreshed.
logger.setLevel(logging.INFO)


class TestJumanppWrapperPython2(unittest.TestCase):
    """Tests for ``JumanppWrapper`` / ``JumanppClient`` on Python 2.

    Server-mode tests talk to a Juman++ server on localhost:12000 and are
    skipped when that server is not reachable. Local-mode tests start the
    ``jumanpp`` command with ``is_use_pyknp=False``.
    """

    HOST = 'localhost'
    PORT = 12000

    def setUp(self):
        # Prefer the absolute path (this is under MacOSX10); otherwise fall
        # back to the bare command name.
        local_install = '/usr/local/bin/jumanpp'
        self.path_to_juman_command = local_install if os.path.exists(local_install) else 'jumanpp'
        self.test_sentence = u'外国人参政権を欲しい。'

    def _skip_unless_server_alive(self):
        """Skip the running test unless a TCP server accepts connections on HOST:PORT."""
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            probe.connect((self.HOST, self.PORT))
        except socket.error:
            self.skipTest("SKip server mode test because server is not working.")
        finally:
            # Always release the probe socket, also when connect() fails.
            probe.close()

    def test_JumanppClient(self):
        """Send one query through ``JumanppClient``; the call must not raise."""
        self._skip_unless_server_alive()
        client_obj = JumanppClient(hostname=self.HOST, port=self.PORT)
        client_obj.query(sentence=self.test_sentence, pattern=r'EOS')

    def test_jumanpp_servermode(self):
        """Tokenize in server mode and check every supported return shape."""
        self._skip_unless_server_alive()
        jumanpp_tokenizer = JumanppWrapper(server=self.HOST, port=self.PORT)

        ### test with list return object ###
        list_tokens = jumanpp_tokenizer.tokenize(sentence=self.test_sentence, return_list=True)
        self.assertIsInstance(list_tokens, list)

        ### test with TokenizedSenetence return object ###
        tokenized_obj = jumanpp_tokenizer.tokenize(sentence=self.test_sentence, return_list=False)
        self.assertIsInstance(tokenized_obj, TokenizedSenetence)

        ### test with TokenizedSenetence return object and filter by chain expression ###
        # The condition is a unicode literal, like the POS conditions used by
        # the other Python 2 test modules; a plain byte string would not
        # compare equal to unicode POS values.
        pos_condtion = [(u'名詞', )]
        filtered_res = jumanpp_tokenizer.tokenize(
            sentence=self.test_sentence, return_list=False).filter(pos_condition=pos_condtion)
        self.assertIsInstance(filtered_res, FilteredObject)
        self.assertIsInstance(filtered_res.convert_list_object(), list)

    def test_jumanpp_servermode_stress(self):
        """Send the same sentence 1000 times through one server-mode tokenizer."""
        self._skip_unless_server_alive()
        jumanpp_tokenizer = JumanppWrapper(server=self.HOST, port=self.PORT)
        for _ in range(0, 1000):
            list_tokens = jumanpp_tokenizer.tokenize(sentence=self.test_sentence, return_list=True)
            self.assertIsInstance(list_tokens, list)
            # NOTE(review): this checks the input sentence, not ``list_tokens``,
            # so it can never fail - confirm whether the tokens were meant.
            self.assertIn(u'外国', self.test_sentence)
        del jumanpp_tokenizer

    def test_jumanpp_localmode_pyexpect(self):
        """Tokenize through a locally started process (``is_use_pyknp=False``)."""
        jumanpp_tokenizer = JumanppWrapper(command=self.path_to_juman_command, is_use_pyknp=False)
        self.assertIsInstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)
        list_tokens = jumanpp_tokenizer.tokenize(sentence=self.test_sentence, return_list=True)
        self.assertIsInstance(list_tokens, list)

        # A second, independent wrapper instance must work just as well.
        jumanpp_tokenizer = JumanppWrapper(command=self.path_to_juman_command, is_use_pyknp=False)
        self.assertIsInstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)
        tokenized_obj = jumanpp_tokenizer.tokenize(sentence=self.test_sentence, return_list=False)
        self.assertIsInstance(tokenized_obj, TokenizedSenetence)

    def test_jumanpp_huge_amount_text(self):
        """Process many sentences via pexpect, restarting the process during the run."""
        logger.info('under testing of processing huge amount of text...')
        seq_test_sentence = [self.test_sentence] * 500
        jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)
        self.assertIsInstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)
        for i, test_s in enumerate(seq_test_sentence):
            tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_s)
            self.assertIsInstance(tokenized_obj, TokenizedSenetence)
            if i != 0 and i % 100 == 0:
                # Forcibly kill the process and restart it.
                logger.info('It forces stop unix process.')
                jumanpp_tokenizer.jumanpp_obj.restart_process()


# Run the tests of this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

================================================
FILE: test/test_jumanpp_wrapper_python3.py
================================================
#-*- encoding: utf-8 -*-
# this test file does not work under pycharm
# do your test with command line
from pyknp import Juman
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
from JapaneseTokenizer.jumanpp_wrapper.jumanpp_wrapper import JumanppWrapper, JumanppClient
from JapaneseTokenizer.common.sever_handler import JumanppHnadler
import pyknp
import unittest
import os
import logging
import socket
# Module-level logger for this test module, set to INFO.
logger = logging.getLogger(__file__)
# Use the public setter rather than assigning ``logger.level`` directly, so the
# level is validated and the logging module's cached state is refreshed.
logger.setLevel(logging.INFO)


class TestJumanppWrapperPython3(unittest.TestCase):
    """Tests for ``JumanppWrapper`` / ``JumanppClient`` on Python 3.

    Server-mode tests talk to a Juman++ server on localhost:12000 and are
    skipped when that server is not reachable. Local-mode tests start the
    ``jumanpp`` command with ``is_use_pyknp=False``.
    """

    HOST = 'localhost'
    PORT = 12000

    def setUp(self):
        # Prefer the absolute path (this is under MacOSX10); otherwise fall
        # back to the bare command name.
        local_install = '/usr/local/bin/jumanpp'
        self.path_to_juman_command = local_install if os.path.exists(local_install) else 'jumanpp'
        self.test_sentence = '外国人参政権を欲しい。'

    def _skip_unless_server_alive(self):
        """Skip the running test unless a TCP server accepts connections on HOST:PORT."""
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            probe.connect((self.HOST, self.PORT))
        except socket.error:
            self.skipTest('SKip server mode test because server is not working.')
        finally:
            # Always release the probe socket, also when connect() fails.
            probe.close()

    def test_JumanppClient(self):
        """Send one query through ``JumanppClient``; the call must not raise."""
        self._skip_unless_server_alive()
        client_obj = JumanppClient(hostname=self.HOST, port=self.PORT)
        client_obj.query(sentence=self.test_sentence, pattern=rb'EOS')

    def test_jumanpp_servermode(self):
        """Tokenize in server mode and check every supported return shape."""
        self._skip_unless_server_alive()
        jumanpp_tokenizer = JumanppWrapper(server=self.HOST, port=self.PORT)

        ### test with list return object ###
        list_tokens = jumanpp_tokenizer.tokenize(sentence=self.test_sentence, return_list=True)
        self.assertIsInstance(list_tokens, list)

        ### test with TokenizedSenetence return object ###
        tokenized_obj = jumanpp_tokenizer.tokenize(sentence=self.test_sentence, return_list=False)
        self.assertIsInstance(tokenized_obj, TokenizedSenetence)

        ### test with TokenizedSenetence return object and filter by chain expression ###
        pos_condtion = [('名詞',)]
        filtered_res = jumanpp_tokenizer.tokenize(
            sentence=self.test_sentence, return_list=False).filter(pos_condition=pos_condtion)
        self.assertIsInstance(filtered_res, FilteredObject)
        self.assertIsInstance(filtered_res.convert_list_object(), list)

    def test_jumanpp_servermode_stress(self):
        """Send the same sentence 1000 times through one server-mode tokenizer."""
        self._skip_unless_server_alive()
        jumanpp_tokenizer = JumanppWrapper(server=self.HOST, port=self.PORT)
        for _ in range(0, 1000):
            list_tokens = jumanpp_tokenizer.tokenize(sentence=self.test_sentence, return_list=True)
            self.assertIsInstance(list_tokens, list)
            # NOTE(review): this checks the input sentence, not ``list_tokens``,
            # so it can never fail - confirm whether the tokens were meant.
            self.assertIn('外国', self.test_sentence)
        del jumanpp_tokenizer

    def test_jumanpp_localmode_pyexpect(self):
        """Test invoking the process via pexpect (``is_use_pyknp=False``)."""
        jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)
        self.assertIsInstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)
        list_tokens = jumanpp_tokenizer.tokenize(sentence=self.test_sentence, return_list=True)
        self.assertIsInstance(list_tokens, list)

        # A second, independent wrapper instance must work just as well.
        jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)
        self.assertIsInstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)
        tokenized_obj = jumanpp_tokenizer.tokenize(sentence=self.test_sentence, return_list=False)
        self.assertIsInstance(tokenized_obj, TokenizedSenetence)

    def test_jumanpp_huge_amount_text(self):
        """Process many sentences via pexpect, restarting the process during the run."""
        logger.info('under testing of processing huge amount of text...')
        seq_test_sentence = [self.test_sentence] * 500
        jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)
        self.assertIsInstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)
        for i, test_s in enumerate(seq_test_sentence):
            tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_s)
            self.assertIsInstance(tokenized_obj, TokenizedSenetence)
            if i != 0 and i % 100 == 0:
                # Forcibly kill the process and restart it.
                logger.info('It forces stop unix process.')
                jumanpp_tokenizer.jumanpp_obj.restart_process()


# Run the tests of this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

================================================
FILE: test/test_kytea_wrapper_python2.py
================================================
# -*- coding: utf-8 -*-
from JapaneseTokenizer.kytea_wrapper import KyteaWrapper
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
import unittest

class TestKyteaWrapperPython2(unittest.TestCase):
    """Tests for ``KyteaWrapper`` on Python 2: tokenization, POS and stopword filtering.

    The tests do not print progress banners: printing non-ASCII ``unicode``
    under Python 2 raises ``UnicodeEncodeError`` when stdout is not a UTF-8
    stream, which would fail a test for reasons unrelated to the tokenizer.
    """

    def setUp(self):
        # Sentence shared by every test of this class.
        self.test_sentence = u"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。"

    def _check_stem_pos_pairs(self, stem_pos_pairs):
        """Assert that every item of *stem_pos_pairs* is a ``(unicode stem, pos_tuple)`` tuple."""
        for stem_posTuple in stem_pos_pairs:
            self.assertIsInstance(stem_posTuple, tuple)
            word_stem = stem_posTuple[0]
            word_posTuple = stem_posTuple[1]
            self.assertIsInstance(word_stem, unicode)
            self.assertIsInstance(word_posTuple, tuple)

    def test_tokenization(self):
        """Tokenize one sentence and check the container and element types."""
        kytea_wrapper = KyteaWrapper()
        tokenized_result = kytea_wrapper.tokenize(
            sentence=self.test_sentence,
            normalize=True,
            return_list=False,
            is_feature=True
        )
        self.assertIsInstance(tokenized_result, TokenizedSenetence)
        for t_obj in tokenized_result.tokenized_objects:
            self.assertIsInstance(t_obj, TokenizedResult)

        tokenized_result_list = tokenized_result.convert_list_object()
        self.assertIsInstance(tokenized_result_list, list)
        for t_obj_tuple in tokenized_result_list:
            self.assertIsInstance(t_obj_tuple, tuple)

    def test_filter_pos(self):
        """Filtering test: with the POS condition ``名詞`` only nouns may remain."""
        kytea_wrapper = KyteaWrapper()
        tokenized_result = kytea_wrapper.tokenize(
            sentence=self.test_sentence,
            normalize=True,
            return_list=False,
            is_feature=True
        )

        pos_condition = [(u'名詞', )]
        filtered_result = kytea_wrapper.filter(
            parsed_sentence=tokenized_result,
            pos_condition=pos_condition
        )

        self.assertIsInstance(filtered_result, FilteredObject)
        for t_obj in filtered_result.tokenized_objects:
            self.assertIsInstance(t_obj, TokenizedResult)
            self.assertIsInstance(t_obj.word_surface, unicode)
            self.assertIsInstance(t_obj.word_stem, unicode)
            self.assertIsInstance(t_obj.tuple_pos, tuple)
            self.assertIsInstance(t_obj.misc_info, dict)

            self.assertEqual(t_obj.tuple_pos[0], u'名詞')

        self._check_stem_pos_pairs(filtered_result.convert_list_object())

    def test_stopwords(self):
        """Stopword filtering test: no stem listed as a stopword may survive the filter."""
        stopword = [u'女優']
        kytea_wrapper = KyteaWrapper()
        token_objects = kytea_wrapper.tokenize(sentence=self.test_sentence,
                                               return_list=False,
                                               is_feature=True)
        filtered_result = kytea_wrapper.filter(
            parsed_sentence=token_objects,
            stopwords=stopword
        )

        stem_pos_pairs = filtered_result.convert_list_object()
        self._check_stem_pos_pairs(stem_pos_pairs)
        for stem_posTuple in stem_pos_pairs:
            self.assertNotIn(stem_posTuple[0], stopword)


# Run the tests of this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()

================================================
FILE: test/test_kytea_wrapper_python3.py
================================================
# -*- coding: utf-8 -*-
from JapaneseTokenizer.kytea_wrapper import KyteaWrapper
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
import unittest

class TestKyteaWrapperPython3(unittest.TestCase):
    """Exercise ``KyteaWrapper`` tokenization and filtering under Python 3."""

    def setUp(self):
        """No shared fixture is needed; every test builds its own wrapper."""
        pass

    def test_tokenization(self):
        """Tokenize one sentence and check the container and element types."""
        text = "紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。"
        tokenizer = KyteaWrapper()
        parsed = tokenizer.tokenize(sentence=text, normalize=True, return_list=False, is_feature=True)
        assert isinstance(parsed, TokenizedSenetence)
        assert all(isinstance(token, TokenizedResult) for token in parsed.tokenized_objects)

        as_list = parsed.convert_list_object()
        assert isinstance(as_list, list)
        assert all(isinstance(item, tuple) for item in as_list)

    def test_filter_pos(self):
        """Filter with the POS condition ``名詞`` and check that only nouns remain."""
        text = "紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。"
        tokenizer = KyteaWrapper()
        parsed = tokenizer.tokenize(sentence=text, normalize=True, return_list=False, is_feature=True)
        nouns_only = tokenizer.filter(parsed_sentence=parsed, pos_condition=[('名詞', )])

        assert isinstance(nouns_only, FilteredObject)
        for token in nouns_only.tokenized_objects:
            assert isinstance(token, TokenizedResult)
            assert isinstance(token.word_surface, str)
            assert isinstance(token.word_stem, str)
            assert isinstance(token.tuple_pos, tuple)
            assert isinstance(token.misc_info, dict)

            assert token.tuple_pos[0] == '名詞'

        for pair in nouns_only.convert_list_object():
            assert isinstance(pair, tuple)
            stem, pos = pair[0], pair[1]
            assert isinstance(stem, str)
            assert isinstance(pos, tuple)

    def test_stopwords(self):
        """Filter with a stopword list and check that no listed stem remains."""
        stopword = ['女優']
        text = "紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。"
        tokenizer = KyteaWrapper()
        parsed = tokenizer.tokenize(sentence=text, return_list=False, is_feature=True)
        without_stopwords = tokenizer.filter(parsed_sentence=parsed, stopwords=stopword)

        pairs = without_stopwords.convert_list_object()
        for pair in pairs:
            assert isinstance(pair, tuple)
            stem, pos = pair[0], pair[1]
            assert isinstance(stem, str)
            assert isinstance(pos, tuple)
        # Evaluated after the type checks, like a flag collected during the loop.
        assert not any(pair[0] in stopword for pair in pairs)


# Run the tests of this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()


================================================
FILE: test/test_mecab_wrapper_python2.py
================================================
#! -*- coding: utf-8 -*-
__author__ = 'kensuke-mi'

import sys
import unittest
from JapaneseTokenizer.mecab_wrapper.mecab_wrapper import MecabWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence
from six import string_types
import os
python_version = sys.version_info


class TestMecabWrapperPython2(unittest.TestCase):
    """Tests for ``MecabWrapper`` using ``unicode`` fixtures.

    String checks go through ``six.string_types`` so the same assertion is
    valid on either interpreter.
    """

    def setUp(self):
        """Prepare the sample sentences and the path to the user dictionary CSV."""
        self.test_senetence = u'紗倉 まな（さくらまな、1993年3月23日 - ）は、日本のAV女優。'
        self.test_sentence2 = u'午前零時。午前3時。3時。'
        self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')

    def _check_tokenized_sentence(self, parsed_obj):
        """Assert that ``parsed_obj`` is a ``TokenizedSenetence`` yielding a list of strings."""
        # assertTrue(obj, cls) would treat cls as the failure message and only
        # test truthiness, so the type is checked explicitly.
        self.assertIsInstance(parsed_obj, TokenizedSenetence)
        morphs = parsed_obj.convert_list_object()
        self.assertIsInstance(morphs, list)
        self.assertTrue(all(isinstance(mrph, string_types) for mrph in morphs))

    def test_neologd_parse(self):
        """* Test case
        - Confirm that sentences are tokenized with the neologd dictionary
        """
        mecab_obj = MecabWrapper(dictType='neologd')
        for sentence in (self.test_senetence, self.test_sentence2):
            parsed_obj = mecab_obj.tokenize(sentence=sentence)
            self._check_tokenized_sentence(parsed_obj)

    def test_default_parse(self):
        """* Test case
        - Confirm the behaviour with the ipadic dictionary
        """
        dictType = "ipadic"
        mecab_obj = MecabWrapper(dictType=dictType)
        self.assertIsInstance(mecab_obj, MecabWrapper)
        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)
        self.assertIsInstance(parsed_obj, list)
        # string_types is the right string base type on both Python 2 and 3,
        # so no interpreter-version branch is needed.
        for morph in parsed_obj:
            self.assertIsInstance(morph, string_types)

    def test_init_userdict(self):
        """* Test case
        - Confirm that a word from the user dictionary appears as one token
        """
        mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict)
        self.assertIsInstance(mecab_obj, MecabWrapper)
        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)
        self.assertIn(u'さくらまな', parsed_obj)

    def test_parse_jumandic(self):
        """* Test case
        - Constructing the wrapper with dictType='jumandic' must raise
        """
        with self.assertRaises(Exception):
            MecabWrapper(dictType='jumandic')

    def test_init_alldict(self):
        """* Test case
        - Constructing the wrapper with dictType='all' and a user dictionary must raise
        """
        with self.assertRaises(Exception):
            MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict)


# Run the test cases of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()


================================================
FILE: test/test_mecab_wrapper_python3.py
================================================
#! -*- coding: utf-8 -*-
__author__ = 'kensuke-mi'

import sys
import unittest
from JapaneseTokenizer.mecab_wrapper.mecab_wrapper import MecabWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence
import os
python_version = sys.version_info


class TestMecabWrapperPython3(unittest.TestCase):
    """Tests for ``MecabWrapper`` using ``str`` fixtures."""

    def setUp(self):
        """Prepare the sample sentences and the path to the user dictionary CSV."""
        self.test_senetence = '紗倉 まな（さくらまな、1993年3月23日 - ）は、日本のAV女優。'
        self.test_sentence2 = '午前零時。午前3時。3時。'
        self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')

    def _check_tokenized_sentence(self, parsed_obj):
        """Assert that ``parsed_obj`` is a ``TokenizedSenetence`` yielding a list of ``str``."""
        # assertTrue(obj, cls) would treat cls as the failure message and only
        # test truthiness, so the type is checked explicitly.
        self.assertIsInstance(parsed_obj, TokenizedSenetence)
        morphs = parsed_obj.convert_list_object()
        self.assertIsInstance(morphs, list)
        self.assertTrue(all(isinstance(mrph, str) for mrph in morphs))

    def test_neologd_parse(self):
        """Tokenize both sample sentences with the neologd dictionary."""
        mecab_obj = MecabWrapper(dictType='neologd')
        for sentence in (self.test_senetence, self.test_sentence2):
            parsed_obj = mecab_obj.tokenize(sentence=sentence)
            self._check_tokenized_sentence(parsed_obj)

    def test_default_parse(self):
        """Tokenize both sample sentences with ipadic and expect lists of ``str``."""
        dictType = "ipadic"
        mecab_obj = MecabWrapper(dictType=dictType)
        self.assertIsInstance(mecab_obj, MecabWrapper)

        for sentence in (self.test_senetence, self.test_sentence2):
            parsed_obj = mecab_obj.tokenize(sentence=sentence, return_list=True)
            self.assertIsInstance(parsed_obj, list)
            for morph in parsed_obj:
                self.assertIsInstance(morph, str)

    def test_parse_jumandic(self):
        """Tokenize with jumandic and look for its domain annotation."""
        mecab_obj = MecabWrapper(dictType='jumandic')
        self.assertIsInstance(mecab_obj, MecabWrapper)

        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
        self.assertIsInstance(parsed_obj, TokenizedSenetence)
        # NOTE(review): if no token has the stem '女優' this loop asserts
        # nothing -- confirm whether a missing token should fail the test.
        for tokenized_obj in parsed_obj.tokenized_objects:
            if tokenized_obj.word_stem == '女優':
                # ドメイン:文化・芸術 is special output only in Jumandic
                self.assertIn('ドメイン:文化・芸術', tokenized_obj.analyzed_line)

    def test_parse_userdic(self):
        """Placeholder; no checks are implemented yet."""
        pass

    def test_parse_dictionary_path(self):
        """Initialise the wrapper from an explicit dictionary path and tokenize."""
        path_neologd = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd'
        # Report the test as skipped instead of silently passing when the
        # dictionary is not installed at this location.
        if not os.path.exists(path_neologd):
            self.skipTest('dictionary directory not found: {}'.format(path_neologd))
        mecab_obj = MecabWrapper(dictType=None, path_dictionary=path_neologd)
        self.assertEqual(mecab_obj._path_dictionary, path_neologd)
        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
        self.assertIsInstance(parsed_obj, TokenizedSenetence)

    def test_init_userdict(self):
        """A word registered in the user dictionary must come out as one token."""
        mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict)
        self.assertIsInstance(mecab_obj, MecabWrapper)
        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
        self.assertIsInstance(parsed_obj, TokenizedSenetence)
        stems = [tokenized_obj.word_stem for tokenized_obj in parsed_obj.tokenized_objects]
        self.assertIn('さくらまな', stems)


# Run the test cases of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()


================================================
FILE: travis-mecab-install.sh
================================================
#!/bin/bash
# Download, build and install MeCab 0.996 together with the mecab-ipadic and
# mecab-jumandic dictionaries.
# from https://gist.github.com/dtan4/351d031bec0c3d45cd8f
# see also http://qiita.com/dtan4/items/c6a087666296fbd5fffb

# Every download and build step starts from this directory.
base_dir=$(pwd)

# --- MeCab ---
wget -O mecab-0.996.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE'
tar zxfv mecab-0.996.tar.gz
# Stop if the source tree is missing; otherwise configure/make/install would
# run in whatever directory we happen to be in.
cd mecab-0.996 || exit 1
./configure --enable-utf8-only
make
make check
sudo make install
sudo ldconfig

cd "$base_dir" || exit 1

# --- mecab-ipadic ---
wget -O mecab-ipadic-2.7.0-20070801.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM'
tar zxfv mecab-ipadic-2.7.0-20070801.tar.gz
cd mecab-ipadic-2.7.0-20070801 || exit 1
./configure --with-charset=utf8
make
sudo make install
sudo ldconfig

# Return to the base directory so the jumandic archive is not downloaded and
# unpacked inside the ipadic source tree.
cd "$base_dir" || exit 1

# --- mecab-jumandic ---
wget -O jumandic.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM'
tar zxfv jumandic.tar.gz
cd mecab-jumandic-7.0-20130310 || exit 1
./configure --with-charset=utf8
make
sudo make install
sudo ldconfig

# Remove the extracted source trees and the downloaded archives.
cd "$base_dir" || exit 1
rm -rf mecab-0.996 mecab-ipadic-2.7.0-20070801 mecab-jumandic-7.0-20130310
rm -f mecab-0.996.tar.gz mecab-ipadic-2.7.0-20070801.tar.gz jumandic.tar.gz