[
  {
    "path": ".gitignore",
    "content": ".idea/\nJapaneseTokenizer.egg-info/\nbuild/\ndist/\n*eggs/\npyknp.egg-info/\n.python-version\n*pyc\nmorphogySplitters/\nMykytea-python/\n.DS_Store\n*tox\n.cache/\npython/\npython2/"
  },
  {
    "path": ".travis.yml",
    "content": "language: python\npython:\n  - 2.7\n  - 3.5\naddons:\n  apt:\n    packages:\n    - git\n    - make\n    - curl\n    - xz-utils\n    - file\n    - pandoc\n    - libboost-all-dev\n    - language-pack-ja-base\n    - language-pack-ja\n    - ibus-mozc\n    - gcc-5\n    - g++-5\n    - build-essential\n    - swig\n    sources:\n    - ubuntu-toolchain-r-test\nbefore_install:\n  - sudo apt-get update -qq\n  - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1\n  - sudo update-locale LANG=ja_JP.UTF-8 LANGUAGE=\"ja_JP:ja\"\n  - mkdir ./target\n  - export CC=\"gcc-5\"\n  - export CXX=\"g++-5\"\n  - export CFLAGS=-std=c++11\n  - export CXXFLAGS=-std=c++11\n  - sudo bash travis-mecab-install.sh\n  - which mecab-config\n  - sudo make install\n  - git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git\n  - cd mecab-ipadic-neologd && echo yes | sudo ./bin/install-mecab-ipadic-neologd && cd ../\n  - sudo juman -S\ninstall:\n  - python --version\n  - python setup.py install\n  - pip install coveralls coverage nose\nscript:\n  - coverage run --source=JapaneseTokenizer setup.py test\nafter_success:\n  - coveralls\nnotifications:\n  email:\n    recipients:\n      - kensuke.mit@gmail.com\n    on_success: always\n    on_failure: always"
  },
  {
    "path": "JapaneseTokenizer/__init__.py",
    "content": "from JapaneseTokenizer.mecab_wrapper import MecabWrapper\nfrom JapaneseTokenizer.juman_wrapper import JumanWrapper\nfrom JapaneseTokenizer.datamodels import TokenizedSenetence\nfrom JapaneseTokenizer.datamodels import FilteredObject\nfrom JapaneseTokenizer.kytea_wrapper import KyteaWrapper\nfrom JapaneseTokenizer.jumanpp_wrapper import JumanppWrapper"
  },
  {
    "path": "JapaneseTokenizer/common/__init__.py",
    "content": "__author__ = 'kensuke-mi'\n"
  },
  {
    "path": "JapaneseTokenizer/common/juman_utils.py",
    "content": "from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence\nfrom typing import Tuple\nimport pyknp\nfrom six import text_type\n\n\"\"\"These functions are for utilization of Juman\"\"\"\n\n\ndef extract_morphological_information(mrph_object, is_feature, is_surface):\n    # type: (pyknp.Morpheme, bool, bool) -> TokenizedResult\n    \"\"\"This method extracts morphlogical information from token object.\n    \"\"\"\n    assert isinstance(mrph_object, pyknp.Morpheme)\n    assert isinstance(is_feature, bool)\n    assert isinstance(is_surface, bool)\n\n    surface = mrph_object.midasi\n    word_stem = mrph_object.genkei\n\n    tuple_pos = (mrph_object.hinsi, mrph_object.bunrui)\n\n    misc_info = {\n        'katuyou1': mrph_object.katuyou1,\n        'katuyou2': mrph_object.katuyou2,\n        'imis': mrph_object.imis,\n        'repname': mrph_object.repname\n    }\n\n    token_object = TokenizedResult(\n        node_obj=None,\n        tuple_pos=tuple_pos,\n        word_stem=word_stem,\n        word_surface=surface,\n        is_feature=is_feature,\n        is_surface=is_surface,\n        misc_info=misc_info\n    )\n\n    return token_object\n\n\ndef feature_parser(uni_feature, word_surface):\n    # type: (text_type, text_type) -> Tuple[Tuple[text_type, text_type, text_type], text_type]\n    \"\"\"\n    Parse the POS feature output by Mecab\n    :param uni_feature unicode:\n    :return ( (pos1, pos2, pos3), word_stem ):\n    \"\"\"\n    list_feature_items = uni_feature.split(',')\n    # if word has no feature at all\n    if len(list_feature_items) == 1: return ('*'), ('*')\n\n    pos1 = list_feature_items[0]\n    pos2 = list_feature_items[1]\n    pos3 = list_feature_items[2]\n    tuple_pos = (pos1, pos2, pos3)\n\n    # if without constraint(output is normal mecab dictionary like)\n    if len(list_feature_items) == 9:\n        word_stem = list_feature_items[6]\n    # if with constraint(output format depends on Usedict.txt)\n    else:\n       
 word_stem = word_surface\n\n    return tuple_pos, word_stem\n"
  },
  {
    "path": "JapaneseTokenizer/common/sever_handler.py",
    "content": "#! -*- coding: utf-8 -*-\nimport subprocess\nfrom subprocess import Popen, PIPE, STDOUT\nimport multiprocessing\n# socket object\nimport socket\n# logger\nfrom JapaneseTokenizer import init_logger\nimport logging\n# typing\nfrom typing import Union\n# else\nfrom six import text_type\nimport six\nimport pexpect\nimport shutil\nimport signal\nimport os\nlogger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))\n\n\nclass ProcessDownException(Exception):\n    pass\n\n\nclass UnixProcessHandler(object):\n    def __init__(self,\n                 command,\n                 option=None,\n                 pattern='EOS',\n                 timeout_second=10):\n        # type: (text_type,text_type,text_type,int)->None\n        \"\"\"* Get communication with unix process using pexpect module.\"\"\"\n        self.command = command\n        self.timeout_second = timeout_second\n        self.pattern = pattern\n        self.option = option\n        self.launch_process(command)\n\n    def __del__(self):\n        if hasattr(self, \"process_analyzer\"):\n            self.process_analyzer.kill(sig=9)\n\n    def launch_process(self, command):\n        # type: (Union[bytes,text_type])->None\n        \"\"\"* What you can do\n        - It starts process and keep it.\n        \"\"\"\n        if not self.option is None:\n            command_plus_option = self.command + \" \" + self.option\n        else:\n            command_plus_option = self.command\n\n        if six.PY3:\n            if shutil.which(command) is None:\n                raise Exception(\"No command at {}\".format(command))\n            else:\n                self.process_analyzer = pexpect.spawnu(command_plus_option)\n                self.process_id = self.process_analyzer.pid\n        else:\n            doc_command_string = \"echo '' | {}\".format(command)\n            command_check = os.system(doc_command_string)\n            if not command_check == 0:\n                raise 
Exception(\"No command at {}\".format(command))\n            else:\n                self.process_analyzer = pexpect.spawnu(command_plus_option)\n                self.process_id = self.process_analyzer.pid\n\n    def restart_process(self):\n        # type: ()->None\n        if not self.option is None:\n            command_plus_option = self.command + \" \" + self.option\n        else:\n            command_plus_option = self.command\n\n        self.process_analyzer.kill(sig=9)\n        self.process_analyzer = pexpect.spawnu(command_plus_option)\n        self.process_id = self.process_analyzer.pid\n\n    def stop_process(self):\n        # type: ()->bool\n        \"\"\"* What you can do\n        - You're able to stop the process which this instance has now.\n        \"\"\"\n        if hasattr(self, \"process_analyzer\"):\n            self.process_analyzer.kill(sig=9)\n        else:\n            pass\n\n        return True\n\n    def __query(self, input_string):\n        # type: (text_type)->text_type\n        \"\"\"* What you can do\n        - It takes the result of Juman++\n        - This function monitors time which takes for getting the result.\n        \"\"\"\n        signal.signal(signal.SIGALRM, self.__notify_handler)\n        signal.alarm(self.timeout_second)\n        self.process_analyzer.sendline(input_string)\n        buffer = \"\"\n        while True:\n            line_string = self.process_analyzer.readline()  # type: text_type\n            if line_string.strip() == input_string:\n                \"\"\"Skip if process returns the same input string\"\"\"\n                continue\n            elif line_string.strip() == self.pattern:\n                buffer += line_string\n                signal.alarm(0)\n                return buffer\n            else:\n                buffer += line_string\n\n    def __notify_handler(self, signum, frame):\n        raise ProcessDownException(\"\"\"It takes longer time than {time} seconds. You're able to try, \n        1. 
Change your setting of 'timeout_second' parameter\n        2. Run restart_process() method when the exception happens.\"\"\".format(**{\"time\": self.timeout_second}))\n\n    def query(self, input_string):\n        # type: (text_type)->text_type\n        return self.__query(input_string=input_string)\n\n\nclass JumanppHnadler(UnixProcessHandler):\n\n    def __init__(self,\n                 jumanpp_command,\n                 option = None,\n                 pattern = 'EOS',\n                 timeout_second = 10):\n        # type: (text_type,text_type,text_type,int)->None\n        super(JumanppHnadler, self).__init__(command=jumanpp_command, option=option, pattern=pattern, timeout_second=timeout_second)\n\n    def launch_jumanpp_process(self, command):\n        # type: (text_type)->None\n        return self.launch_process(command)\n"
  },
  {
    "path": "JapaneseTokenizer/common/text_preprocess.py",
    "content": "# -*- coding: utf-8 -*-\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\nfrom __future__ import division\nfrom six import text_type\nimport jaconv\nimport six\nimport re\nimport unicodedata\nfrom JapaneseTokenizer import init_logger\nimport logging\nlogger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))\n__author__ = 'kensuke-mi'\n\nif six.PY2:\n    def u(str): return str.decode(\"utf-8\")\n    def b(str): return str\n    pass\nelse: # python3\n    def u(str): return str\n    def b(str): return str.encode(\"utf-8\")\n    pass\n\ntry:\n    import neologdn\n    is_neologdn_valid = True\nexcept:\n    logger.warning(\"neologdn package is not installed yet. You could not call neologd dictionary.\")\n    is_neologdn_valid = False\n\nSTRING_EXCEPTION = set([u('*')])\n\n\ndef denormalize_text(input_text):\n    # type: (text_type)->text_type\n    \"\"\"* What you can do\n    - It converts text into standard japanese writing way\n\n    * Note\n    - hankaku-katakana is to zenkaku-katakana\n    - zenkaku-eisu is to hankaku-eisu\n    \"\"\"\n    if input_text in STRING_EXCEPTION:\n        return input_text\n    else:\n        return jaconv.z2h(input_text, kana=False, ascii=True, digit=True)\n\n\ndef normalize_text(input_text,\n                   dictionary_mode='ipadic',\n                   new_line_replaced='。',\n                   is_replace_eos=True,\n                   is_kana=True,\n                   is_ascii=True,\n                   is_digit=True):\n    # type: (text_type,text_type,text_type,bool,bool,bool,bool)->text_type\n    \"\"\"* What you can do\n    - It converts input-text into normalized-text which is good for tokenizer input.\n\n    * Params\n    - new_line_replaced: a string which replaces from \\n string.\n    \"\"\"\n    if is_replace_eos:\n        without_new_line = input_text.replace('\\n', new_line_replaced)\n    else:\n        without_new_line 
= new_line_replaced\n\n    if dictionary_mode=='neologd' and is_neologdn_valid:\n        return neologdn.normalize(normalize_text_normal_ipadic(without_new_line))\n    elif dictionary_mode=='neologd' and is_neologdn_valid == False:\n        raise Exception(\"You could not call neologd dictionary because you do NOT install the package neologdn.\")\n    else:\n        return normalize_text_normal_ipadic(without_new_line, kana=is_kana, ascii=is_ascii, digit=is_digit)\n\n\ndef normalize_text_normal_ipadic(input_text, kana=True, ascii=True, digit=True):\n    # type: (text_type,bool,bool,bool)->text_type\n    \"\"\"\n    * All hankaku Katakana is converted into Zenkaku Katakana\n    * All hankaku English alphabet and numeric strings are converted into Zenkaku ones\n    \"\"\"\n    return jaconv.h2z(input_text, kana=kana, ascii=ascii, digit=digit)\n"
  },
  {
    "path": "JapaneseTokenizer/common/timeout_handler.py",
    "content": "#! -*- coding: utf-8 -*-\nfrom functools import wraps\n\n\nclass TimeoutException(Exception):\n    pass\n\n\ndef handler_func(msg):\n    raise TimeoutException()\n\n\ndef on_timeout(limit, handler=handler_func, hint=None):\n    \"\"\"\n    If the decorated function does not finish within `limit` seconds, `handler` is called with a message built from `hint` and `limit`.\n    @on_timeout(limit=3600, handler=notify_func, hint=u'長い計算')\n    def long_time_function():\n    \"\"\"\n    def notify_handler(signum, frame):\n        handler(\"'%s' is not finished in %d second(s).\" % (hint, limit))\n\n    def __decorator(function):\n        def __wrapper(*args, **kwargs):\n            import signal\n            signal.signal(signal.SIGALRM, notify_handler)\n            signal.alarm(limit)\n            result = function(*args, **kwargs)\n            signal.alarm(0)\n            return result\n        return wraps(function)(__wrapper)\n    return __decorator\n"
  },
  {
    "path": "JapaneseTokenizer/datamodels.py",
    "content": "#! -*- coding: utf-8 -*-\n# normalize module #\nfrom JapaneseTokenizer.common.text_preprocess import normalize_text, denormalize_text\n# datemodels #\nfrom MeCab import Node\n# typing #\nfrom typing import List, Union, Any, Tuple, Dict, Callable, Optional\nfrom future.utils import text_type, string_types\nimport sys\nimport six\n__author__ = 'kensuke-mi'\n\npython_version = sys.version_info\n\n\ndef __is_sotpwords(token, stopwords):\n    \"\"\"This function filters out stopwords. If token is in stopwords list, return True; else return False\n    \"\"\"\n    if token in stopwords:\n        return True\n    else:\n        return False\n\n\ndef __is_valid_pos(pos_tuple, valid_pos):\n    # type: (Tuple[text_type,...],List[Tuple[text_type,...]])->bool\n    \"\"\"This function checks token's pos is with in POS set that user specified.\n    If token meets all conditions, Return True; else return False\n    \"\"\"\n    def is_valid_pos(valid_pos_tuple):\n        # type: (Tuple[text_type,...])->bool\n        length_valid_pos_tuple = len(valid_pos_tuple)\n        if valid_pos_tuple == pos_tuple[:length_valid_pos_tuple]:\n            return True\n        else:\n            return False\n\n    seq_bool_flags = [is_valid_pos(valid_pos_tuple) for valid_pos_tuple in valid_pos]\n\n    if True in set(seq_bool_flags):\n        return True\n    else:\n        return False\n\n\ndef filter_words(tokenized_obj, valid_pos, stopwords, check_field_name='stem'):\n    # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type],text_type) -> FilteredObject\n    \"\"\"This function filter token that user don't want to take.\n    Condition is stopword and pos.\n\n    * Input\n    - valid_pos\n        - List of Tuple which has POS element to keep.\n        - Keep in your mind, each tokenizer has different POS structure.\n         >>> [('名詞', '固有名詞'), ('動詞', )]\n    - stopwords\n        - List of str, which you'd like to remove\n        >>> ['残念', '今日']\n    \"\"\"\n  
  assert isinstance(tokenized_obj, TokenizedSenetence)\n    assert isinstance(valid_pos, list)\n    assert isinstance(stopwords, list)\n\n    filtered_tokens = []\n    for token_obj in tokenized_obj.tokenized_objects:\n        assert isinstance(token_obj, TokenizedResult)\n        if check_field_name=='stem':\n            res_stopwords = __is_sotpwords(token_obj.word_stem, stopwords)\n        else:\n            res_stopwords = __is_sotpwords(token_obj.word_surface, stopwords)\n\n        res_pos_condition = __is_valid_pos(token_obj.tuple_pos, valid_pos)\n\n        # case1: only pos filtering is ON\n        if valid_pos != [] and stopwords == []:\n            if res_pos_condition: filtered_tokens.append(token_obj)\n        # case2: only stopwords filtering is ON\n        if valid_pos == [] and stopwords != []:\n            if res_stopwords is False: filtered_tokens.append(token_obj)\n        # case3: both condition is ON\n        if valid_pos != [] and stopwords != []:\n            if res_stopwords is False and res_pos_condition: filtered_tokens.append(token_obj)\n\n    filtered_object = FilteredObject(\n        sentence=tokenized_obj.sentence,\n        tokenized_objects=filtered_tokens,\n        pos_condition=valid_pos,\n        stopwords=stopwords\n    )\n\n    return filtered_object\n\n\nclass TokenizedResult(object):\n    def __init__(self,\n                 node_obj,\n                 tuple_pos,\n                 word_stem,\n                 word_surface,\n                 is_feature=True,\n                 is_surface=False,\n                 misc_info=None,\n                 analyzed_line=None):\n        # type: (Optional[Node], Tuple[text_type, ...], str, str, bool, bool, Optional[Dict[str, Any]], str)->None\n        assert isinstance(node_obj, (Node, type(None)))\n        assert isinstance(tuple_pos, (string_types, tuple))\n        assert isinstance(word_stem, (string_types))\n        assert isinstance(word_surface, text_type)\n        assert 
isinstance(misc_info, (type(None), dict))\n\n        self.node_obj = node_obj\n        self.word_stem = word_stem\n        self.word_surface = word_surface\n        self.is_surface = is_surface\n        self.is_feature = is_feature\n        self.misc_info = misc_info\n        self.analyzed_line = analyzed_line\n\n        if isinstance(tuple_pos, tuple):\n            self.tuple_pos = tuple_pos\n        elif isinstance(tuple_pos, string_types):\n            self.tuple_pos = ('*', )\n        else:\n            raise Exception('Error while parsing feature object. {}'.format(tuple_pos))\n\n\nclass TokenizedSenetence(object):\n    def __init__(self, sentence, tokenized_objects, string_encoding='utf-8'):\n        # type: (text_type, List[TokenizedResult], text_type)->None\n        \"\"\"* Parameters\n        - sentence: sentence\n        - tokenized_objects: list of TokenizedResult object\n        - string_encoding: Encoding type of string type. This option is used only under python2.x\n        \"\"\"\n        assert isinstance(sentence, text_type)\n        assert isinstance(tokenized_objects, list)\n\n        self.sentence = sentence\n        self.tokenized_objects = tokenized_objects\n        self.string_encoding = string_encoding\n\n\n    def __extend_token_object(self, token_object,\n                              is_denormalize=True,\n                              func_denormalizer=denormalize_text):\n        # type: (TokenizedResult,bool,Callable[[str],str])->Tuple\n        \"\"\"This method creates dict object from token object.\n        \"\"\"\n        assert isinstance(token_object, TokenizedResult)\n\n        if is_denormalize:\n            if token_object.is_feature == True:\n                if token_object.is_surface == True:\n                    token = (func_denormalizer(token_object.word_surface), token_object.tuple_pos)\n                else:\n                    token = (func_denormalizer(token_object.word_stem), token_object.tuple_pos)\n            
else:\n                if token_object.is_surface == True:\n                    token = func_denormalizer(token_object.word_surface)\n                else:\n                    token = func_denormalizer(token_object.word_stem)\n        else:\n            if token_object.is_feature == True:\n                if token_object.is_surface == True:\n                    token = (token_object.word_surface, token_object.tuple_pos)\n                else:\n                    token = (token_object.word_stem, token_object.tuple_pos)\n            else:\n                if token_object.is_surface == True:\n                    token = token_object.word_surface\n                else:\n                    token = token_object.word_stem\n\n        return token\n\n    def convert_list_object(self,\n                            is_denormalize=True,\n                            func_denormalizer=denormalize_text):\n        # type: (bool,Callable[[str],str])->List[Union[str, Tuple[str,...]]]\n        \"\"\"* What you can do\n        - You extract string object from TokenizedResult object\n\n        * Args\n        - is_denormalize: boolen object. True; it makes denormalize string\n        - func_denormalizer: callable object. de-normalization function.\n        \"\"\"\n        sentence_in_list_obj = [\n            self.__extend_token_object(token_object,is_denormalize,func_denormalizer)\n            for token_object\n            in self.tokenized_objects\n        ]\n\n        return sentence_in_list_obj\n\n    def __convert_string_type(self, p_c_tuple):\n        # type: (Tuple[text_type,...])->Tuple[text_type]\n        \"\"\"* What you can do\n        - it normalizes string types into str\n        \"\"\"\n        if not isinstance(p_c_tuple, tuple):\n            raise Exception('Pos condition expects tuple of string. 
However = {}'.format(p_c_tuple))\n\n        converted = [text_type] * len(p_c_tuple)\n        for i, pos_element in enumerate(p_c_tuple):\n            if six.PY2 and isinstance(pos_element, str):\n                \"\"\"str into unicode if python2.x\"\"\"\n                converted[i] = pos_element.decode(self.string_encoding)\n            elif six.PY2 and isinstance(pos_element, text_type):\n                converted[i] = pos_element\n            elif six.PY3:\n                converted[i] = pos_element\n            else:\n                raise Exception()\n\n        return tuple(converted)\n\n    def __check_pos_condition(self, pos_condistion):\n        # type: (List[Tuple[text_type, ...]])->List[Tuple[text_type, ...]]\n        \"\"\"* What you can do\n        - Check your pos condition\n        - It converts character type into unicode if python version is 2.x\n        \"\"\"\n        assert isinstance(pos_condistion, list)\n\n        return [self.__convert_string_type(p_c_tuple) for p_c_tuple in pos_condistion]\n\n    def filter(self,\n               pos_condition=None,\n               stopwords=None,\n               is_normalize=True,\n               func_normalizer=normalize_text,\n               check_field_name='stem'):\n        # type: (List[Tuple[text_type,...]], List[text_type], bool, Callable[[text_type], text_type],text_type)->FilteredObject\n        \"\"\"* What you can do\n        - It filters out token which does NOT meet the conditions (stopwords & part-of-speech tag)\n        - Under python2.x, pos_condition & stopwords are converted into unicode type.\n\n        * Parameters\n        - pos_condition: list of part-of-speech(pos) condition. 
The pos condition is tuple is variable length.\n        You can specify hierarchical structure of pos condition with variable tuple.\n        The hierarchy of pos condition follows definition of dictionary.\n            - For example, in mecab you can take words with 名詞 if ('名詞',)\n            - For example, in mecab you can take words with 名詞-固有名詞 if ('名詞', '固有名詞')\n        - stopwords: list of word which you would like to remove\n        - is_normalize: Boolean flag for normalize stopwords.\n        - func_normalizer: Function object for normalization. The function object must be the same one as when you use tokenize.\n        - check_field_name: Put field name to check if stopword or NOT. Kytea does not have stem form of word, put 'surface' instead.\n\n        * Example\n        >>> pos_condition = [('名詞', '一般'), ('形容詞', '自立'), ('助詞', '格助詞', '一般')]\n        >>> stopwords = ['これ', 'それ']\n        \"\"\"\n        assert isinstance(pos_condition, (type(None), list))\n        assert isinstance(stopwords, (type(None), list))\n\n        if stopwords is None:\n            s_words = []\n        elif six.PY2 and all((isinstance(s, str) for s in stopwords)):\n            \"\"\"under python2.x, from str into unicode\"\"\"\n            if is_normalize:\n                s_words = [func_normalizer(s.decode(self.string_encoding)) for s in stopwords]\n            else:\n                s_words = [s.decode(self.string_encoding) for s in stopwords]\n        else:\n            if is_normalize:\n                s_words = [func_normalizer(s) for s in stopwords]\n            else:\n                s_words = stopwords\n\n\n        if pos_condition is None:\n            p_condition = []\n        else:\n            p_condition = self.__check_pos_condition(pos_condition)\n\n        filtered_object = filter_words(\n            tokenized_obj=self,\n            valid_pos=p_condition,\n            stopwords=s_words,\n            check_field_name=check_field_name\n        )\n        assert 
isinstance(filtered_object, FilteredObject)\n\n        return filtered_object\n\n\nclass FilteredObject(TokenizedSenetence):\n    def __init__(self, sentence, tokenized_objects, pos_condition, stopwords):\n        # type: (str, List[TokenizedResult], List[str, ...], List[str])->None\n        super(FilteredObject, self).__init__(\n            sentence=sentence,\n            tokenized_objects=tokenized_objects\n        )\n        self.pos_condition=pos_condition\n        self.stopwords=stopwords\n\n\n\n\n"
  },
  {
    "path": "JapaneseTokenizer/init_logger.py",
    "content": "LOGGER_NAME = 'JapaneseTokenizer'\n\nimport logging\nimport sys\nfrom logging import getLogger, Formatter, Logger, StreamHandler\n\n# Formatter\ncustmoFormatter = Formatter(\n    fmt='[%(asctime)s]%(levelname)s - %(filename)s#%(funcName)s:%(lineno)d: %(message)s',\n    datefmt='%Y/%m/%d %H:%M:%S'\n)\n\n# StreamHandler\nSTREAM_LEVEL = logging.DEBUG\nSTREAM_FORMATTER = custmoFormatter\nSTREAM = sys.stderr\n\nst_handler = StreamHandler(stream=STREAM)\nst_handler.setLevel(STREAM_LEVEL)\nst_handler.setFormatter(STREAM_FORMATTER)\n\n\ndef init_logger(logger):\n    # type: (logging.Logger) -> logging.Logger\n    logger.addHandler(st_handler)\n    logger.propagate = False\n\n    return logger\n"
  },
  {
    "path": "JapaneseTokenizer/juman_wrapper/__init__.py",
    "content": "__author__ = 'kensuke-mi'\nfrom .juman_wrapper import JumanWrapper\n"
  },
  {
    "path": "JapaneseTokenizer/juman_wrapper/juman_wrapper.py",
    "content": "# -*- coding: utf-8 -*-\n# package module\nfrom JapaneseTokenizer.object_models import WrapperBase\nfrom JapaneseTokenizer.common import text_preprocess\nfrom JapaneseTokenizer.datamodels import FilteredObject, TokenizedResult, TokenizedSenetence\nfrom JapaneseTokenizer import init_logger\nfrom JapaneseTokenizer.common.sever_handler import JumanppHnadler\n# else\nfrom typing import List, Union, Callable, Tuple\nfrom six import text_type\nfrom pyknp import MList\nimport logging\nimport sys\nimport os\nimport six\n\nlogger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))\n__author__ = 'kensuke-mi'\n\npython_version = sys.version_info\n\ntry:\n    import pyknp\nexcept ImportError:\n    logger.warning(msg='pyknp is not ready to use. Install first if you would like to use pyknp wrapper.')\n\nif six.PY3:\n    import socket\n    import re\n\n    class MonkeyPatchSocket(object):\n        \"\"\"* Class for overwriting pyknp.Socket because it is only for python2.x\"\"\"\n        def __init__(self, hostname, port, option=None):\n            try:\n                self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n                self.sock.connect((hostname, port))\n            except:\n                raise\n            if option is not None:\n                self.sock.send(option)\n            data = b\"\"\n            while b\"OK\" not in data:\n                # while isinstance(data, bytes) and b\"OK\" not in data:\n                data = self.sock.recv(1024)\n\n        def __del__(self):\n            if self.sock:\n                self.sock.close()\n\n        def query(self, sentence, pattern):\n            # type: (str,str)->str\n            assert(isinstance(sentence, six.text_type))\n            sentence_bytes = sentence.encode('utf-8').strip()\n            pattern_bytes = pattern.encode('utf-8')\n\n            self.sock.sendall(sentence_bytes + b\"\\n\")\n            data = self.sock.recv(1024)\n            assert 
isinstance(data, bytes)\n            recv = data\n            while not re.search(pattern_bytes, recv):\n                data = self.sock.recv(1024)\n                recv = recv + data\n            return recv.strip().decode('utf-8')\n\n\nclass JumanWrapper(WrapperBase):\n    def __init__(self,\n                 command='juman',\n                 server=None,\n                 port=32000,\n                 timeout=30,\n                 rcfile=None,\n                 option='-e2 -B',\n                 pattern='EOS',\n                 is_use_pyknp=False,\n                 **args):\n        # type: (text_type, text_type, int, int, text_type, Union[bytes, text_type], Union[bytes, text_type], bool, **str)->None\n        \"\"\"* Class to call Juman tokenizer\n        \"\"\"\n\n        self.timeout = timeout\n        self.pattern = pattern\n        self.option = option\n        self.command = command\n        if not rcfile is None and not os.path.exists(rcfile):\n            raise FileExistsError('rcfile does not exist at {}'.format(rcfile))\n        if not server is None:\n            # It converts from str into bytes only for sever mode #\n            self.option = self.option.encode('utf-8')  # type: Union[str,bytes]\n            self.pattern = self.pattern.encode('utf-8')  # type: Union[str,bytes]\n        else:\n            pass\n\n        # check os #\n        if os.name == 'nt':\n            if not is_use_pyknp:\n                logger.warning(msg='It forces is_use_pyknp = True on Windows.')\n            else:\n                pass\n            self.is_use_pyknp = True\n        else:\n            pass\n\n        if server is not None:\n            # use server mode #\n            self.juman = pyknp.Juman(command=command, server=server, port=port,\n                                     timeout=self.timeout, rcfile=rcfile, option=option,\n                                     pattern=pattern, jumanpp=False, **args)\n            if six.PY3:\n                # It 
overwrites juman_lines() method #\n                self.juman.juman_lines = self.__monkey_patch_juman_lines\n        elif is_use_pyknp and server is None:\n            # use unix process with pyknp\n            self.juman = pyknp.Juman(command=command, server=server, port=port,\n                                     timeout=self.timeout, rcfile=rcfile, option=option,\n                                     pattern=pattern, jumanpp=False, **args)\n        else:\n            # use unix process with pexpect(RECOMMENDED) #\n            self.juman = JumanppHnadler(jumanpp_command=command,\n                                        option=self.option,\n                                        pattern=self.pattern,\n                                        timeout_second=self.timeout)\n\n    def __del__(self):\n        if hasattr(self, \"juman\"):\n            if isinstance(self.juman, JumanppHnadler):\n                self.juman.stop_process()\n\n    def __monkey_patch_juman_lines(self, input_str):\n        # type: (text_type)->text_type\n        \"\"\"* What you can do\n        - It overwrites juman_line() method because this method causes TypeError in python3\n        \"\"\"\n        assert isinstance(self.juman, pyknp.Juman)\n        if not self.juman.socket and not self.juman.subprocess:\n            if self.juman.server is not None:\n                self.juman.socket = MonkeyPatchSocket(self.juman.server, self.juman.port, b\"RUN -e2\\n\")\n            else:\n                command = \"%s %s\" % (self.juman.command, self.juman.option)\n                if self.juman.rcfile:\n                    command += \" -r %s\" % self.juman.rcfile\n                self.juman.subprocess = pyknp.Subprocess(command)\n        if self.juman.socket:\n            return self.juman.socket.query(input_str, pattern=self.juman.pattern)\n        return self.juman.subprocess.query(input_str, pattern=self.juman.pattern)\n\n    def __extract_morphological_information(self, mrph_object, is_feature, 
is_surface):\n        \"\"\"This method extracts morphlogical information from token object.\n        \"\"\"\n        assert isinstance(mrph_object, pyknp.Morpheme)\n        assert isinstance(is_feature, bool)\n        assert isinstance(is_surface, bool)\n\n        surface = mrph_object.midasi\n        word_stem = mrph_object.genkei\n\n        tuple_pos = (mrph_object.hinsi, mrph_object.bunrui)\n\n        misc_info = {\n            'katuyou1': mrph_object.katuyou1,\n            'katuyou2': mrph_object.katuyou2,\n            'imis': mrph_object.imis,\n            'repname': mrph_object.repname\n        }\n\n        token_object = TokenizedResult(\n            node_obj=None,\n            tuple_pos=tuple_pos,\n            word_stem=word_stem,\n            word_surface=surface,\n            is_feature=is_feature,\n            is_surface=is_surface,\n            misc_info=misc_info\n        )\n\n        return token_object\n\n    def call_juman_interface(self, input_str):\n        # type: (text_type)->MList\n        if isinstance(self.juman, pyknp.Juman):\n            result = self.juman.analysis(input_str)\n            return result\n        elif isinstance(self.juman, JumanppHnadler):\n            try:\n                result_analysis = self.juman.query(input_str)\n            except UnicodeDecodeError:\n                logger.warning(msg=\"Process is down by some reason. 
It restarts process automatically.\")\n                self.juman.restart_process()\n                result_analysis = self.juman.query(input_string=input_str)\n            return MList(result_analysis)\n        else:\n            raise Exception('Not defined.')\n\n    def tokenize(self,\n                 sentence,\n                 normalize=True,\n                 is_feature=False,\n                 is_surface=False,\n                 return_list=False,\n                 func_normalizer=text_preprocess.normalize_text):\n        # type: (text_preprocess, bool, bool, bool, bool, Callable[[str], text_type])->Union[List[text_type], TokenizedSenetence]\n        \"\"\"This method returns tokenized result.\n        If return_list==True(default), this method returns list whose element is tuple consisted with word_stem and POS.\n        If return_list==False, this method returns TokenizedSenetence object.\n        \"\"\"\n        assert isinstance(normalize, bool)\n        assert isinstance(sentence, text_type)\n        normalized_sentence = func_normalizer(sentence)\n        result = self.call_juman_interface(normalized_sentence)\n\n        token_objects = [\n            self.__extract_morphological_information(\n                mrph_object=morph_object,\n                is_surface=is_surface,\n                is_feature=is_feature\n            )\n            for morph_object in result]\n\n        if return_list:\n            tokenized_objects = TokenizedSenetence(\n                sentence=sentence,\n                tokenized_objects=token_objects\n            )\n            return tokenized_objects.convert_list_object()\n        else:\n            tokenized_objects = TokenizedSenetence(\n                sentence=sentence,\n                tokenized_objects=token_objects)\n\n            return tokenized_objects\n\n    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):\n        # type: (TokenizedSenetence, List[Tuple[text_type,...]], 
List[text_type])->FilteredObject\n        assert isinstance(parsed_sentence, TokenizedSenetence)\n        assert isinstance(pos_condition, (type(None), list))\n        assert isinstance(stopwords, (type(None), list))\n\n        return parsed_sentence.filter(pos_condition, stopwords)\n"
  },
  {
    "path": "JapaneseTokenizer/jumanpp_wrapper/__init__.py",
    "content": "from .jumanpp_wrapper import JumanppWrapper"
  },
  {
    "path": "JapaneseTokenizer/jumanpp_wrapper/jumanpp_wrapper.py",
    "content": "#! -*- coding: utf-8 -*-\nfrom pyknp import Juman\nfrom pyknp import MList\n# modules\nfrom JapaneseTokenizer.object_models import WrapperBase\nfrom JapaneseTokenizer.common import text_preprocess, juman_utils\nfrom JapaneseTokenizer.common.sever_handler import JumanppHnadler, ProcessDownException\nfrom JapaneseTokenizer import init_logger\nfrom JapaneseTokenizer.datamodels import FilteredObject, TokenizedSenetence\nfrom typing import List, Dict, Tuple, Union, TypeVar, Any, Callable\n# timeout\nfrom JapaneseTokenizer.common.timeout_handler import on_timeout\nfrom six import text_type\nimport logging\nimport sys\nimport socket\nimport six\nimport re\nimport os\n__author__ = 'kensuke-mi'\n\nlogger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))\npython_version = sys.version_info\nContentsTypes = TypeVar('T')\n\ntry:\n    import pyknp\nexcept ImportError:\n    logger.warning(msg='pyknp is not ready to use. Install first if you would like to use pyknp wrapper.')\n\n\nif six.PY2:\n    ConnectionRefusedError = Exception \n    class JumanppClient(object):\n        \"\"\"Class for receiving data as client\"\"\"\n        def __init__(self, hostname, port, timeout=50, option=None):\n            # type: (text_type, int, int, Dict[text_type,Any])->None\n            try:\n                self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n                if isinstance(port, text_type):\n                    port = int(port)\n                self.sock.connect((hostname, port))\n            except:\n                raise Exception(\"There is no jumanpp server hostname={}, port={}\".format(hostname, port))\n            if option is not None:\n                self.sock.send(option)\n            data = ''\n            self.sock.settimeout(timeout)\n\n        def __del__(self):\n            if self.sock: self.sock.close()\n\n        def query(self, sentence, pattern):\n            # type: (text_type, bytes) -> text_type\n            
assert (isinstance(sentence, six.text_type))\n            data = ''\n            self.sock.sendall(\"%s\\n\" % sentence.encode('utf-8').strip())\n            data = self.sock.recv(1024)\n            assert isinstance(data, bytes)\n            recv = data\n            while not re.search(pattern, recv):\n                data = self.sock.recv(1024)\n                recv = \"%s%s\" % (recv, data)\n            return recv.strip().decode('utf-8')\n\nelse:\n    class JumanppClient(object):\n        \"\"\"Class for receiving data as client\"\"\"\n        def __init__(self, hostname, port, timeout=50, option=None):\n            # type: (text_type, int, int, Dict[text_type,Any])->None\n            try:\n                self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n                if isinstance(port, str):\n                    port = int(port)\n                self.sock.connect((hostname, port))\n            except ConnectionRefusedError:\n                raise Exception(\"There is no jumanpp server hostname={}, port={}\".format(hostname, port))\n            except:\n                raise\n            if option is not None:\n                self.sock.send(option)\n            data = b\"\"\n            self.sock.settimeout(timeout)\n\n        def __del__(self):\n            if self.sock:\n                self.sock.close()\n\n        def query(self, sentence, pattern):\n            # type: (str, Union[str,bytes]) -> str\n            assert (isinstance(sentence, six.text_type))\n            if isinstance(pattern, str):\n                pattern = pattern.encode('utf-8')\n            self.sock.sendall(b\"%s\\n\" % sentence.encode('utf-8').strip())\n            data = self.sock.recv(1024)\n            assert isinstance(data, bytes)\n            recv = data\n            while not re.search(pattern, recv):\n                data = self.sock.recv(1024)\n                recv = b\"%s%s\" % (recv, data)\n            return recv.strip().decode('utf-8')\n\n\nclass 
JumanppWrapper(WrapperBase):\n    \"\"\"Class for Juman++\"\"\"\n\n    def __init__(self,\n                 command='jumanpp',\n                 timeout=30,\n                 pattern=r'EOS',\n                 server=None,\n                 port=12000,\n                 is_use_pyknp = False,\n                 ** args):\n        # type: (text_type,int,text_type,text_type,bool)\n        \"\"\"* What you can do\n        - You can select backend process of jumanpp.\n            - jumanpp-pexpect: It calls jumanpp on your local machine. It keeps jumanpp process running.\n            - jumanpp-pyknp: It calls jumanpp on your local machine. It launches jumanpp process everytime you call. Thus, this is slower than jumanpp-pexpect\n            - jumanpp-server: It calls jumannpp on somewhere else. Keep mind, you have jumanpp sever process somewhere.\n\n        * Parameters\n        - timeout: Time to wait from jumanpp process.\n        - is_use_pyknp: bool flag to decide if you use pyknp as backend process.  If True; you use pyknp. False; you use pexpect.\n        pexpect is much faster than you use pyknp. You can not use pexpect if you're using it on Windowns\n        - server: hostname where jumanpp is running\n        - port: port number where jumanpp is running\n        \"\"\"\n        self.eos_pattern = pattern\n        self.is_use_pyknp = is_use_pyknp\n\n\n        if six.PY2:\n            self.dummy_text = 'これはダミーテキストです'.decode('utf-8')\n        elif six.PY3:\n            self.dummy_text = 'これはダミーテキストです'\n\n        if not server is None:\n            pattern = pattern.encode('utf-8')\n        else:\n            pass\n\n        if os.name == 'nt':\n            \"\"\"It forces to use pyknp if it runs on Windows.\"\"\"\n            if not self.is_use_pyknp:\n                logger.warning(msg=\"You're not able to use pexpect in Windows. 
It forced to set is_use_pyknp = True\")\n            else:\n                pass\n            self.is_use_pyknp = True\n        else:\n            pass\n\n        if server is None and self.is_use_pyknp:\n            # jumanpp-pexpect #\n            logger.debug('jumanpp wrapper is initialized with pyknp package')\n            self.jumanpp_obj = Juman(\n                command=command,\n                timeout=timeout,\n                pattern=pattern,\n                jumanpp=True,\n                **args)\n        elif server is None:\n            # jumanpp-pexpect #\n            logger.debug('jumanpp wrapper is initialized with pexpect unix handler')\n            self.jumanpp_obj = JumanppHnadler(jumanpp_command=command, timeout_second=timeout, pattern=pattern)  # type: JumanppHnadler\n            # put dummy sentence to avoid exception just after command initialization #\n            res = self.jumanpp_obj.query(self.dummy_text)\n        else:\n            # jumanpp-server #\n            self.jumanpp_obj = JumanppClient(hostname=server, port=port, timeout=timeout)\n\n    def __del__(self):\n        if hasattr(self, \"jumanpp_obj\"):\n            if isinstance(self.jumanpp_obj, JumanppClient):\n                self.jumanpp_obj.sock.close()\n            elif isinstance(self.jumanpp_obj, JumanppHnadler):\n                self.jumanpp_obj.stop_process()\n            else:\n                del self.jumanpp_obj\n        else:\n            pass\n\n    def call_juman_interface(self, input_str):\n        # type: (text_type) -> MList\n        \"\"\"* What you can do\n        - You call Juman tokenizer interface.\n\n        * Output\n        - pyknp.MList\n        \"\"\"\n        if isinstance(self.jumanpp_obj, Juman):\n            ml_token_object = self.jumanpp_obj.analysis(input_str=input_str)\n        elif isinstance(self.jumanpp_obj, JumanppHnadler):\n            try:\n                result_token = self.jumanpp_obj.query(input_string=input_str)\n            except 
ProcessDownException:\n                \"\"\"Unix process is down by any reason.\"\"\"\n                logger.warning(\"Re-starting unix process because it takes longer time than {} seconds...\".format(self.jumanpp_obj.timeout_second))\n                self.jumanpp_obj.restart_process()\n                self.jumanpp_obj.query(self.dummy_text)\n                result_token = self.jumanpp_obj.query(input_string=input_str)\n                ml_token_object = MList(result_token)\n            except UnicodeDecodeError:\n                logger.warning(msg=\"Process is down by some reason. It restarts process automatically.\")\n                self.jumanpp_obj.restart_process()\n                self.jumanpp_obj.query(self.dummy_text)\n                result_token = self.jumanpp_obj.query(input_string=input_str)\n                ml_token_object = MList(result_token)\n            else:\n                ml_token_object = MList(result_token)\n        elif isinstance(self.jumanpp_obj, JumanppClient):\n            server_response = self.jumanpp_obj.query(sentence=input_str, pattern=self.eos_pattern)\n            ml_token_object = MList(server_response)\n        else:\n            raise Exception('Not defined')\n\n        return ml_token_object\n\n    @on_timeout(limit=60)\n    def tokenize(self, sentence,\n                 normalize=True,\n                 is_feature=False,\n                 is_surface=False,\n                 return_list=False,\n                 func_normalizer=text_preprocess.normalize_text):\n        # type: (text_type, bool, bool, bool, bool, Callable[[text_type], text_type]) -> Union[TokenizedSenetence, List[text_type]]\n        \"\"\"* What you can do\n        -\n        \"\"\"\n        if normalize:\n            normalized_sentence = func_normalizer(sentence)\n        else:\n            normalized_sentence = sentence\n\n        ml_token_object = self.call_juman_interface(normalized_sentence)\n\n        token_objects = [\n            
juman_utils.extract_morphological_information(\n                mrph_object=morph_object,\n                is_surface=is_surface,\n                is_feature=is_feature\n            )\n            for morph_object in ml_token_object]\n\n        if return_list:\n            tokenized_objects = TokenizedSenetence(\n                sentence=sentence,\n                tokenized_objects=token_objects)\n            return tokenized_objects.convert_list_object()\n        else:\n            tokenized_objects = TokenizedSenetence(\n                sentence=sentence,\n                tokenized_objects=token_objects)\n            return tokenized_objects\n\n    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):\n        # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type]) -> FilteredObject\n        assert isinstance(parsed_sentence, TokenizedSenetence)\n        assert isinstance(pos_condition, (type(None), list))\n        assert isinstance(stopwords, (type(None), list))\n\n        return  parsed_sentence.filter(pos_condition, stopwords)\n"
  },
  {
    "path": "JapaneseTokenizer/kytea_wrapper/__init__.py",
    "content": "__author__ = 'kensuke-mi'\nfrom .kytea_wrapper import KyteaWrapper"
  },
  {
    "path": "JapaneseTokenizer/kytea_wrapper/kytea_wrapper.py",
    "content": "# -*- coding: utf-8 -*-\nfrom JapaneseTokenizer.object_models import WrapperBase\nfrom JapaneseTokenizer.common import text_preprocess\nfrom JapaneseTokenizer.datamodels import FilteredObject, TokenizedResult, TokenizedSenetence\nfrom JapaneseTokenizer import init_logger\nfrom typing import List, Tuple, Any, Union, Callable\nfrom six import text_type, string_types\nimport logging\nimport sys\nimport six\n\nlogger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))\npython_version = sys.version_info\n\n\ntry:\n    import Mykytea\nexcept ImportError:\n    logger.warning(msg='Mykytea is not ready to use yet. Install first if you would like to use kytea wrapper.')\n\n__author__ = 'kensuke-mi'\n\n\nclass KyteaWrapper(WrapperBase):\n    def __init__(self,\n                 option_string='-deftag UNKNOWN!!'):\n        # type: (string_types)->None\n        # option string is argument of Kytea.\n        assert isinstance(option_string, string_types)\n        self.kytea = Mykytea.Mykytea(option_string)\n\n    def __list_tags(self, t):\n        def convert(t2): return (t2[0], t2[1])\n        return [(word.surface, [[convert(t2) for t2 in t1] for t1 in word.tag]) for word in t]\n\n    def __check_char_set(self, input_char):\n        # type: (text_type) -> text_type\n        if six.PY2 and isinstance(input_char, str):\n            return input_char.decode('utf-8')\n        elif isinstance(input_char, text_type):\n            return input_char\n        else:\n            raise Exception('nor unicode, str')\n\n    def __extract_morphological_information(self, kytea_tags_tuple, is_feature):\n        # type: (Tuple[text_type,List[Any]], bool) -> TokenizedResult\n        \"\"\"This method extracts morphlogical information from token object.\n        \"\"\"\n        assert isinstance(kytea_tags_tuple, tuple)\n        assert isinstance(is_feature, bool)\n\n        surface = self.__check_char_set(kytea_tags_tuple[0])\n        # NOTE: kytea does NOT 
show word stem. Put blank string instead.\n        if six.PY2:\n            word_stem = ''.decode('utf-8')\n        else:\n            word_stem = ''\n\n        pos_tuple = kytea_tags_tuple[1][0]\n        pos = self.__check_char_set(pos_tuple[0][0])\n        pos_score = float(pos_tuple[0][1])\n\n        yomi_tuple = kytea_tags_tuple[1][1]\n        yomi = self.__check_char_set(yomi_tuple[0][0])\n        yomi_score = float(yomi_tuple[0][1])\n\n        tuple_pos = (pos, )\n\n        misc_info = {\n            'pos_score': pos_score,\n            'pos': pos,\n            'yomi': yomi,\n            'yomi_score': yomi_score\n        }\n\n        token_object = TokenizedResult(\n            node_obj=None,\n            tuple_pos=tuple_pos,\n            word_stem=word_stem,\n            word_surface=surface,\n            is_feature=is_feature,\n            is_surface=True,\n            misc_info=misc_info\n        )\n\n        return token_object\n\n    def call_kytea_tokenize_api(self, sentence):\n        \"\"\"\n        \"\"\"\n        result = self.kytea.getTagsToString(sentence)\n        assert isinstance(result, text_type)\n\n        return result\n\n    def tokenize(self, sentence,\n                 normalize=True,\n                 is_feature=False,\n                 is_surface=False,\n                 return_list=False,\n                 func_normalizer=text_preprocess.normalize_text):\n        # type: (text_type, bool, bool, bool, bool, Callable[[str],str]) -> Union[List[str], TokenizedSenetence]\n        \"\"\"This method returns tokenized result.\n        If return_list==True(default), this method returns list whose element is tuple consisted with word_stem and POS.\n        If return_list==False, this method returns TokenizedSenetence object.\n        \"\"\"\n        assert isinstance(normalize, bool)\n        assert isinstance(sentence, text_type)\n        normalized_sentence = func_normalizer(sentence)\n        if six.PY2:\n            normalized_sentence = 
normalized_sentence.encode('utf-8')\n\n        result = self.__list_tags(self.kytea.getTags(normalized_sentence))\n\n        token_objects = [\n            self.__extract_morphological_information(\n                kytea_tags_tuple=kytea_tags,\n                is_feature=is_feature\n            )\n            for kytea_tags in result]\n\n        if return_list:\n            tokenized_objects = TokenizedSenetence(\n                sentence=sentence,\n                tokenized_objects=token_objects\n            )\n            return tokenized_objects.convert_list_object()\n        else:\n            tokenized_objects = TokenizedSenetence(\n                sentence=sentence,\n                tokenized_objects=token_objects)\n\n            return tokenized_objects\n\n    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):\n        assert isinstance(parsed_sentence, TokenizedSenetence)\n        assert isinstance(pos_condition, (type(None), list))\n        assert isinstance(stopwords, (type(None), list))\n\n        return parsed_sentence.filter(pos_condition, stopwords, check_field_name='surface')\n"
  },
  {
    "path": "JapaneseTokenizer/mecab_wrapper/__init__.py",
    "content": "__author__ = 'kensuke-mi'\nfrom .mecab_wrapper import MecabWrapper\n"
  },
  {
    "path": "JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py",
    "content": "#! -*- coding: utf-8 -*-\n# core module\nfrom JapaneseTokenizer.object_models import WrapperBase\nfrom JapaneseTokenizer.common.text_preprocess import normalize_text\nfrom JapaneseTokenizer import init_logger\nfrom JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject\nimport MeCab\n# else\nimport sys\nimport os\nimport logging\nimport subprocess\nimport six\nfrom six import text_type\n# typing\nfrom typing import List, Tuple, Union, TypeVar, Callable\nContentsTypes = TypeVar('T')\n\n__author__ = 'kensuke-mi'\n\nlogger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))\npython_version = sys.version_info\n\ntry:\n    import neologdn\n    is_neologdn_valid = True\nexcept:\n    logger.warning(\"neologdn package is not installed yet. You could not call neologd dictionary.\")\n    is_neologdn_valid = False\n\n\nclass MecabWrapper(WrapperBase):\n    def __init__(self,\n                 dictType,\n                 pathUserDictCsv=None,\n                 path_mecab_config=None,\n                 path_dictionary=None,\n                 string_encoding='utf-8'):\n        # type: (text_type, text_type, text_type, text_type, text_type)->None\n        \"\"\"\n\n        :param dictType: a dictionary type called by mecab\n        :param pathUserDictCsv: path to your original dictionary file\n        :param path_mecab_config: path to 'mecab_config' command. It's automatically detected if not give\n        :param path_dictionary: path to a dictionary which you want to use. If not given, it's automatically detected\n        :param string_encoding: encoding option to parse command line result. 
This is mainly used for python2.x\n        \"\"\"\n        self.string_encoding = string_encoding\n        self._dictType = dictType\n        self._pathUserDictCsv = pathUserDictCsv\n        self._path_dictionary = path_dictionary\n        if path_mecab_config is None:\n            self._path_mecab_config = self.__get_path_to_mecab_config()\n        else:\n            self._path_mecab_config = path_mecab_config\n\n        if self._path_dictionary is not None:\n            assert os.path.exists(self._path_dictionary), 'Path dictionary is NOT exist.'\n            self._mecab_dictionary_path = None\n        else:\n            self._mecab_dictionary_path = self.__check_mecab_dict_path()\n\n        logger.info(\"mecab dictionary path is detected under {}\".format(self._mecab_dictionary_path))\n        self.mecabObj = self.__CallMecab()\n\n        assert dictType in [\"neologd\", \"all\", \"ipadic\", \"ipaddic\", \"user\", \"\", \"jumandic\", \"unidic\", None], \\\n            'Dictionary Type Error. Your dict = {} is NOT available.'\n        if dictType == 'all':\n            logger.error('dictionary type \"all\" is deprecated from version1.6')\n            raise Exception('dictionary type \"all\" is deprecated from version1.6')\n        if dictType == 'user':\n            logger.error('dictionary type \"user\" is deprecated from version1.6. You just give path to dictionary csv.')\n            raise Exception('dictionary type \"all\" is deprecated from version1.6. You just give path to dictionary csv.')\n\n        if pathUserDictCsv is not None and isinstance(pathUserDictCsv, text_type) and pathUserDictCsv != '':\n            assert os.path.exists(pathUserDictCsv), \\\n                'Your user dictionary does NOT exist. 
Path={}'.format(pathUserDictCsv)\n\n    def __get_path_to_mecab_config(self):\n        \"\"\"You get path into mecab-config\n        \"\"\"\n        if six.PY2:\n            path_mecab_config_dir = subprocess.check_output(['which', 'mecab-config'])\n            path_mecab_config_dir = path_mecab_config_dir.strip().replace('/mecab-config', '')\n        else:\n            path_mecab_config_dir = subprocess.check_output(['which', 'mecab-config']).decode(self.string_encoding)\n            path_mecab_config_dir = path_mecab_config_dir.strip().replace('/mecab-config', '')\n\n        logger.info(msg='mecab-config is detected at {}'.format(path_mecab_config_dir))\n        return path_mecab_config_dir\n\n    def __check_mecab_dict_path(self):\n        \"\"\"check path to dict of Mecab in system environment\n        \"\"\"\n        mecab_dic_cmd = \"echo `{} --dicdir`\".format(os.path.join(self._path_mecab_config, 'mecab-config'))\n\n        try:\n            if six.PY2:\n                path_mecab_dict = subprocess.check_output( mecab_dic_cmd, shell=True  ).strip('\\n')\n            else:\n                path_mecab_dict = subprocess.check_output(mecab_dic_cmd, shell=True).decode(self.string_encoding).strip('\\n')\n\n        except subprocess.CalledProcessError:\n            logger.error(\"{}\".format(mecab_dic_cmd))\n            raise subprocess.CalledProcessError(returncode=-1, cmd=\"Failed to execute mecab-config command\")\n        if path_mecab_dict == '':\n            raise SystemError(\"\"\"mecab dictionary path is not found with following command: {} \n            You are not able to use additional dictionary. 
\n            Still you are able to call mecab default dictionary\"\"\".format(mecab_dic_cmd))\n\n        return path_mecab_dict\n\n    def __check_mecab_libexe(self):\n        mecab_libexe_cmd = \"echo `{} --libexecdir`\".format(os.path.join(self._path_mecab_config, 'mecab-config'))\n\n        try:\n            if six.PY2:\n                path_mecab_libexe = subprocess.check_output( mecab_libexe_cmd, shell=True  ).strip('\\n')\n            else:\n                path_mecab_libexe = subprocess.check_output(mecab_libexe_cmd, shell=True).decode(self.string_encoding).strip('\\n')\n\n        except subprocess.CalledProcessError:\n            logger.error(\"{}\".format(mecab_libexe_cmd))\n            raise subprocess.CalledProcessError(returncode=-1, cmd=\"Failed to execute mecab-config --libexecdir\")\n        if path_mecab_libexe == '':\n            raise SystemError(\"\"\"Mecab config is not callable with following command: {} \n            You are not able to compile your user dictionary. \n            Still, you are able to use default mecab dictionary.\"\"\".format(mecab_libexe_cmd))\n\n        return path_mecab_libexe\n\n    def __CallMecab(self):\n        if self._path_dictionary is not None and self._mecab_dictionary_path is None:\n            logger.debug('Use dictionary you specified.')\n            cmMecabInitialize = '-d {}'.format(self._path_dictionary)\n        elif self._dictType == 'neologd':\n            # use neologd\n            logger.debug('Use neologd additional dictionary')\n            cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, \"mecab-ipadic-neologd\"))\n        elif self._dictType == 'ipadic' or self._dictType == 'ipaddic':\n            # use ipadic\n            logger.debug('Use ipadic dictionary')\n            cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, \"ipadic\"))\n        elif six.PY2 is False and self._dictType == 'jumandic':\n            # use jumandic. 
This is impossible to call in Python2.x\n            logger.debug('Use jumandic dictionary')\n            cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, \"jumandic\"))\n        elif six.PY2 and self._dictType == 'jumandic':\n            raise Exception('In python2.x, impossible to call jumandic.')\n        else:\n            logger.debug('Use no default dictionary')\n            cmMecabInitialize = ''\n\n        # execute compile if user dictionary is given\n        if self._pathUserDictCsv is not None:\n            logger.debug('Use User dictionary')\n            pathUserDict = self.__CompileUserdict()\n            cmMecabInitialize += ' -u {}'.format(pathUserDict)\n\n        if six.PY2:\n            cmMecabCall = \"-Ochasen {}\".format(cmMecabInitialize)\n        else:\n            cmMecabCall = \"{}\".format(cmMecabInitialize)\n        logger.debug(msg=\"mecab initialized with {}\".format(cmMecabCall))\n\n        try:\n            mecabObj = MeCab.Tagger(cmMecabCall)\n        except Exception as e:\n            logger.error(e.args)\n            logger.error(\"Possibly Path to userdict is invalid. 
Check the path\")\n            raise subprocess.CalledProcessError(returncode=-1, cmd=\"Failed to initialize Mecab object\")\n\n        return mecabObj\n\n    def __CompileUserdict(self):\n        \"\"\"* What you can do\n        \"\"\"\n        path_mecab_dict = self.__check_mecab_dict_path()\n        path_mecab_libexe = self.__check_mecab_libexe()\n\n        cmCompileDict = u'{0}/mecab-dict-index -d {1}/ipadic -u {2} -f utf-8 -t utf-8 {3} > /dev/null'.format(path_mecab_libexe,\n                                                                                                            path_mecab_dict,\n                                                                                                            self._pathUserDictCsv.replace(\"csv\", \"dict\"),\n                                                                                                            self._pathUserDictCsv)\n        logger.debug(msg=\"compiling mecab user dictionary with: {}\".format(cmCompileDict))\n        try:\n            subprocess.call( cmCompileDict , shell=True )\n        except OSError as e:\n            logger.error('type:' + str(type(e)))\n            logger.error('args:' + str(e.args))\n            sys.exit('Failed to compile mecab userdict. 
System ends')\n\n        return self._pathUserDictCsv.replace(\"csv\", \"dict\")\n\n    def __feature_parser(self, uni_feature, word_surface):\n        \"\"\"\n        Parse the POS feature output by Mecab\n        :param uni_feature unicode:\n        :return ( (pos1, pos2, pos3), word_stem ):\n        \"\"\"\n        list_feature_items = uni_feature.split((','))\n        # if word has no feature at all\n        if len(list_feature_items)==1: return ('*'), ('*')\n\n        pos1 = list_feature_items[0]\n        pos2 = list_feature_items[1]\n        pos3 = list_feature_items[2]\n        tuple_pos = ( pos1, pos2, pos3 )\n\n        # if without constraint(output is normal mecab dictionary like)\n        if len(list_feature_items) == 9:\n            word_stem = list_feature_items[6]\n        # if with constraint(output format depends on Usedict.txt)\n        else:\n            word_stem = word_surface\n\n        return tuple_pos, word_stem\n\n    def __postprocess_analyzed_result(self, string_mecab_parsed_result, is_feature, is_surface):\n        # type: (text_type,bool,bool)->List[TokenizedResult]\n        \"\"\"Extract surface word and feature from analyzed lines.\n        Extracted results are returned with list, whose elements are TokenizedResult class\n        [TokenizedResult]\n        \"\"\"\n        assert isinstance(string_mecab_parsed_result, str)\n        check_tab_separated_line = lambda x: True if '\\t' in x else False\n\n        tokenized_objects = [\n            self.__result_parser(analyzed_line=analyzed_line,\n                                 is_feature=is_feature,\n                                 is_surface=is_surface)\n            for analyzed_line in string_mecab_parsed_result.split('\\n')\n            if not analyzed_line=='EOS' and check_tab_separated_line(analyzed_line)\n        ]\n\n        assert isinstance(tokenized_objects, list)\n        return tokenized_objects\n\n    def __result_parser(self, analyzed_line, is_feature, is_surface):\n       
 # type: (text_type,bool,bool)->TokenizedResult\n        \"\"\"Extract surface word and feature from analyzed line.\n        Extracted elements are returned with TokenizedResult class\n        \"\"\"\n        assert isinstance(analyzed_line, str)\n        assert isinstance(is_feature, bool)\n        assert isinstance(is_surface, bool)\n\n        surface, features = analyzed_line.split('\\t', 1)\n        tuple_pos, word_stem = self.__feature_parser(features, surface)\n        tokenized_obj = TokenizedResult(\n            node_obj=None,\n            analyzed_line=analyzed_line,\n            tuple_pos=tuple_pos,\n            word_stem=word_stem,\n            word_surface=surface,\n            is_feature=is_feature,\n            is_surface=is_surface\n        )\n        return tokenized_obj\n\n    def tokenize(self, sentence,\n                 normalized=True,\n                 is_feature=False,\n                 is_surface=False,\n                 return_list=False,\n                 func_normalizer=normalize_text):\n        # type: (text_type, bool, bool, bool, bool, Callable[[str], str])->Union[List[str], TokenizedSenetence]\n        \"\"\"* What you can do\n        - Call mecab tokenizer, and return tokenized objects\n\n        \"\"\"\n        if six.PY2 and isinstance(sentence, str):\n            sentence = sentence.decode(self.string_encoding)\n        else:\n            pass\n\n        # decide normalization function depending on dictType\n        if func_normalizer is None and self._dictType == 'neologd' and is_neologdn_valid:\n            normalized_sentence = neologdn.normalize(sentence)\n        elif func_normalizer is None and self._dictType == 'neologd' and is_neologdn_valid == False:\n            raise Exception(\"You could not call neologd dictionary bacause you do NOT install the package neologdn.\")\n        elif func_normalizer == normalize_text:\n            normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)\n        elif 
func_normalizer is None:\n            normalized_sentence = sentence\n        else:\n            normalized_sentence = func_normalizer(sentence)\n\n        # don't delete this variable. The variable \"encoded_text\" protects sentence from deleting\n        if six.PY2:\n            encoded_text = normalized_sentence.encode(self.string_encoding)\n        else:\n            encoded_text = normalized_sentence\n\n        if six.PY2:\n            tokenized_objects = []\n            node = self.mecabObj.parseToNode(encoded_text)\n            node = node.next\n            while node.next is not None:\n                word_surface = node.surface.decode(self.string_encoding)\n\n                tuple_pos, word_stem = self.__feature_parser(node.feature.decode(self.string_encoding), word_surface)\n\n                tokenized_obj = TokenizedResult(\n                    node_obj=node,\n                    tuple_pos=tuple_pos,\n                    word_stem=word_stem,\n                    word_surface=word_surface,\n                    is_feature=is_feature,\n                    is_surface=is_surface\n                )\n                tokenized_objects.append(tokenized_obj)\n                node = node.next\n\n            tokenized_sentence = TokenizedSenetence(\n                sentence=sentence,\n                tokenized_objects=tokenized_objects)\n        else:\n            parsed_result = self.mecabObj.parse(encoded_text)\n            tokenized_objects = self.__postprocess_analyzed_result(\n                string_mecab_parsed_result=parsed_result,\n                is_feature=is_feature,\n                is_surface=is_surface\n            )\n            tokenized_sentence = TokenizedSenetence(\n                sentence=sentence,\n                tokenized_objects=tokenized_objects\n            )  # type: TokenizedSenetence\n\n        if return_list:\n            return tokenized_sentence.convert_list_object()\n        else:\n            return tokenized_sentence\n\n    def 
filter(self, parsed_sentence, pos_condition=None, stopwords=None):\n        # type: (TokenizedSenetence, List[Tuple[str,...]], List[str]) -> FilteredObject\n        assert isinstance(parsed_sentence, TokenizedSenetence)\n        assert isinstance(pos_condition, (type(None), list))\n        assert isinstance(stopwords, (type(None), list))\n        return parsed_sentence.filter(pos_condition, stopwords)\n"
  },
  {
    "path": "JapaneseTokenizer/object_models.py",
    "content": "#! -*- coding: utf-8 -*-\nfrom typing import Callable\nfrom six import text_type\n\nclass WrapperBase(object):\n    def tokenize(self,\n                 sentence,\n                 normalize,\n                 is_feature,\n                 is_surface,\n                 return_list,\n                 func_normalizer=None):\n        # type: (text_type, bool, bool, bool, bool, Callable[[text_type], text_type])->None\n        \"\"\"* What you can do\"\"\"\n        raise NotImplemented\n\n    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):\n        raise NotImplemented\n"
  },
  {
    "path": "LICENSE.txt",
    "content": "Copyright 2017 Kensuke Mitsuzawa\n\nPermission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE."
  },
  {
    "path": "MANIFEST.in",
    "content": "include README.md\ninclude README_JP.md\ninclude examples\ninclude test\ninclude install_tokenizers.sh\ninclude LICENSE.txt\ninclude Makefile\n"
  },
  {
    "path": "Makefile",
    "content": "install:\n\tbash install_tokenizers.sh\n\ninstall_neologd:\n\t## mecab-neologdのインストールを実行\n\twget --no-check-certificate https://github.com/neologd/mecab-ipadic-neologd/tarball/master -O mecab-ipadic-neologd.tar\n\ttar -xvf mecab-ipadic-neologd.tar\n\tmv neologd-mecab-ipadic-neologd-* neologd-mecab-ipadic-neologd && cd neologd-mecab-ipadic-neologd && ( echo yes | ./bin/install-mecab-ipadic-neologd )"
  },
  {
    "path": "README.md",
    "content": "[![MIT License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)](LICENSE)[![Build Status](https://travis-ci.org/Kensuke-Mitsuzawa/JapaneseTokenizers.svg?branch=master)](https://travis-ci.org/Kensuke-Mitsuzawa/JapaneseTokenizers)\n\n\n# What's this?\n\nThis is simple python-wrapper for Japanese Tokenizers(A.K.A Tokenizer)\n\nThis project aims to call tokenizers and split a sentence into tokens as easy as possible.\n\nAnd, this project supports various Tokenization tools common interface. Thus, it's easy to compare output from various tokenizers.\n\nThis project is available also in [Github](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers).  \n\nIf you find any bugs, please report them to github issues. Or any pull requests are welcomed!\n\n# Requirements\n\n- Python 2.7\n- Python 3.x\n    - checked in 3.5, 3.6, 3.7  \n\n\n# Features\n\n* simple/common interface among various tokenizers\n* simple/common interface for filtering with stopwords or Part-of-Speech condition \n* simple interface to add user-dictionary(mecab only)\n\n## Supported Tokenizers\n\n### Mecab\n\n[Mecab](http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html?sess=3f6a4f9896295ef2480fa2482de521f6) is open source tokenizer system for various language(if you have dictionary for it)\n\nSee [english documentation](https://github.com/jordwest/mecab-docs-en) for detail\n\n### Juman\n\n[Juman](http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?JUMAN) is a tokenizer system developed by Kurohashi laboratory, Kyoto University, Japan.\n\nJuman is strong for ambiguous writing style in Japanese, and is strong for new-comming words thanks to Web based huge dictionary.\n \nAnd, Juman tells you semantic meaning of words.\n\n### Juman++\n\n[Juman++](http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?JUMAN++) is a tokenizer system developed by Kurohashi laboratory, Kyoto University, Japan.\n\nJuman++ is succeeding system of Juman. 
It adopts an RNN model for tokenization.\n\nJuman++ is strong for ambiguous writing styles in Japanese, and is strong for new-coming words thanks to its huge Web-based dictionary.\n \nJuman also tells you the semantic meaning of words.\n\nNote: A new Juman++ dev-version (later than 2.x) is available on [Github](https://github.com/ku-nlp/jumanpp)\n\n\n### Kytea\n\n[Kytea](http://www.phontron.com/kytea/) is a tokenizer tool developed by Graham Neubig.\n\nKytea uses a different algorithm from that of Mecab or Juman. \n\n \n# Setting up\n\n## Tokenizers auto-install\n\n```\nmake install\n```\n\n### mecab-neologd dictionary auto-install\n\n```\nmake install_neologd\n```\n\n## Tokenizers manual-install\n\n### MeCab\n\nSee [here](https://github.com/jordwest/mecab-docs-en) to install the MeCab system.\n\n### Mecab Neologd dictionary\n\nThe Mecab-neologd dictionary is a dictionary extension based on the ipadic dictionary, which is the basic dictionary of Mecab.\n\nWith the Mecab-neologd dictionary, you're able to parse a new-coming word as a single token.\n\nHere, new-coming words are things such as movie actor names or company names.\n\nSee [here](https://github.com/neologd/mecab-ipadic-neologd) and install the mecab-neologd dictionary.\n\n### Juman\n\n```\nwget -O juman7.0.1.tar.bz2 \"http://nlp.ist.i.kyoto-u.ac.jp/DLcounter/lime.cgi?down=http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2&name=juman-7.01.tar.bz2\"\nbzip2 -dc juman7.0.1.tar.bz2  | tar xvf -\ncd juman-7.01\n./configure\nmake   \n[sudo] make install\n```    \n    \n\n### Juman++\n\n* GCC version must be >= 5\n\n```\nwget http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz\ntar xJvf jumanpp-1.02.tar.xz\ncd jumanpp-1.02/\n./configure\nmake\n[sudo] make install\n```\n    \n### Kytea\n\nInstall the Kytea system\n\n```\nwget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz\ntar -xvf kytea-0.4.7.tar.gz\ncd kytea-0.4.7\n./configure\nmake\nmake install\n```    \n\n\nKytea has a [python 
wrapper](https://github.com/chezou/Mykytea-python) thanks to Michiaki Ariga.\nInstall the Kytea-python wrapper\n\n```\npip install kytea\n```\n    \n\n## install\n\n```\n[sudo] python setup.py install\n```\n\n### Note\n\nDuring installation, you will see a warning message if it fails to install `pyknp` or `kytea`.\n\nIf you see these messages, try re-installing these packages manually.\n\n# Usage\n\nTokenization example (for Python 3.x; for Python 2.x example code, please see [here](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers/blob/master/examples/examples.py))\n\n```\nimport JapaneseTokenizer\ninput_sentence = '10日放送の「中居正広のミになる図書館」（テレビ朝日系）で、SMAPの中居正広が、篠原信一の過去の勘違いを明かす一幕があった。'\n# ipadic is well-maintained dictionary #\nmecab_wrapper = JapaneseTokenizer.MecabWrapper(dictType='ipadic')\nprint(mecab_wrapper.tokenize(input_sentence).convert_list_object())\n\n# neologd is automatically-generated dictionary from huge web-corpus #\nmecab_neologd_wrapper = JapaneseTokenizer.MecabWrapper(dictType='neologd')\nprint(mecab_neologd_wrapper.tokenize(input_sentence).convert_list_object())\n```\n\n\n## Filtering example\n\n```\nimport JapaneseTokenizer\n# with word filtering by stopword & part-of-speech condition #\nprint(mecab_wrapper.tokenize(input_sentence).filter(stopwords=['テレビ朝日'], pos_condition=[('名詞', '固有名詞')]).convert_list_object())\n```\n\n\n## Part-of-speech structure\n\nMecab, Juman, and Kytea have different Part-of-Speech (POS) systems.\n\nYou can check the Part-of-Speech (POS) tables [here](http://www.unixuser.org/~euske/doc/postag/)\n\n\n# Similar Package\n\n\n## natto-py\n\nnatto-py is a sophisticated package for tokenization. 
It supports the following features:\n\n* easy interface for tokenization\n* importing additional dictionary\n* partial parsing mode\n\n# LICENSE\n\nMIT license\n\n# For developers\n\nYou can build an environment that has the dependencies needed to test this package.\n\nSimply build the docker image and run the docker container.\n\n## Dev environment\n\nThe development environment is defined in `test/docker-compose-dev.yml`.\n\nWith the docker-compose.yml file, you can call python2.7 or python3.7.\n\nIf you're using Pycharm Professional edition, you can set docker-compose.yml as a remote interpreter.\n\nTo call python2.7, set `/opt/conda/envs/p27/bin/python2.7`\n\nTo call python3.7, set `/opt/conda/envs/p37/bin/python3.7`\n\n## Test environment\n\nThese commands check the whole procedure, from package installation through to testing the package.\n\n```bash\n$ docker-compose build\n$ docker-compose up\n```\n\n"
  },
  {
    "path": "examples/examples.py",
    "content": "#! -*- coding: utf-8 -*-\nimport sys\nimport os\nfrom JapaneseTokenizer import JumanWrapper\nfrom JapaneseTokenizer import JumanppWrapper\nfrom JapaneseTokenizer import MecabWrapper\nfrom JapaneseTokenizer import KyteaWrapper\nfrom JapaneseTokenizer.datamodels import TokenizedResult\nfrom JapaneseTokenizer import init_logger\nimport logging\nimport socket\nimport six\nlogger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))\n__author__ = 'kensuke-mi'\nlogger.setLevel(logging.DEBUG)\n\n# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n# for python2.x\n\ndef basic_example():\n    # ========================================================\n    # TOKENIZE\n    # ========================================================\n    if six.PY2:\n        # input is `unicode` type(in python2x)\n        sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'\n    elif six.PY3:\n        sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'\n    else:\n        raise Exception()\n\n    # make MecabWrapper object\n    # you can choose from \"neologd\", \"all\", \"ipadic\", \"user\", \"\", None\n    # \"ipadic\" and \"\" is equivalent\n    mecab_wrapper = MecabWrapper(dictType=\"neologd\")\n    juman_wrapper = JumanWrapper()\n    jumanpp_wrapper = JumanppWrapper()\n    #kytea_wrapper = KyteaWrapper()\n\n    # tokenize sentence into list of token.\n    # with is_feature=True, you get part-of-speech tag also. 
in this case, you get tuple ( token, (part-of-speech-tags) )\n    # with is_surface=True, you get surface form of token (in other words, not normalized token)\n    seq_tokens_mecab = mecab_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).convert_list_object()\n    seq_tokens_juman = juman_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).convert_list_object()\n    seq_tokens_jumanpp = jumanpp_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).convert_list_object()\n    #seq_tokens_kytea = kytea_wrapper.tokenize(sentence=sentence, is_feature=True, is_surface=False).convert_list_object()\n\n    logger.debug(seq_tokens_mecab)\n    logger.debug(seq_tokens_juman)\n    logger.debug(seq_tokens_jumanpp)\n    #logger.debug(seq_tokens_kytea)\n\ndef filtering_example():\n    if six.PY2:\n        # input is `unicode` type(in python2x)\n        sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'\n        stopwords = [u'テヘラン']\n        pos_condition_ipadic = [(u'名詞', u'固有名詞'), (u'名詞', u'一般')]\n        pos_condition_juman = [(u'名詞', u'固有名詞'), (u'名詞', u'普通名詞')]\n        pos_condition_kytea = [(u'名詞',)]\n    elif six.PY3:\n        sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'\n        stopwords = ['テヘラン']\n        pos_condition_ipadic = [('名詞', '固有名詞'), ('名詞', '一般')]\n        pos_condition_juman = [('名詞', '固有名詞'), ('名詞', '普通名詞')]\n        pos_condition_kytea = [('名詞',)]\n    else:\n        raise Exception()\n\n    # ========================================================\n    # FILTERING\n    # ========================================================\n    # you can filter tokens by stopwords or POS conditions\n    # stopword is list objetc\n\n    mecab_wrapper = MecabWrapper(dictType=\"neologd\")\n    juman_wrapper = JumanWrapper()\n    jumanpp_wrapper = JumanppWrapper()\n    #kytea_wrapper = KyteaWrapper()\n    seq_tokens_mecab = 
mecab_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).filter(pos_condition=pos_condition_ipadic,stopwords=stopwords).convert_list_object()\n    seq_tokens_juman = juman_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).filter(pos_condition=pos_condition_juman, stopwords=stopwords).convert_list_object()\n    seq_tokens_jumanpp = jumanpp_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).filter(pos_condition=pos_condition_juman, stopwords=stopwords).convert_list_object()\n    #seq_tokens_kytea = kytea_wrapper.tokenize(sentence=sentence, is_feature=True, is_surface=False).filter(pos_condition=pos_condition_kytea, stopwords=stopwords).convert_list_object()\n\n    logger.debug(seq_tokens_mecab)\n    logger.debug(seq_tokens_juman)\n    logger.debug(seq_tokens_jumanpp)\n    #logger.debug(seq_tokens_kytea)\n\n\ndef advanced_example_mecab():\n    if six.PY2:\n        sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'\n    elif six.PY3:\n        sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'\n    else:\n        raise Exception()\n\n    # ========================================================\n    # USE YOUE OWN DICTIONARY\n    # with your own dictionary, you can force Mecab to make some word into one token\n    # ========================================================\n    # make your own \"user dictionary\" with CSV file\n    # To know more about this file, see this page(sorry, Japanese only) https://mecab.googlecode.com/svn/trunk/mecab/doc/dic.html\n    example_user_dict = \"userdict.csv\"\n\n    # set dictType='user' or dictType='all' and set pathUserDictCsv\n    tokenized_obj = MecabWrapper(dictType='user', pathUserDictCsv=example_user_dict).tokenize(sentence)\n\n    for token_obj in tokenized_obj.tokenized_objects:\n        assert isinstance(token_obj, TokenizedResult)\n        if six.PY2 and token_obj.word_stem == u'ペルシア語':\n            
logger.debug(token_obj.word_stem)\n        elif six.PY3 and token_obj.word_stem == 'ペルシア語':\n            logger.debug(token_obj.word_stem)\n\n        ## TokenizedResult class has attributes of tokenized result ##\n        token_obj.analyzed_line\n        token_obj.word_surface\n        token_obj.word_stem\n        token_obj.tuple_pos\n\n\ndef advanced_example_juman():\n    if six.PY2:\n        sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'\n        pos_condition = [(u'名詞',)]\n    elif six.PY3:\n        sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'\n        pos_condition = [('名詞',)]\n    else:\n        raise Exception()\n\n    ### You can call juman with server mode. You must start JUMAN as server mode beforehand ###\n    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n    HOST='localhost'\n    PORT = 32000\n    try:\n        s.connect((HOST, PORT))\n        s.close()\n        juman_wrapper = JumanWrapper(server=HOST, port=PORT)\n        tokens_list = juman_wrapper.tokenize(sentence, return_list=False).filter(pos_condition=pos_condition).convert_list_object()\n        assert isinstance(tokens_list, list)\n    except:\n        logger.info(msg='Juman server is not running. Skip it.')\n\n\nif __name__ == \"__main__\":\n    basic_example()\n    filtering_example()\n    advanced_example_mecab()\n    advanced_example_juman()"
  },
  {
    "path": "examples/userdict.csv",
    "content": "ペルシア語,-1,-1,-400,名詞,一般,*,*,*,*,ぺるしあご,*,*,*"
  },
  {
    "path": "install_tokenizers.sh",
    "content": "#!/bin/bash\nos_type=`uname`\necho \"os-type is \"$os_type\nif [ `uname` = \"Darwin\" ]; then\n    #mac用のコード\n    juman_utils_bin=\"/usr/local/opt/juman/libexec/juman/\"\n    if [ -e ${juman_utils_bin} ]; then\n        :\n    else\n        juman_utils_bin=\"/usr/local/libexec/juman/\"\n    fi\nelif [ `uname` = \"Linux\" ]; then\n    #Linux用のコード\n    juman_utils_bin=\"/usr/local/libexec/juman/\"\nelse\n    echo \"Your platform ($(uname -a)) is not supported.\"\n    exit 1\nfi\n\nWORK_DIR=`pwd`\necho 'これはテスト' | mecab\nis_mecab_install=$?\n\nif [ $is_mecab_install -eq 127 ]; then\n    ## mecab\n    wget -O mecab-0.996.tar.gz \"https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE\"\n    tar zxvf mecab-0.996.tar.gz\n    cd mecab-0.996 && ./configure && make && make install\n    cd $WORK_DIR\n\n    ### mecabインストール後にldconfigを実行\n    ldconfig\n\n    ## mecab ipadic\n    wget -O mecab-ipadic-2.7.0-20070801.tar.gz \"https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM\"\n    tar zxvf mecab-ipadic-2.7.0-20070801.tar.gz\n    cd mecab-ipadic-2.7.0-20070801 &&./configure --with-charset=utf8 && make && make install\n    # 動作テスト\n    echo 'インストール後のテスト' | mecab\nelse\n    :\nfi\n\necho 'これはテスト' | juman\nis_juman_install=$?\n\nif [ $is_juman_install -eq 127 ]; then\n    ## juman\n    wget -O juman7.0.1.tar.bz2 \"http://nlp.ist.i.kyoto-u.ac.jp/DLcounter/lime.cgi?down=http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2&name=juman-7.01.tar.bz2\"\n    bzip2 -dc juman7.0.1.tar.bz2  | tar xvf -\n    cd juman-7.01 && ./configure && make && make install\n\n    # インストール後のldconfig\n    ldconfig\n    # 動作テスト\n    echo 'インストール後のテスト' | juman\nelse\n    :\nfi\n\necho 'これはテスト' | jumanpp\nis_jumanpp_install=$?\n\nif [ $is_jumanpp_install -eq 127 ]; then\n    # jumanpp\n    wget http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.01.tar.xz\n    tar xJvf jumanpp-1.01.tar.xz\n    cd jumanpp-1.01/ && ./configure && 
make && make install\n    # todo jumanppのサーバー起動スクリプト実施\n\n    # インストール後のldconfig\n    ldconfig\n    # 動作テスト\n    echo 'インストール後のテスト' | jumanpp\nelse\n    :\nfi\n\n\necho 'これはテスト' | kytea\nis_kytea_install=$?\n\nif [ $is_kytea_install -eq 127 ]; then\n    # kytea\n    wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz -O kytea-0.4.7.tar.gz\n    tar -xvf kytea-0.4.7.tar.gz\n    cd kytea-0.4.7 && ./configure && make && make install\n    # インストール後のldconfig\n    ldconfig\n    # 動作テスト\n    echo 'インストール後のテスト' | kytea\nelse\n    :\nfi\n\n\nif [ -f ./juman7.0.1.tar.bz2 ]; then\n    # juman\n\trm juman7.0.1.tar.bz2\nelse\n    :\nfi\n\nif [ -f ./mecab-*.tar.gz ]; then\n    # juman\n\trm mecab-*.tar.gz\nelse\n    :\nfi\n\nif [ -f ./mecab-ipadic-*.tar.gz ]; then\n\t# mecab-ipadic\n\trm mecab-ipadic-*.tar.gz\nelse\n    :\nfi\n\n\nif [ -f ./jumanpp-1.01.tar.xz ]; then\n\t# jumanpp\n\trm jumanpp-1.01.tar.xz\nelse\n    :\nfi\n\n\nif [ -f ./kytea-0.4.7.tar ]; then\n\t# kytea\n\trm kytea-0.4.7.tar\nelse\n    :\nfi\n\n\nif [ -d ./juman-7* ]; then\n\t# kytea\n\trm -rf juman-7*\nelse\n    :\nfi\n\nif [ -d ./mecab-0* ]; then\n\t# kytea\n\trm -rf mecab-0*\nelse\n    :\nfi\n\nif [ -d ./mecab-ipadic-* ]; then\n\trm -rf mecab-ipadic-*\nelse\n    :\nfi\n\nif [ -d ./jumanpp-1.01 ]; then\n\trm -rf jumanpp-1.01\nelse\n    :\nfi\n\nif [ -d ./kytea-0.4.7 ]; then\n\trm -rf kytea-0.4.7\nelse\n    :\nfi"
  },
  {
    "path": "setup.py",
    "content": "#! -*- coding: utf-8 -*-\nfrom setuptools import setup, find_packages\nimport sys\nimport logging\nimport codecs\nlogger = logging.getLogger(__file__)\n\npython_version = sys.version_info\n\n# --------------------------------------------------------------------------------------------------------\n# try to install kytea automatically because it usually causes to error during installing\ntry:\n    import Mykytea\nexcept ImportError:\n    try:\n        import sys\n        import subprocess\n        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kytea'])\n        import Mykytea\n    except Exception as e:\n        logger.error('We failed to install mykytea automatically. Try installing kytea manually.')\n        logger.error(e)\n\n# --------------------------------------------------------------------------------------------------------\ntry:\n    import neologdn\nexcept ImportError:\n    try:\n        import sys\n        import subprocess\n        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'neologdn'])\n        import neologdn\n    except Exception as e:\n        logger.error('We failed to install neologdn automatically because of some issues in the package. 
Try installing pyknp manually.')\n        logger.error(e)\n\n# --------------------------------------------------------------------------------------------------------\n\ncommon_packages = ['pypandoc', 'future', 'six', 'jaconv>=0.2', 'pip>=8.1.0', 'pexpect', 'pyknp>=0.4.1']\nif python_version >= (3, 0, 0):\n    if python_version <= (3, 5, 0):\n        common_packages.append('typing')\n    elif python_version > (3, 5, 0):\n        common_packages.append('mecab-python3')\nelif python_version <= (2, 9, 9):\n    common_packages.append('typing')\n    common_packages.append('mecab-python')\nelse:\n    raise NotImplementedError()\n\nversion = '1.6'\nname = 'JapaneseTokenizer'\nshort_description = '`JapaneseTokenizer` is a package for easy Japanese Tokenization'\n\ntry:\n    import pypandoc\n    long_description = pypandoc.convert('README.md', 'rst')\nexcept(IOError, ImportError):\n    long_description = codecs.open('README.md', 'r', 'utf-8').read()\n\nclassifiers = [\n        \"Development Status :: 5 - Production/Stable\",\n        \"License :: OSI Approved :: MIT License\",\n        \"Programming Language :: Python\",\n        \"Natural Language :: Japanese\",\n        \"Topic :: Scientific/Engineering :: Artificial Intelligence\",\n        \"Programming Language :: Python :: 2.7\",\n        \"Programming Language :: Python :: 3.5\"\n        ]\n\nsetup(\n    author='Kensuke Mitsuzawa',\n    author_email='kensuke.mit@gmail.com',\n    name = name,\n    version=version,\n    short_description=short_description,\n    long_description=long_description,\n    keywords=['MeCab', '和布蕪', 'Juman',\n                'Japanese morphological analyzer', 'NLP', '形態素解析', '自然言語処理'],\n    license=\"MIT\",\n    url = \"https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers\",\n    test_suite='test.test_all.suite',\n    install_requires=common_packages,\n    tests_require=common_packages,\n    packages=find_packages()\n)\n"
  },
  {
    "path": "test/Dockerfile",
    "content": "FROM frolvlad/alpine-glibc:alpine-3.6\nMAINTAINER kensuke-mi <kensuke.mit@gmail.com>\n\n# Mecab install\nENV MECAB_VERSION 0.996\nENV IPADIC_VERSION 2.7.0-20070801\nENV mecab_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE\nENV ipadic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM\nENV build_deps 'curl git bash file sudo openssh gcc make build-base'\nENV dependencies 'openssl'\n\nENV PATH=/opt/conda/bin:$PATH \\\n    LANG=C.UTF-8 \\\n    MINICONDA=Miniconda3-latest-Linux-x86_64.sh\n# apk update\nRUN apk update\n\n# mecab\nRUN apk add --update --no-cache ${build_deps} \\\n  # Install dependencies\n  && apk add --update --no-cache ${dependencies} \\\n  # Install MeCab\n  && curl -SL -o mecab-${MECAB_VERSION}.tar.gz ${mecab_url} \\\n  && tar zxf mecab-${MECAB_VERSION}.tar.gz \\\n  && cd mecab-${MECAB_VERSION} \\\n  && ./configure --enable-utf8-only --with-charset=utf8 \\\n  && make \\\n  && make install \\\n  && cd \\\n  # Install IPA dic\n  && curl -SL -o mecab-ipadic-${IPADIC_VERSION}.tar.gz ${ipadic_url} \\\n  && tar zxf mecab-ipadic-${IPADIC_VERSION}.tar.gz \\\n  && cd mecab-ipadic-${IPADIC_VERSION} \\\n  && ./configure --with-charset=utf8 \\\n  && make \\\n  && make install \\\n  && cd \\\n  # Install Neologd\n  && git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git \\\n  && mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y \\\n  && rm -rf \\\n    mecab-${MECAB_VERSION}* \\\n    mecab-${IPADIC_VERSION}* \\\n    mecab-ipadic-neologd\n\n# general\nRUN apk --no-cache add vim \\\nwget \\\nlsof \\\ncurl \\\nbash \\\nswig \\\ngcc \\\nbuild-base \\\nmake \\\npython-dev \\\npy-pip \\\njpeg-dev \\\nzlib-dev \\\ngit \\\nlinux-headers\nENV LIBRARY_PATH=/lib:/usr/lib\n\nENV PLANTUML_VERSION 1.2017.18\nENV PLANTUML_DOWNLOAD_URL https://sourceforge.net/projects/plantuml/files/plantuml.$PLANTUML_VERSION.jar/download\nENV PANDOC_VERSION 1.19.2.4\nENV PANDOC_DOWNLOAD_URL 
https://hackage.haskell.org/package/pandoc-$PANDOC_VERSION/pandoc-$PANDOC_VERSION.tar.gz\nENV PANDOC_ROOT /usr/local/pandoc\n\nENV PATH $PATH:$PANDOC_ROOT/bin\n\n# Create Pandoc build space\nRUN mkdir -p /pandoc-build\nWORKDIR /pandoc-build\n\n# Install/Build Packages\nRUN apk upgrade --update && \\\n    apk add --no-cache --virtual .build-deps $BUILD_DEPS && \\\n    apk add --no-cache --virtual .persistent-deps $PERSISTENT_DEPS && \\\n    curl -fsSL \"$PLANTUML_DOWNLOAD_URL\" -o /usr/local/plantuml.jar && \\\n    apk add --no-cache --virtual .edge-deps $EDGE_DEPS -X http://dl-cdn.alpinelinux.org/alpine/edge/community && \\\n    curl -fsSL \"$PANDOC_DOWNLOAD_URL\" | tar -xzf - && \\\n        ( cd pandoc-$PANDOC_VERSION && cabal update && cabal install --only-dependencies && \\\n        cabal configure --prefix=$PANDOC_ROOT && \\\n        cabal build && \\\n        cabal copy && \\\n        cd .. ) && \\\n    rm -Rf pandoc-$PANDOC_VERSION/ && \\\n    rm -Rf /root/.cabal/ /root/.ghc/ && \\\n    rmdir /pandoc-build && \\\n    set -x; \\\n    addgroup -g 82 -S www-data; \\\n    adduser -u 82 -D -S -G www-data www-data && \\\n    mkdir -p /var/docs && \\\n    apk del .build-deps .edge-deps\n\n# Juman\nRUN wget http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2 \\\n    && tar xvf juman-7.01.tar.bz2 \\\n    && cd juman-7.01 \\\n    && ./configure \\\n    && make \\\n    && make install \\\n    && cd .. \\\n    && rm -rf juman-7.01 \\\n    && rm juman-7.01.tar.bz2\n\n# Juman++\nRUN apk add --update --no-cache --virtual=build-deps \\\n    boost-dev g++ make \\\n    && wget -q http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz \\\n    && tar Jxfv jumanpp-1.02.tar.xz \\\n    && cd jumanpp-1.02/ \\\n    && ./configure \\\n    && make \\\n    && make install \\\n    && cd .. 
\\\n    && rm jumanpp-1.02.tar.xz \\\n    && rm -rf /var/cache/* \\\n    && apk del build-deps \\\n    && apk add --update --no-cache boost\n\n# kytea\nRUN wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz \\\n    && tar -xvf kytea-0.4.7.tar.gz \\\n    && cd kytea-0.4.7 \\\n    && ./configure \\\n    && make \\\n    && make install\n\n# Python\nRUN apk add --no-cache bash wget && \\\n    wget -q --no-check-certificate https://repo.continuum.io/miniconda/$MINICONDA && \\\n    bash $MINICONDA -b -p /opt/conda && \\\n    ln -s /opt/conda/bin/* /usr/local/bin/ && \\\n    rm -rf /root/.[acpw]* $MINICONDA /opt/conda/pkgs/*\n\nRUN conda config --add channels conda-forge --system\nRUN conda create -y -n p27 python=2.7\nRUN conda create -y -n p36 python=3.6\nRUN conda create -y -n p37 python=3.7\n\n#RUN source activate p27\n#RUN source deactivate\n\nCMD [\"/bin/bash\"]"
  },
  {
    "path": "test/Dockerfile-dev",
    "content": "FROM frolvlad/alpine-glibc:alpine-3.6\nMAINTAINER kensuke-mi <kensuke.mit@gmail.com>\n\n# Mecab install\nENV MECAB_VERSION 0.996\nENV IPADIC_VERSION 2.7.0-20070801\nENV mecab_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE\nENV ipadic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM\nENV jumandic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM\nENV unidic_url https://unidic.ninjal.ac.jp/unidic_archive/cwj/2.3.0/unidic-cwj-2.3.0.zip\nENV build_deps 'curl git bash file sudo openssh gcc make build-base'\nENV dependencies 'openssl'\n\nENV PATH=/opt/conda/bin:$PATH \\\n    LANG=C.UTF-8 \\\n    MINICONDA=Miniconda3-latest-Linux-x86_64.sh\n# apk update\nRUN apk update\n\n# mecab\nRUN apk add --update --no-cache ${build_deps} \\\n  # Install dependencies\n  && apk add --update --no-cache ${dependencies} \\\n  # Install MeCab\n  && curl -SL -o mecab-${MECAB_VERSION}.tar.gz ${mecab_url} \\\n  && tar zxf mecab-${MECAB_VERSION}.tar.gz \\\n  && cd mecab-${MECAB_VERSION} \\\n  && ./configure --enable-utf8-only --with-charset=utf8 \\\n  && make \\\n  && make install \\\n  && cd \\\n  # Install IPA dic\n  && curl -SL -o mecab-ipadic-${IPADIC_VERSION}.tar.gz ${ipadic_url} \\\n  && tar zxf mecab-ipadic-${IPADIC_VERSION}.tar.gz \\\n  && cd mecab-ipadic-${IPADIC_VERSION} \\\n  && ./configure --with-charset=utf8 \\\n  && make \\\n  && make install \\\n  && cd \\\n  # Install Neologd\n  && git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git \\\n  && mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y \\\n  # Install jumandic\n  && curl -SL -o jumandic.tar.gz ${jumandic_url} \\\n  && tar zxf jumandic.tar.gz \\\n  && cd mecab-jumandic-7.0-20130310 \\\n  && ./configure --with-charset=utf8 \\\n  && make \\\n  && make install \\\n  # delete dictionary files\n  && cd \\\n  && rm -rf \\\n    mecab-${MECAB_VERSION}* \\\n    mecab-${IPADIC_VERSION}* \\\n    
mecab-ipadic-neologd \\\n    mecab-jumandic-7.0-20130310\n\n# general\nRUN apk --no-cache add vim \\\nwget \\\nlsof \\\ncurl \\\nbash \\\nswig \\\ngcc \\\nbuild-base \\\nmake \\\npython-dev \\\npy-pip \\\njpeg-dev \\\nzlib-dev \\\ngit \\\nlinux-headers\nENV LIBRARY_PATH=/lib:/usr/lib\n\nENV PLANTUML_VERSION 1.2017.18\nENV PLANTUML_DOWNLOAD_URL https://sourceforge.net/projects/plantuml/files/plantuml.$PLANTUML_VERSION.jar/download\nENV PANDOC_VERSION 1.19.2.4\nENV PANDOC_DOWNLOAD_URL https://hackage.haskell.org/package/pandoc-$PANDOC_VERSION/pandoc-$PANDOC_VERSION.tar.gz\nENV PANDOC_ROOT /usr/local/pandoc\n\nENV PATH $PATH:$PANDOC_ROOT/bin\n\n# Create Pandoc build space\nRUN mkdir -p /pandoc-build\nWORKDIR /pandoc-build\n\n# Install/Build Packages\nRUN apk upgrade --update && \\\n    apk add --no-cache --virtual .build-deps $BUILD_DEPS && \\\n    apk add --no-cache --virtual .persistent-deps $PERSISTENT_DEPS && \\\n    curl -fsSL \"$PLANTUML_DOWNLOAD_URL\" -o /usr/local/plantuml.jar && \\\n    apk add --no-cache --virtual .edge-deps $EDGE_DEPS -X http://dl-cdn.alpinelinux.org/alpine/edge/community && \\\n    curl -fsSL \"$PANDOC_DOWNLOAD_URL\" | tar -xzf - && \\\n        ( cd pandoc-$PANDOC_VERSION && cabal update && cabal install --only-dependencies && \\\n        cabal configure --prefix=$PANDOC_ROOT && \\\n        cabal build && \\\n        cabal copy && \\\n        cd .. ) && \\\n    rm -Rf pandoc-$PANDOC_VERSION/ && \\\n    rm -Rf /root/.cabal/ /root/.ghc/ && \\\n    rmdir /pandoc-build && \\\n    set -x; \\\n    addgroup -g 82 -S www-data; \\\n    adduser -u 82 -D -S -G www-data www-data && \\\n    mkdir -p /var/docs && \\\n    apk del .build-deps .edge-deps\n\n# Juman\nRUN wget http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2 \\\n    && tar xvf juman-7.01.tar.bz2 \\\n    && cd juman-7.01 \\\n    && ./configure \\\n    && make \\\n    && make install \\\n    && cd .. 
\\\n    && rm -rf juman-7.01 \\\n    && rm juman-7.01.tar.bz2\n\n# Juman++\nRUN apk add --update --no-cache --virtual=build-deps \\\n    boost-dev g++ make \\\n    && wget -q http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz \\\n    && tar Jxfv jumanpp-1.02.tar.xz \\\n    && cd jumanpp-1.02/ \\\n    && ./configure \\\n    && make \\\n    && make install \\\n    && cd .. \\\n    && rm jumanpp-1.02.tar.xz \\\n    && rm -rf /var/cache/* \\\n    && apk del build-deps \\\n    && apk add --update --no-cache boost\n\n# kytea\nRUN wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz \\\n    && tar -xvf kytea-0.4.7.tar.gz \\\n    && cd kytea-0.4.7 \\\n    && ./configure \\\n    && make \\\n    && make install\n\n# Python\nRUN apk add --no-cache bash wget && \\\n    wget -q --no-check-certificate https://repo.continuum.io/miniconda/$MINICONDA && \\\n    bash $MINICONDA -b -p /opt/conda && \\\n    ln -s /opt/conda/bin/* /usr/local/bin/ && \\\n    rm -rf /root/.[acpw]* $MINICONDA /opt/conda/pkgs/*\n\nRUN conda config --add channels conda-forge --system\nRUN conda create -y -n p27 python=2.7\nRUN conda create -y -n p37 python=3.7\n\nRUN mkdir /code\nRUN mkdir /code/dev\nCOPY requirements_py2.txt /code/dev/requirements_py2.txt\nCOPY requirements_py3.txt /code/dev/requirements_py3.txt\n\nRUN source activate p27 && pip install -r /code/dev/requirements_py2.txt\nRUN source deactivate\n\nRUN source activate p37 && pip install -r /code/dev/requirements_py3.txt\nRUN source deactivate\n\nCMD [\"/bin/bash\"]"
  },
  {
    "path": "test/__init__.py",
    "content": "__author__ = 'kensuke-mi'\n"
  },
  {
    "path": "test/common/__init__.py",
    "content": ""
  },
  {
    "path": "test/common/test_server_handler.py",
    "content": "#! -*- coding: utf-8 -*-\n# test module\nfrom JapaneseTokenizer.common import sever_handler\n# client module\nimport six\nif six.PY2:\n    from JapaneseTokenizer.jumanpp_wrapper.__jumanpp_wrapper_python2 import JumanppWrapper\nelse:\n    from JapaneseTokenizer.jumanpp_wrapper.__jumanpp_wrapper_python3 import JumanppWrapper\n# else\nimport sys\nimport unittest\nimport os\nimport time\n\n__author__ = 'kensuke-mi'\n\n\nclass TestServerHandler(unittest.TestCase):\n    @classmethod\n    def setUpClass(cls):\n        if six.PY3:\n            cls.test_senetence = '紗倉 まな（さくらまな、1993年3月23日 - ）は、日本のAV女優。'\n        else:\n            cls.test_senetence = u'紗倉 まな（さくらまな、1993年3月23日 - ）は、日本のAV女優。'\n\n        cls.jumanpp_command = \"/usr/local/bin/jumanpp\"\n\n\n    def test_jumanpp_process_hanlder_normal(self):\n        \"\"\"It tests jumanpp process handler\"\"\"\n        # normal test #\n        jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command)\n        result_jumanpp_analysis = jumanpp_process_handler.query(input_string=self.test_senetence)\n        self.assertTrue(isinstance(result_jumanpp_analysis,six.text_type))\n        ## stop process ##\n        jumanpp_process_handler.stop_process()\n        ## delete instance ##\n        del jumanpp_process_handler\n\n    def test_jumanpp_process_handler_timeout_exception(self):\n        \"\"\"It tests the case which causes timeout exception\"\"\"\n        with self.assertRaises(Exception) as exc:\n            jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command,\n                                                                   timeout_second=1)\n            result_jumanpp_analysis = jumanpp_process_handler.query(input_string=self.test_senetence*100)\n        exception_message = exc.exception\n        jumanpp_process_handler.stop_process()\n\n    def test_jumanpp_process_handler_init_exception(self):\n        with 
self.assertRaises(Exception) as exc:\n            jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command='hoge',\n                                                                   timeout_second=1)\n        exception_message = exc.exception\n\n    def test_jumanpp_process_handler_huge_request(self):\n        \"\"\"It tests the case where a user sends too much request\"\"\"\n        input_huge_request = [self.test_senetence] * 100\n        jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command)\n        seq_result_jumanpp_analysis = [jumanpp_process_handler.query(input_string=sentence)\n                                       for sentence in input_huge_request]\n        self.assertTrue(isinstance(seq_result_jumanpp_analysis, list))\n\n\nif __name__ == '__main__':\n    unittest.main()"
  },
  {
    "path": "test/docker-compose-dev.yml",
    "content": "# 開発/test環境としてまとめてdocker環境を整えるためのcompose\nversion: '3'\nservices:\n  dev_env_py2:\n    build:\n      context: ./\n      dockerfile: Dockerfile-dev\n    volumes:\n    - ..:/codes/\n    stdin_open: true\n    tty: true\n    command: bash -c \"source /opt/conda/bin/activate p27 && pip install -r requirements_py2.txt\"\n  dev_env_py3:\n    build:\n      context: ./\n      dockerfile: Dockerfile\n    volumes:\n    - ..:/codes/\n    stdin_open: true\n    tty: true\n    command: bash -c \"source /opt/conda/bin/activate p37 && pip install -r requirements_py3.txt\""
  },
  {
    "path": "test/docker-compose.yml",
    "content": "# 開発/test環境としてまとめてdocker環境を整えるためのcompose\nversion: '3'\nservices:\n  test_env:\n    build:\n      context: ./\n      dockerfile: Dockerfile\n    volumes:\n      - ..:/codes/\n    stdin_open: true\n    tty: true\n    command: bash -c \"juman -S && source /opt/conda/bin/activate p37 && cd /codes/ && python setup.py test && source deactivate && echo 'Python3 test done' && source /opt/conda/bin/activate p27 && cd /codes/ && python setup.py test && echo 'Python2 test done'\""
  },
  {
    "path": "test/requirements_py2.txt",
    "content": "pypandoc\nfuture\nsix\njaconv>=0.2\npip>=8.1.0\npexpect\npyknp>=0.4.1\nmecab-python\ntyping\nneologdn\nkytea"
  },
  {
    "path": "test/requirements_py3.txt",
    "content": "pypandoc\nfuture\nsix\njaconv>=0.2\npip>=8.1.0\npexpect\npyknp\nmecab-python3\nneologdn\nkytea"
  },
  {
    "path": "test/resources/test/userdict.csv",
    "content": "さくらまな,-1,-1,-400,名詞,一般,*,*,*,*,さくらまな,*,*,*"
  },
  {
    "path": "test/test_all.py",
    "content": "__author__ = 'kensuke-mi'\n\nimport sys\nimport unittest\nimport six\npython_version = sys.version_info\n\n\ndef suite():\n    suite = unittest.TestSuite()\n    if six.PY3:\n        from .test_filter_python3 import TestFilter\n        from .test_mecab_wrapper_python3 import TestMecabWrapperPython3\n        from .test_kytea_wrapper_python3 import TestKyteaWrapperPython3\n        from .test_juman_wrapper_python3 import TestJumanWrapperPython3\n        suite.addTest(unittest.makeSuite(TestFilter))\n        suite.addTest(unittest.makeSuite(TestKyteaWrapperPython3))\n        suite.addTest(unittest.makeSuite(TestMecabWrapperPython3))\n        suite.addTest(unittest.makeSuite(TestJumanWrapperPython3))\n    elif six.PY2:\n        from .test_filter_python2 import TestFilter\n        from .test_mecab_wrapper_python2 import TestMecabWrapperPython2\n        from .test_juman_wrapper_python2 import TestJumanWrapperPython2\n        from .test_kytea_wrapper_python2 import TestKyteaWrapperPython2\n        suite.addTest(unittest.makeSuite(TestFilter))\n        suite.addTest(unittest.makeSuite(TestKyteaWrapperPython2))\n        suite.addTest(unittest.makeSuite(TestMecabWrapperPython2))\n        suite.addTest(unittest.makeSuite(TestJumanWrapperPython2))\n\n    return suite\n\n\ndef suite_with_jumanpp():\n    suite_obj = suite()\n    if six.PY3:\n        from .test_jumanpp_wrapper_python3 import TestJumanppWrapperPython3\n        suite_obj.addTest(unittest.makeSuite(TestJumanppWrapperPython3))\n    elif six.PY2:\n        from .test_jumanpp_wrapper_python2 import TestJumanppWrapperPython2\n        suite_obj.addTest(unittest.makeSuite(TestJumanppWrapperPython2))\n\n    return suite_obj"
  },
  {
    "path": "test/test_filter_python2.py",
    "content": "#! -*- coding: utf-8 -*-\nimport sys\nimport unittest\nfrom JapaneseTokenizer.mecab_wrapper import MecabWrapper\nfrom JapaneseTokenizer.datamodels import TokenizedSenetence, FilteredObject, TokenizedResult\nimport os\n__author__ = 'kensuke-mi'\n\n\nclass TestFilter(unittest.TestCase):\n    def setUp(self):\n        '''紗倉 まな（さくらまな、１９９３年３月２３日 - ）は、日本のAV女優みたいだ。'''\n        self.test_senetence = u'紗倉 まなは、日本のAV女優みたいで、うつくしい。\\nそこで、ぼくはその１枚のはなやかな作品を見たいと思った。'\n        self.stopword = ['AV']\n        self.pos_condition = [('名詞', '一般',), ('名詞', '固有名詞'), ('形容詞', '自立',), ('助詞', '格助詞', '引用')]\n        self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')\n\n    def test_filtering(self):\n        mecab_obj = MecabWrapper(dictType='ipadic')\n        tokenized_sentence = mecab_obj.tokenize(sentence=self.test_senetence,is_feature=True).\\\n            filter(pos_condition=self.pos_condition, stopwords=self.stopword)\n        assert isinstance(tokenized_sentence, TokenizedSenetence)\n\n        seq_except_pos = [(u'動詞',), (u'名詞', u'代名詞'), (u'名詞', u'接尾')]\n        seq_match_pos = [(u'名詞',), (u'名詞', u'固有名詞',), (u'形容詞',), (u'形容詞', u'自立'),(u'助詞', u'格助詞', u'引用')]\n\n        for token_obj in tokenized_sentence.tokenized_objects:\n            assert isinstance(token_obj, TokenizedResult)\n\n            pos_tuple = token_obj.tuple_pos\n            # 結果に入っているべきではない品詞 #\n            for except_pos in seq_except_pos:\n                self.assertTrue(not set(except_pos).issubset(set(pos_tuple)))\n            # 結果に入っているべき品詞 #\n            bool_any = any(set(match_pos).issubset(set(pos_tuple)) for match_pos in seq_match_pos)\n            self.assertTrue(bool_any)\n\n\nif __name__ == '__main__':\n    unittest.main()"
  },
  {
    "path": "test/test_filter_python3.py",
    "content": "#! -*- coding: utf-8 -*-\nimport sys\nimport unittest\nfrom JapaneseTokenizer.mecab_wrapper import MecabWrapper\nfrom JapaneseTokenizer.datamodels import TokenizedSenetence, FilteredObject, TokenizedResult\nimport os\n__author__ = 'kensuke-mi'\n\n\nclass TestFilter(unittest.TestCase):\n    def setUp(self):\n        '''紗倉 まな（さくらまな、１９９３年３月２３日 - ）は、日本のAV女優みたいだ。'''\n        self.test_senetence = '紗倉 まなは、日本のAV女優みたいで、うつくしい。そこで、ぼくはその１枚のはなやかな作品を見たいと思った。'\n        self.stopword = ['AV', '女優']\n        self.pos_condition = [('名詞', '一般',), ('名詞', '固有名詞'), ('形容詞', '自立',), ('助詞', '格助詞', '引用')]\n        self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')\n\n    def test_filtering(self):\n        mecab_obj = MecabWrapper(dictType='ipadic')\n        tokenized_sentence = mecab_obj.tokenize(sentence=self.test_senetence,is_feature=True).\\\n            filter(pos_condition=self.pos_condition, stopwords=self.stopword)\n        assert isinstance(tokenized_sentence, TokenizedSenetence)\n\n        seq_except_pos = [('動詞',), ('名詞', '代名詞'), ('名詞', '接尾')]\n        seq_match_pos = [('名詞',), ('名詞', '固有名詞',), ('形容詞',), ('形容詞', '自立'),('助詞', '格助詞', '引用')]\n\n        for token_obj in tokenized_sentence.tokenized_objects:\n            assert isinstance(token_obj, TokenizedResult)\n\n            pos_tuple = token_obj.tuple_pos\n            # 結果に入っているべきではない品詞 #\n            for except_pos in seq_except_pos:\n                self.assertTrue(not set(except_pos).issubset(set(pos_tuple)))\n            # 結果に入っているべき品詞 #\n            bool_any = any(set(match_pos).issubset(set(pos_tuple)) for match_pos in seq_match_pos)\n            self.assertTrue(bool_any)\n\n            # stopwordsのチェック\n            self.assertTrue(token_obj.word_stem not in self.stopword)\n\n\n\nif __name__ == '__main__':\n    unittest.main()"
  },
  {
    "path": "test/test_juman_wrapper_python2.py",
    "content": "#-*- encoding: utf-8 -*-\n# this test file does not work under pycharm\n# do your test with command line\nfrom __future__ import absolute_import\nfrom __future__ import division\nfrom future.utils import string_types, text_type\nfrom JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject\nfrom JapaneseTokenizer.juman_wrapper import JumanWrapper\nimport pyknp\nimport unittest\nimport sys\nimport codecs\nimport logging\nsys.stdin = codecs.getreader('utf_8')(sys.stdin)\nsys.stdout = codecs.getwriter('utf_8')(sys.stdout)\nlogger = logging.getLogger(__file__)\nlogger.level = logging.INFO\n\n\nclass TestJumanWrapperPython2(unittest.TestCase):\n    def setUp(self):\n        pass\n\n    def test_juman_wrapper(self):\n        try:\n            from pyknp import Juman\n\n            juman = Juman(command='juman', jumanpp=False)\n            result = juman.analysis(u\"これはペンです。\")\n            logger.debug(','.join(mrph.midasi for mrph in result))\n\n            for mrph in result.mrph_list():\n                assert isinstance(mrph, pyknp.Morpheme)\n                logger.debug(u\"見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s\" \\\n                  % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))\n        except ImportError:\n            logger.debug('skip test_juman_wrapper')\n\n    def test_tokenize(self):\n        \"\"\"This test case checks juman_wrapper.tokenize\n        \"\"\"\n\n        logger.debug (u'Tokenize Test')\n        test_sentence = u\"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        juman_wrapper = JumanWrapper()\n        token_objects = juman_wrapper.tokenize(sentence=test_sentence,\n                                               return_list=False,\n                                               is_feature=True)\n\n        assert isinstance(token_objects, TokenizedSenetence)\n        for t_obj in 
token_objects.tokenized_objects:\n            assert isinstance(t_obj, TokenizedResult)\n            logger.debug(u\"word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}\".format(\n                t_obj.word_surface,\n                t_obj.word_stem,\n                ' '.join(t_obj.tuple_pos),\n                t_obj.misc_info\n            ))\n            assert isinstance(t_obj.word_surface, string_types)\n            assert isinstance(t_obj.word_stem, string_types)\n            assert isinstance(t_obj.tuple_pos, tuple)\n            assert isinstance(t_obj.misc_info, dict)\n\n        token_objects_list = token_objects.convert_list_object()\n        assert isinstance(token_objects_list, list)\n        logger.debug('-'*30)\n        for stem_posTuple in token_objects_list:\n            assert isinstance(stem_posTuple, tuple)\n            word_stem = stem_posTuple[0]\n            word_posTuple = stem_posTuple[1]\n            assert isinstance(word_stem, string_types)\n            assert isinstance(word_posTuple, tuple)\n\n            logger.debug(u'word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))\n\n    def test_filter_pos(self):\n        \"\"\"\n        \"\"\"\n        logger.debug (u'Filtering Test. 
POS condition is only 名詞')\n        test_sentence = u\"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        juman_wrapper = JumanWrapper()\n        token_objects = juman_wrapper.tokenize(sentence=test_sentence,\n                                               return_list=False,\n                                               is_feature=True\n                                               )\n        pos_condition = [(u'名詞', )]\n        filtered_result = juman_wrapper.filter(\n            parsed_sentence=token_objects,\n            pos_condition=pos_condition\n        )\n\n        assert isinstance(filtered_result, FilteredObject)\n        for t_obj in filtered_result.tokenized_objects:\n            assert isinstance(t_obj, TokenizedResult)\n            logger.debug(u\"word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}\".format(\n                t_obj.word_surface,\n                t_obj.word_stem,\n                ' '.join(t_obj.tuple_pos),\n                t_obj.misc_info\n            ))\n            assert isinstance(t_obj.word_surface, string_types)\n            assert isinstance(t_obj.word_stem, string_types)\n            assert isinstance(t_obj.tuple_pos, tuple)\n            assert isinstance(t_obj.misc_info, dict)\n\n            assert t_obj.tuple_pos[0] == u'名詞'\n\n        logger.debug('-'*30)\n        for stem_posTuple in filtered_result.convert_list_object():\n            assert isinstance(stem_posTuple, tuple)\n            word_stem = stem_posTuple[0]\n            word_posTuple = stem_posTuple[1]\n            assert isinstance(word_stem, string_types)\n            assert isinstance(word_posTuple, tuple)\n\n            logger.debug(u'word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))\n\n    def test_stopwords(self):\n        stopword = [u'ＡＶ', u'女優']\n        logger.debug (u'Stopwords Filtering Test. 
Stopwords is {}'.format(u','.join(stopword)))\n        test_sentence = u\"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        juman_wrapper = JumanWrapper()\n        token_objects = juman_wrapper.tokenize(sentence=test_sentence,\n                                               return_list=False,\n                                               is_feature=True\n                                               )\n        filtered_result = juman_wrapper.filter(\n            parsed_sentence=token_objects,\n            stopwords=stopword\n        )\n\n        check_flag = True\n        for stem_posTuple in filtered_result.convert_list_object():\n            assert isinstance(stem_posTuple, tuple)\n            word_stem = stem_posTuple[0]\n            word_posTuple = stem_posTuple[1]\n            assert isinstance(word_stem, string_types)\n            assert isinstance(word_posTuple, tuple)\n\n            logger.debug(u'word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))\n            if word_stem in stopword: check_flag = False\n        assert check_flag\n\n    def test_juman_server_mode(self):\n        ### test with server mode ###\n\n        ### Attention: this method causes Error if you don't start JUMAN SERVER mode ###\n        test_sentence = u\"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        juman_wrapper = JumanWrapper(server='localhost', port=32000)\n        token_objects = juman_wrapper.tokenize(sentence=test_sentence,\n                                               return_list=False,\n                                               is_feature=True)\n        self.assertTrue(isinstance(token_objects, TokenizedSenetence))\n\n\n        list_tokens = juman_wrapper.tokenize(sentence=test_sentence,\n                                               return_list=True,\n                                               is_feature=True)\n        self.assertTrue(isinstance(list_tokens, list))\n\n\n\nif __name__ == '__main__':\n    unittest.main()"
  },
  {
    "path": "test/test_juman_wrapper_python3.py",
    "content": "#-*- encoding: utf-8 -*-\n# this test file does not work under pycharm\n# do your test with command line\nfrom pyknp import Juman\nfrom JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject\nfrom JapaneseTokenizer.juman_wrapper import JumanWrapper\nimport pyknp\nimport unittest\nimport os\nimport logging\nimport socket\nlogger = logging.getLogger(__file__)\nlogger.level = logging.INFO\n\n\nclass TestJumanWrapperPython3(unittest.TestCase):\n    def setUp(self):\n        # this is under MacOSX10\n        self.path_to_juman_command = '/usr/local/bin/juman'\n        if not os.path.exists(self.path_to_juman_command): self.path_to_juman_command = 'juman'\n\n    def test_juman_wrapper(self):\n        try:\n            juman = Juman(command=self.path_to_juman_command)\n            result = juman.analysis(\"これはペンです。\")\n            logger.debug(','.join(mrph.midasi for mrph in result))\n\n            for mrph in result.mrph_list():\n                assert isinstance(mrph, pyknp.Morpheme)\n                logger.debug(\"見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s\" \\\n                      % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))\n        except ImportError:\n            print('skip test_juman_wrapper')\n\n    def test_tokenize(self):\n        \"\"\"This test case checks juman_wrapper.tokenize\n        \"\"\"\n        logger.debug('Tokenize Test')\n        test_sentence = \"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        juman_wrapper = JumanWrapper(command=self.path_to_juman_command)\n        token_objects = juman_wrapper.tokenize(sentence=test_sentence,\n                                               return_list=False,\n                                               is_feature=True)\n\n        assert isinstance(token_objects, TokenizedSenetence)\n        for t_obj in token_objects.tokenized_objects:\n            
assert isinstance(t_obj, TokenizedResult)\n            logger.debug(\"word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}\".format(\n                t_obj.word_surface,\n                t_obj.word_stem,\n                ' '.join(t_obj.tuple_pos),\n                t_obj.misc_info\n            ))\n            assert isinstance(t_obj.word_surface, str)\n            assert isinstance(t_obj.word_stem, str)\n            assert isinstance(t_obj.tuple_pos, tuple)\n            assert isinstance(t_obj.misc_info, dict)\n\n        token_objects_list = token_objects.convert_list_object()\n        assert isinstance(token_objects_list, list)\n        logger.debug('-'*30)\n        for stem_posTuple in token_objects_list:\n            assert isinstance(stem_posTuple, tuple)\n            word_stem = stem_posTuple[0]\n            word_posTuple = stem_posTuple[1]\n            assert isinstance(word_stem, str)\n            assert isinstance(word_posTuple, tuple)\n\n            logger.debug('word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))\n\n    def test_filter_pos(self):\n        \"\"\"POS filteringのテスト\n        \"\"\"\n        logger.debug('Filtering Test. 
POS condition is only 名詞')\n        test_sentence = \"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        juman_wrapper = JumanWrapper(command=self.path_to_juman_command)\n        token_objects = juman_wrapper.tokenize(sentence=test_sentence,\n                                               return_list=False,\n                                               is_feature=True)\n        pos_condition = [('名詞', )]\n        filtered_result = juman_wrapper.filter(\n            parsed_sentence=token_objects,\n            pos_condition=pos_condition\n        )\n\n        assert isinstance(filtered_result, FilteredObject)\n        for t_obj in filtered_result.tokenized_objects:\n            assert isinstance(t_obj, TokenizedResult)\n            logger.debug(\"word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}\".format(\n                t_obj.word_surface,\n                t_obj.word_stem,\n                ' '.join(t_obj.tuple_pos),\n                t_obj.misc_info\n            ))\n            assert isinstance(t_obj.word_surface, str)\n            assert isinstance(t_obj.word_stem, str)\n            assert isinstance(t_obj.tuple_pos, tuple)\n            assert isinstance(t_obj.misc_info, dict)\n\n            assert t_obj.tuple_pos[0] == '名詞'\n\n        logger.debug('-'*30)\n        for stem_posTuple in filtered_result.convert_list_object():\n            assert isinstance(stem_posTuple, tuple)\n            word_stem = stem_posTuple[0]\n            word_posTuple = stem_posTuple[1]\n            assert isinstance(word_stem, str)\n            assert isinstance(word_posTuple, tuple)\n\n            logger.debug('word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))\n\n    def test_stopwords(self):\n        \"\"\"stopword除去のテスト\"\"\"\n        stopword = ['ＡＶ', '女優']\n        logger.debug ('Stopwords Filtering Test. 
Stopwords is {}'.format(','.join(stopword)))\n        test_sentence = \"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        juman_wrapper = JumanWrapper(command=self.path_to_juman_command)\n        token_objects = juman_wrapper.tokenize(sentence=test_sentence,\n                                               return_list=False,\n                                               is_feature=True\n                                               )\n        filtered_result = juman_wrapper.filter(\n            parsed_sentence=token_objects,\n            stopwords=stopword\n        )\n\n        check_flag = True\n        for stem_posTuple in filtered_result.convert_list_object():\n            assert isinstance(stem_posTuple, tuple)\n            word_stem = stem_posTuple[0]\n            word_posTuple = stem_posTuple[1]\n            assert isinstance(word_stem, str)\n            assert isinstance(word_posTuple, tuple)\n\n            logger.debug('word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))\n            if word_stem in stopword: check_flag = False\n        assert check_flag\n\n    def test_juman_severmode(self):\n        \"\"\"* What you can do\n        - juman server modeのテストを実施する\n        \"\"\"\n        logger.debug('Tokenize test with server mode')\n        test_sentence = \"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        # check socket\n        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n        HOST = 'localhost'\n        PORT = 32000\n        try:\n            s.connect((HOST, PORT))\n            s.close()\n        except:\n            logger.warning(\"SKip server mode test because server is not working.\")\n        else:\n            juman_wrapper = JumanWrapper(command=self.path_to_juman_command, server=HOST, port=PORT)\n            token_objects = juman_wrapper.tokenize(sentence=test_sentence,\n                                                   return_list=False,\n                                                   is_feature=True)\n     
       assert isinstance(token_objects, TokenizedSenetence)\n\n            test_sentence = \"ペルシア語（ペルシアご、ペルシア語: فارسی‌‎, پارسی‌; Fārsī, Pārsī）は、イランを中心とする中東地域で話される言語。\"\n            juman_wrapper = JumanWrapper(command=self.path_to_juman_command, server=HOST, port=PORT)\n            list_token = juman_wrapper.tokenize(sentence=test_sentence,\n                                                   return_list=True,\n                                                   is_feature=True)\n            assert isinstance(list_token, list)\n\n\nif __name__ == '__main__':\n    unittest.main()"
  },
  {
    "path": "test/test_jumanpp_wrapper_python2.py",
    "content": "#-*- encoding: utf-8 -*-\n# this test file does not work under pycharm\n# do your test with command line\nfrom pyknp import Juman\nfrom JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject\nfrom JapaneseTokenizer.jumanpp_wrapper.jumanpp_wrapper import JumanppWrapper, JumanppClient\nfrom JapaneseTokenizer.common.sever_handler import JumanppHnadler\nimport pyknp\nimport socket\nimport unittest\nimport os\nimport logging\nlogger = logging.getLogger(__file__)\nlogger.level = logging.INFO\n\n\nclass TestJumanppWrapperPython2(unittest.TestCase):\n    def setUp(self):\n        # this is under MacOSX10\n        self.path_to_juman_command = '/usr/local/bin/jumanpp'\n        if not os.path.exists(self.path_to_juman_command): self.path_to_juman_command = 'jumanpp'\n\n    def test_JumanppClient(self):\n        test_sentence = u'外国人参政権を欲しい。'\n        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n        HOST = 'localhost'\n        PORT = 12000\n        try:\n            s.connect((HOST, PORT))\n            s.close()\n        except:\n            logger.warning(\"SKip server mode test because server is not working.\")\n        else:\n            client_obj = JumanppClient(hostname='localhost', port=12000)\n            res = client_obj.query(sentence=test_sentence, pattern=r'EOS')\n            del res\n\n    def test_jumanpp_servermode(self):\n        ### test with list return object ###\n        test_sentence = u'外国人参政権を欲しい。'\n        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n        HOST = 'localhost'\n        PORT = 12000\n        try:\n            s.connect((HOST, PORT))\n            s.close()\n        except:\n            logger.warning(\"SKip server mode test because server is not working.\")\n        else:\n            jumanpp_tokenizer = JumanppWrapper(server='localhost', port=12000)\n            list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)\n            assert 
isinstance(list_tokens, list)\n\n            ### test with TokenizedSenetence return object ###\n            tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False)\n            assert isinstance(tokenized_obj, TokenizedSenetence)\n\n            ### test with TokenizedSenetence return object and filter by chain expression ###\n            pos_condtion = [('名詞', )]\n            filtered_res = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False).filter(pos_condition=pos_condtion)\n            assert isinstance(filtered_res, FilteredObject)\n            assert isinstance(filtered_res.convert_list_object(), list)\n\n    def test_jumanpp_servermode_stress(self):\n        ### test with severmode with much stress ###\n        test_sentence = u'外国人参政権を欲しい。'\n        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n        HOST = 'localhost'\n        PORT = 12000\n        try:\n            s.connect((HOST, PORT))\n            s.close()\n        except:\n            logger.warning(\"SKip server mode test because server is not working.\")\n        else:\n            jumanpp_tokenizer = JumanppWrapper(server='localhost', port=12000)\n            for i in range(0, 1000):\n                list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)\n                assert isinstance(list_tokens, list)\n                assert u'外国' in test_sentence\n            del jumanpp_tokenizer\n\n\n    def test_jumanpp_localmode_pyexpect(self):\n        test_sentence = u'外国人参政権を欲しい。'\n        jumanpp_tokenizer = JumanppWrapper(command=self.path_to_juman_command, is_use_pyknp=False)\n        self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))\n        list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)\n        assert isinstance(list_tokens, list)\n\n        jumanpp_tokenizer = JumanppWrapper(command=self.path_to_juman_command, is_use_pyknp=False)\n        
self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))\n        tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False)\n        assert isinstance(tokenized_obj, TokenizedSenetence)\n\n    def test_jumanpp_huge_amount_text(self):\n        \"\"\"pexpectを利用した大量テキスト処理 & テキスト処理中のプロセス再起動\"\"\"\n        logger.info('under testing of processing huge amount of text...')\n        seq_test_sentence = [u'外国人参政権を欲しい。'] * 500\n        jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)\n        self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))\n        for i, test_s in enumerate(seq_test_sentence):\n            tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_s)\n            self.assertTrue(isinstance(tokenized_obj, TokenizedSenetence))\n            if not i == 0 and i % 100 == 0:\n                \"\"\"強制的にプロセスを殺して再起動\"\"\"\n                logger.info('It forces stop unix process.')\n                jumanpp_tokenizer.jumanpp_obj.restart_process()\n        else:\n            pass\n\n\nif __name__ == '__main__':\n    unittest.main()"
  },
  {
    "path": "test/test_jumanpp_wrapper_python3.py",
    "content": "#-*- encoding: utf-8 -*-\n# this test file does not work under pycharm\n# do your test with command line\nfrom pyknp import Juman\nfrom JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject\nfrom JapaneseTokenizer.jumanpp_wrapper.jumanpp_wrapper import JumanppWrapper, JumanppClient\nfrom JapaneseTokenizer.common.sever_handler import JumanppHnadler\nimport pyknp\nimport unittest\nimport os\nimport logging\nimport socket\nlogger = logging.getLogger(__file__)\nlogger.level = logging.INFO\n\n\nclass TestJumanppWrapperPython3(unittest.TestCase):\n    def setUp(self):\n        # this is under MacOSX10\n        self.path_to_juman_command = '/usr/local/bin/jumanpp'\n        if not os.path.exists(self.path_to_juman_command): self.path_to_juman_command = 'jumanpp'\n\n    def test_JumanppClient(self):\n        test_sentence = '外国人参政権を欲しい。'\n        # check socket\n        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n        HOST = 'localhost'\n        PORT = 12000\n        try:\n            s.connect((HOST, PORT))\n            s.close()\n        except:\n            logger.warning(\"SKip server mode test because server is not working.\")\n        else:\n            jumanpp_tokenizer = JumanppWrapper(server=HOST, port=PORT)\n            client_obj = JumanppClient(hostname='localhost', port=12000)\n            res = client_obj.query(sentence=test_sentence, pattern=rb'EOS')\n            del res\n\n    def test_jumanpp_servermode(self):\n        ### test with list return object ###\n        test_sentence = '外国人参政権を欲しい。'\n        # check socket\n        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n        HOST = 'localhost'\n        PORT = 12000\n\n        try:\n            s.connect((HOST, PORT))\n            s.close()\n        except:\n            logger.warning(msg='SKip server mode test because server is not working.')\n        else:\n            jumanpp_tokenizer = JumanppWrapper(server=HOST, port=PORT)\n   
         list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)\n            assert isinstance(list_tokens, list)\n\n            ### test with TokenizedSenetence return object ###\n            tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False)\n            assert isinstance(tokenized_obj, TokenizedSenetence)\n\n            ### test with TokenizedSenetence return object and filter by chain expression ###\n            pos_condtion = [('名詞',)]\n            filtered_res = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False).filter(\n                pos_condition=pos_condtion)\n            assert isinstance(filtered_res, FilteredObject)\n            assert isinstance(filtered_res.convert_list_object(), list)\n\n    def test_jumanpp_servermode_stress(self):\n        ### test with severmode with much stress ###\n        test_sentence = '外国人参政権を欲しい。'\n        # check socket\n        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n        HOST = 'localhost'\n        PORT = 12000\n        try:\n            s.connect((HOST, PORT))\n            s.close()\n        except:\n            logger.warning(msg='SKip server mode test because server is not working.')\n        else:\n            jumanpp_tokenizer = JumanppWrapper(server='localhost', port=12000)\n            for i in range(0, 1000):\n                list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)\n                assert isinstance(list_tokens, list)\n                assert '外国' in test_sentence\n            del jumanpp_tokenizer\n\n    def test_jumanpp_localmode_pyexpect(self):\n        \"\"\"pexpectを使ったプロセス呼び出しのテスト\"\"\"\n        test_sentence = '外国人参政権を欲しい。'\n        jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)\n        self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))\n        list_tokens = 
jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)\n        assert isinstance(list_tokens, list)\n\n        jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)\n        self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))\n        tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False)\n        assert isinstance(tokenized_obj, TokenizedSenetence)\n\n    def test_jumanpp_huge_amount_text(self):\n        \"\"\"pexpectを利用した大量テキスト処理 & テキスト処理中のプロセス再起動\"\"\"\n        logger.info('under testing of processing huge amount of text...')\n        seq_test_sentence = ['外国人参政権を欲しい。'] * 500\n        jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)\n        self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))\n        for i, test_s in enumerate(seq_test_sentence):\n            tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_s)\n            self.assertTrue(isinstance(tokenized_obj, TokenizedSenetence))\n            if not i == 0 and i % 100 == 0:\n                \"\"\"強制的にプロセスを殺して再起動\"\"\"\n                logger.info('It forces stop unix process.')\n                jumanpp_tokenizer.jumanpp_obj.restart_process()\n        else:\n            pass\n\n\nif __name__ == '__main__':\n    unittest.main()"
  },
  {
    "path": "test/test_kytea_wrapper_python2.py",
    "content": "# -*- coding: utf-8 -*-\nfrom JapaneseTokenizer.kytea_wrapper import KyteaWrapper\nfrom JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject\nimport unittest\n\nclass TestKyteaWrapperPython2(unittest.TestCase):\n\n    def setUp(self):\n        pass\n\n    def test_tokenization(self):\n        input_sentence = u\"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        kytea_wrapper = KyteaWrapper()\n        tokenized_result = kytea_wrapper.tokenize(\n            sentence=input_sentence,\n            normalize=True,\n            return_list=False,\n            is_feature=True\n        )\n        assert isinstance(tokenized_result, TokenizedSenetence)\n        for t_obj in tokenized_result.tokenized_objects:\n            assert isinstance(t_obj, TokenizedResult)\n\n        print('-'*30)\n        tokenized_result_list = tokenized_result.convert_list_object()\n        assert isinstance(tokenized_result_list, list)\n        for t_obj_tuple in tokenized_result_list:\n            assert isinstance(t_obj_tuple, tuple)\n\n    def test_filter_pos(self):\n        \"\"\"\n        \"\"\"\n        print (u'Filtering Test. 
POS condition is only 名詞')\n        test_sentence = u\"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        kytea_wrapper = KyteaWrapper()\n        tokenized_result = kytea_wrapper.tokenize(\n            sentence=test_sentence,\n            normalize=True,\n            return_list=False,\n            is_feature=True\n        )\n\n        pos_condition = [(u'名詞', )]\n        filtered_result = kytea_wrapper.filter(\n            parsed_sentence=tokenized_result,\n            pos_condition=pos_condition\n        )\n\n        assert isinstance(filtered_result, FilteredObject)\n        for t_obj in filtered_result.tokenized_objects:\n            assert isinstance(t_obj, TokenizedResult)\n            assert isinstance(t_obj.word_surface, unicode)\n            assert isinstance(t_obj.word_stem, unicode)\n            assert isinstance(t_obj.tuple_pos, tuple)\n            assert isinstance(t_obj.misc_info, dict)\n\n            assert t_obj.tuple_pos[0] == u'名詞'\n\n        print('-'*30)\n        for stem_posTuple in filtered_result.convert_list_object():\n            assert isinstance(stem_posTuple, tuple)\n            word_stem = stem_posTuple[0]\n            word_posTuple = stem_posTuple[1]\n            assert isinstance(word_stem, unicode)\n            assert isinstance(word_posTuple, tuple)\n\n    def test_stopwords(self):\n        stopword = [u'女優']\n        print (u'Stopwords Filtering Test. 
Stopwords is {}'.format(u','.join(stopword)))\n        test_sentence = u\"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        kytea_wrapper = KyteaWrapper()\n        token_objects = kytea_wrapper.tokenize(sentence=test_sentence,\n                                               return_list=False,\n                                               is_feature=True\n                                               )\n        filtered_result = kytea_wrapper.filter(\n            parsed_sentence=token_objects,\n            stopwords=stopword\n        )\n\n        check_flag = True\n        print('-'*30)\n        for stem_posTuple in filtered_result.convert_list_object():\n            assert isinstance(stem_posTuple, tuple)\n            word_stem = stem_posTuple[0]\n            word_posTuple = stem_posTuple[1]\n            assert isinstance(word_stem, unicode)\n            assert isinstance(word_posTuple, tuple)\n            if word_stem in stopword:\n                check_flag = False\n        assert check_flag\n\n\nif __name__ == '__main__':\n    unittest.main()"
  },
  {
    "path": "test/test_kytea_wrapper_python3.py",
    "content": "# -*- coding: utf-8 -*-\nfrom JapaneseTokenizer.kytea_wrapper import KyteaWrapper\nfrom JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject\nimport unittest\n\nclass TestKyteaWrapperPython3(unittest.TestCase):\n\n    def setUp(self):\n        pass\n\n    def test_tokenization(self):\n        input_sentence = \"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        kytea_wrapper = KyteaWrapper()\n        tokenized_result = kytea_wrapper.tokenize(\n            sentence=input_sentence,\n            normalize=True,\n            return_list=False,\n            is_feature=True\n        )\n        assert isinstance(tokenized_result, TokenizedSenetence)\n        for t_obj in tokenized_result.tokenized_objects:\n            assert isinstance(t_obj, TokenizedResult)\n\n        #print('-'*30)\n        tokenized_result_list = tokenized_result.convert_list_object()\n        assert isinstance(tokenized_result_list, list)\n        for t_obj_tuple in tokenized_result_list:\n            assert isinstance(t_obj_tuple, tuple)\n\n    def test_filter_pos(self):\n        \"\"\"\n        \"\"\"\n        # 'Filtering Test. 
POS condition is only 名詞')\n        test_sentence = \"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        kytea_wrapper = KyteaWrapper()\n        tokenized_result = kytea_wrapper.tokenize(\n            sentence=test_sentence,\n            normalize=True,\n            return_list=False,\n            is_feature=True\n        )\n        pos_condition = [('名詞', )]\n        filtered_result = kytea_wrapper.filter(\n            parsed_sentence=tokenized_result,\n            pos_condition=pos_condition\n        )\n\n        assert isinstance(filtered_result, FilteredObject)\n        for t_obj in filtered_result.tokenized_objects:\n            assert isinstance(t_obj, TokenizedResult)\n            assert isinstance(t_obj.word_surface, str)\n            assert isinstance(t_obj.word_stem, str)\n            assert isinstance(t_obj.tuple_pos, tuple)\n            assert isinstance(t_obj.misc_info, dict)\n\n            assert t_obj.tuple_pos[0] == '名詞'\n\n        for stem_posTuple in filtered_result.convert_list_object():\n            assert isinstance(stem_posTuple, tuple)\n            word_stem = stem_posTuple[0]\n            word_posTuple = stem_posTuple[1]\n            assert isinstance(word_stem, str)\n            assert isinstance(word_posTuple, tuple)\n\n    def test_stopwords(self):\n        stopword = ['女優']\n        # ('Stopwords Filtering Test. 
Stopwords is {}'.format(','.join(stopword)))\n        test_sentence = \"紗倉 まな（さくら まな、1993年3月23日 - ）は、日本のAV女優。\"\n        kytea_wrapper = KyteaWrapper()\n        token_objects = kytea_wrapper.tokenize(sentence=test_sentence,\n                                               return_list=False,\n                                               is_feature=True\n                                               )\n        filtered_result = kytea_wrapper.filter(\n            parsed_sentence=token_objects,\n            stopwords=stopword\n        )\n\n        check_flag = True\n        for stem_posTuple in filtered_result.convert_list_object():\n            assert isinstance(stem_posTuple, tuple)\n            word_stem = stem_posTuple[0]\n            word_posTuple = stem_posTuple[1]\n            assert isinstance(word_stem, str)\n            assert isinstance(word_posTuple, tuple)\n            if word_stem in stopword: check_flag = False\n        assert check_flag\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "test/test_mecab_wrapper_python2.py",
    "content": "#! -*- coding: utf-8 -*-\n__author__ = 'kensuke-mi'\n\nimport sys\nimport unittest\nfrom JapaneseTokenizer.mecab_wrapper.mecab_wrapper import MecabWrapper\nfrom JapaneseTokenizer.datamodels import TokenizedSenetence\nfrom six import string_types\nimport os\npython_version = sys.version_info\n\n\nclass TestMecabWrapperPython2(unittest.TestCase):\n    def setUp(self):\n        self.test_senetence = u'紗倉 まな（さくらまな、1993年3月23日 - ）は、日本のAV女優。'\n        self.test_sentence2 = u'午前零時。午前3時。3時。'\n        self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')\n\n    def test_neologd_parse(self):\n        \"\"\"* Test case\n        - neologd辞書で正しく分割できることを確認する\n        \"\"\"\n        mecab_obj = MecabWrapper(dictType='neologd')\n        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence)\n        self.assertTrue(parsed_obj, TokenizedSenetence)\n        self.assertTrue(isinstance(parsed_obj.convert_list_object(), list))\n        self.assertTrue(all(isinstance(mrph, string_types) for mrph in parsed_obj.convert_list_object()))\n\n        parsed_obj = mecab_obj.tokenize(sentence=self.test_sentence2)\n        self.assertTrue(parsed_obj, TokenizedSenetence)\n        self.assertTrue(isinstance(parsed_obj.convert_list_object(), list))\n        self.assertTrue(all(isinstance(mrph, string_types) for mrph in parsed_obj.convert_list_object()))\n\n    def test_default_parse(self):\n        \"\"\"* Test case\n        - デフォルトの状態で動作を確認する\n        \"\"\"\n        dictType = \"ipadic\"\n        mecab_obj = MecabWrapper(dictType=dictType)\n        assert isinstance(mecab_obj, MecabWrapper)\n        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)\n        assert isinstance(parsed_obj, list)\n        if python_version >= (3, 0, 0):\n            for morph in parsed_obj:\n                assert isinstance(morph, str)\n        else:\n            for morph in parsed_obj:\n                assert 
isinstance(morph, string_types)\n\n    def test_init_userdict(self):\n        # test when user dictionary is called\n        mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict)\n        assert isinstance(mecab_obj, MecabWrapper)\n        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)\n        is_ok = False\n        for morph in parsed_obj:\n            if u'さくらまな' == morph:\n                is_ok = True\n        else:\n            pass\n        assert is_ok\n\n    def test_parse_jumandic(self):\n        with self.assertRaises(Exception):\n            mecab_obj = MecabWrapper(dictType='jumandic')\n            assert isinstance(mecab_obj, MecabWrapper)\n\n    def test_init_alldict(self):\n        \"\"\"* Test case\n        - すべての辞書を利用した場合の動作を確認する\n        \"\"\"\n        with self.assertRaises(Exception):\n            mecab_obj = MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict)\n            assert isinstance(mecab_obj, MecabWrapper)\n\n\nif __name__ == '__main__':\n    unittest.main()\n"
  },
  {
    "path": "test/test_mecab_wrapper_python3.py",
    "content": "#! -*- coding: utf-8 -*-\n__author__ = 'kensuke-mi'\n\nimport sys\nimport unittest\nfrom JapaneseTokenizer.mecab_wrapper.mecab_wrapper import MecabWrapper\nfrom JapaneseTokenizer.datamodels import TokenizedSenetence\nimport os\npython_version = sys.version_info\n\n\nclass TestMecabWrapperPython3(unittest.TestCase):\n    def setUp(self):\n        self.test_senetence = '紗倉 まな（さくらまな、1993年3月23日 - ）は、日本のAV女優。'\n        self.test_sentence2 = '午前零時。午前3時。3時。'\n        self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')\n\n    def test_neologd_parse(self):\n        # test using neologd dictionary\n        mecab_obj = MecabWrapper(dictType='neologd')\n        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence)\n        self.assertTrue(parsed_obj, TokenizedSenetence)\n        self.assertTrue(isinstance(parsed_obj.convert_list_object(), list))\n        self.assertTrue(all(isinstance(mrph, str) for mrph in parsed_obj.convert_list_object()))\n\n        parsed_obj = mecab_obj.tokenize(sentence=self.test_sentence2)\n        self.assertTrue(parsed_obj, TokenizedSenetence)\n        self.assertTrue(isinstance(parsed_obj.convert_list_object(), list))\n        self.assertTrue(all(isinstance(mrph, str) for mrph in parsed_obj.convert_list_object()))\n\n    def test_default_parse(self):\n        # test default status\n        dictType = \"ipadic\"\n        mecab_obj = MecabWrapper(dictType=dictType)\n        assert isinstance(mecab_obj, MecabWrapper)\n        \n        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)\n        assert isinstance(parsed_obj, list)\n        for morph in parsed_obj:\n            assert isinstance(morph, str)\n\n        parsed_obj = mecab_obj.tokenize(sentence=self.test_sentence2, return_list=True)\n        assert isinstance(parsed_obj, list)\n        for morph in parsed_obj:\n            assert isinstance(morph, str)\n\n    def test_parse_jumandic(self):\n        
mecab_obj = MecabWrapper(dictType='jumandic')\n        assert isinstance(mecab_obj, MecabWrapper)\n\n        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)\n        assert isinstance(parsed_obj, TokenizedSenetence)\n        for tokenized_obj in parsed_obj.tokenized_objects:\n            if tokenized_obj.word_stem == '女優':\n                # ドメイン:文化・芸術 is special output only in Jumandic\n                assert 'ドメイン:文化・芸術' in tokenized_obj.analyzed_line\n\n    def test_parse_userdic(self):\n        pass\n\n    def test_parse_dictionary_path(self):\n        # put path to dictionary and parse sentence.\n        path_default_ipadic = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd'\n        if os.path.exists(path_default_ipadic):\n            mecab_obj = MecabWrapper(dictType=None, path_dictionary=path_default_ipadic)\n            assert mecab_obj._path_dictionary == path_default_ipadic\n            parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)\n            assert isinstance(parsed_obj, TokenizedSenetence)\n\n    def test_init_userdict(self):\n        # with the user dictionary loaded, a token whose stem is 'さくらまな' must be found.\n        mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict)\n        assert isinstance(mecab_obj, MecabWrapper)\n        parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)\n        assert isinstance(parsed_obj, TokenizedSenetence)\n        is_ok = False\n        for tokenized_obj in parsed_obj.tokenized_objects:\n            if tokenized_obj.word_stem == 'さくらまな':\n                is_ok = True\n        assert is_ok\n\n\nif __name__ == '__main__':\n    unittest.main()\n\n\n\n"
  },
  {
    "path": "travis-mecab-install.sh",
    "content": "#!/bin/bash\n# from https://gist.github.com/dtan4/351d031bec0c3d45cd8f\n# see also http://qiita.com/dtan4/items/c6a087666296fbd5fffb\n\nbase_dir=`pwd`\n\nwget -O mecab-0.996.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE'\ntar zxfv mecab-0.996.tar.gz\ncd mecab-0.996\n./configure --enable-utf8-only\nmake\nmake check\nsudo make install\nsudo ldconfig\n\ncd $base_dir\n\nwget -O mecab-ipadic-2.7.0-20070801.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM'\ntar zxfv mecab-ipadic-2.7.0-20070801.tar.gz\ncd mecab-ipadic-2.7.0-20070801\n./configure --with-charset=utf8\nmake\nsudo make install\nsudo ldconfig\n\nwget -O jumandic.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM'\ntar zxfv jumandic.tar.gz\ncd mecab-jumandic-7.0-20130310\n./configure --with-charset=utf8\nmake\nsudo make install\nsudo ldconfig\n\ncd $base_dir\nrm -rf mecab-0.996 mecab-ipadic-2.7.0-20070801 mecab-jumandic-7.0-20130310"
  }
]