Repository: Kensuke-Mitsuzawa/JapaneseTokenizers Branch: master Commit: 3bdfb6be73de Files: 51 Total size: 140.3 KB Directory structure: gitextract_jdhxzz3y/ ├── .gitignore ├── .travis.yml ├── JapaneseTokenizer/ │ ├── __init__.py │ ├── common/ │ │ ├── __init__.py │ │ ├── juman_utils.py │ │ ├── sever_handler.py │ │ ├── text_preprocess.py │ │ └── timeout_handler.py │ ├── datamodels.py │ ├── init_logger.py │ ├── juman_wrapper/ │ │ ├── __init__.py │ │ └── juman_wrapper.py │ ├── jumanpp_wrapper/ │ │ ├── __init__.py │ │ └── jumanpp_wrapper.py │ ├── kytea_wrapper/ │ │ ├── __init__.py │ │ └── kytea_wrapper.py │ ├── mecab_wrapper/ │ │ ├── __init__.py │ │ └── mecab_wrapper.py │ └── object_models.py ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── README.md ├── examples/ │ ├── examples.py │ ├── userdict.csv │ └── userdict.dict ├── install_tokenizers.sh ├── setup.py ├── test/ │ ├── Dockerfile │ ├── Dockerfile-dev │ ├── __init__.py │ ├── common/ │ │ ├── __init__.py │ │ └── test_server_handler.py │ ├── docker-compose-dev.yml │ ├── docker-compose.yml │ ├── requirements_py2.txt │ ├── requirements_py3.txt │ ├── resources/ │ │ └── test/ │ │ ├── userdict.csv │ │ └── userdict.dict │ ├── test_all.py │ ├── test_filter_python2.py │ ├── test_filter_python3.py │ ├── test_juman_wrapper_python2.py │ ├── test_juman_wrapper_python3.py │ ├── test_jumanpp_wrapper_python2.py │ ├── test_jumanpp_wrapper_python3.py │ ├── test_kytea_wrapper_python2.py │ ├── test_kytea_wrapper_python3.py │ ├── test_mecab_wrapper_python2.py │ └── test_mecab_wrapper_python3.py └── travis-mecab-install.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ .idea/ JapaneseTokenizer.egg-info/ build/ dist/ *eggs/ pyknp.egg-info/ .python-version *pyc morphogySplitters/ Mykytea-python/ .DS_Store *tox .cache/ python/ python2/ ================================================ FILE: .travis.yml ================================================ language: python python: - 2.7 - 3.5 addons: apt: packages: - git - make - curl - xz-utils - file - pandoc - libboost-all-dev - language-pack-ja-base - language-pack-ja - ibus-mozc - gcc-5 - g++-5 - build-essential - swig sources: - ubuntu-toolchain-r-test before_install: - sudo apt-get update -qq - sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1 - sudo update-locale LANG=ja_JP.UTF-8 LANGUAGE="ja_JP:ja" - mkdir ./target - export CC="gcc-5" - export CXX="g++-5" - export CFLAGS=-std=c++11 - export CXXFLAGS=-std=c++11 - sudo bash travis-mecab-install.sh - which mecab-config - sudo make install - git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git - cd mecab-ipadic-neologd && echo yes | sudo ./bin/install-mecab-ipadic-neologd && cd ../ - sudo juman -S install: - python --version - python setup.py install - pip install coveralls coverage nose script: - coverage run --source=JapaneseTokenizer setup.py test after_success: - coveralls notifications: email: recipients: - kensuke.mit@gmail.com on_success: always on_failure: always ================================================ FILE: JapaneseTokenizer/__init__.py ================================================ from JapaneseTokenizer.mecab_wrapper import MecabWrapper from JapaneseTokenizer.juman_wrapper import JumanWrapper from JapaneseTokenizer.datamodels import TokenizedSenetence from JapaneseTokenizer.datamodels import FilteredObject from JapaneseTokenizer.kytea_wrapper import KyteaWrapper from JapaneseTokenizer.jumanpp_wrapper import JumanppWrapper ================================================ FILE: JapaneseTokenizer/common/__init__.py ================================================ __author__ = 'kensuke-mi' ================================================ FILE: JapaneseTokenizer/common/juman_utils.py ================================================ from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence from typing import Tuple import pyknp from six import text_type """These functions are for utilization of Juman""" def extract_morphological_information(mrph_object, is_feature, is_surface): # type: (pyknp.Morpheme, bool, bool) -> TokenizedResult """This method extracts morphlogical information from token object. """ assert isinstance(mrph_object, pyknp.Morpheme) assert isinstance(is_feature, bool) assert isinstance(is_surface, bool) surface = mrph_object.midasi word_stem = mrph_object.genkei tuple_pos = (mrph_object.hinsi, mrph_object.bunrui) misc_info = { 'katuyou1': mrph_object.katuyou1, 'katuyou2': mrph_object.katuyou2, 'imis': mrph_object.imis, 'repname': mrph_object.repname } token_object = TokenizedResult( node_obj=None, tuple_pos=tuple_pos, word_stem=word_stem, word_surface=surface, is_feature=is_feature, is_surface=is_surface, misc_info=misc_info ) return token_object def feature_parser(uni_feature, word_surface): # type: (text_type, text_type) -> Tuple[Tuple[text_type, text_type, text_type], text_type] """ Parse the POS feature output by Mecab :param uni_feature unicode: :return ( (pos1, pos2, pos3), word_stem ): """ list_feature_items = uni_feature.split(',') # if word has no feature at all if len(list_feature_items) == 1: return ('*'), ('*') pos1 = list_feature_items[0] pos2 = list_feature_items[1] pos3 = list_feature_items[2] tuple_pos = (pos1, pos2, pos3) # if without constraint(output is normal mecab dictionary like) if len(list_feature_items) == 9: word_stem = list_feature_items[6] # if with constraint(output format depends on Usedict.txt) else: word_stem = word_surface return tuple_pos, word_stem ================================================ FILE: JapaneseTokenizer/common/sever_handler.py ================================================ #! -*- coding: utf-8 -*- import subprocess from subprocess import Popen, PIPE, STDOUT import multiprocessing # socket object import socket # logger from JapaneseTokenizer import init_logger import logging # typing from typing import Union # else from six import text_type import six import pexpect import shutil import signal import os logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) class ProcessDownException(Exception): pass class UnixProcessHandler(object): def __init__(self, command, option=None, pattern='EOS', timeout_second=10): # type: (text_type,text_type,text_type,int)->None """* Get communication with unix process using pexpect module.""" self.command = command self.timeout_second = timeout_second self.pattern = pattern self.option = option self.launch_process(command) def __del__(self): if hasattr(self, "process_analyzer"): self.process_analyzer.kill(sig=9) def launch_process(self, command): # type: (Union[bytes,text_type])->None """* What you can do - It starts process and keep it. """ if not self.option is None: command_plus_option = self.command + " " + self.option else: command_plus_option = self.command if six.PY3: if shutil.which(command) is None: raise Exception("No command at {}".format(command)) else: self.process_analyzer = pexpect.spawnu(command_plus_option) self.process_id = self.process_analyzer.pid else: doc_command_string = "echo '' | {}".format(command) command_check = os.system(doc_command_string) if not command_check == 0: raise Exception("No command at {}".format(command)) else: self.process_analyzer = pexpect.spawnu(command_plus_option) self.process_id = self.process_analyzer.pid def restart_process(self): # type: ()->None if not self.option is None: command_plus_option = self.command + " " + self.option else: command_plus_option = self.command self.process_analyzer.kill(sig=9) self.process_analyzer = pexpect.spawnu(command_plus_option) self.process_id = self.process_analyzer.pid def stop_process(self): # type: ()->bool """* What you can do - You're able to stop the process which this instance has now. """ if hasattr(self, "process_analyzer"): self.process_analyzer.kill(sig=9) else: pass return True def __query(self, input_string): # type: (text_type)->text_type """* What you can do - It takes the result of Juman++ - This function monitors time which takes for getting the result. """ signal.signal(signal.SIGALRM, self.__notify_handler) signal.alarm(self.timeout_second) self.process_analyzer.sendline(input_string) buffer = "" while True: line_string = self.process_analyzer.readline() # type: text_type if line_string.strip() == input_string: """Skip if process returns the same input string""" continue elif line_string.strip() == self.pattern: buffer += line_string signal.alarm(0) return buffer else: buffer += line_string def __notify_handler(self, signum, frame): raise ProcessDownException("""It takes longer time than {time} seconds. You're able to try, 1. Change your setting of 'timeout_second' parameter 2. Run restart_process() method when the exception happens.""".format(**{"time": self.timeout_second})) def query(self, input_string): # type: (text_type)->text_type return self.__query(input_string=input_string) class JumanppHnadler(UnixProcessHandler): def __init__(self, jumanpp_command, option = None, pattern = 'EOS', timeout_second = 10): # type: (text_type,text_type,text_type,int)->None super(JumanppHnadler, self).__init__(command=jumanpp_command, option=option, pattern=pattern, timeout_second=timeout_second) def launch_jumanpp_process(self, command): # type: (text_type)->None return self.launch_process(command) ================================================ FILE: JapaneseTokenizer/common/text_preprocess.py ================================================ # -*- coding: utf-8 -*- from __future__ import absolute_import from __future__ import print_function from __future__ import unicode_literals from __future__ import division from six import text_type import jaconv import six import re import unicodedata from JapaneseTokenizer import init_logger import logging logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) __author__ = 'kensuke-mi' if six.PY2: def u(str): return str.decode("utf-8") def b(str): return str pass else: # python3 def u(str): return str def b(str): return str.encode("utf-8") pass try: import neologdn is_neologdn_valid = True except: logger.warning("neologdn package is not installed yet. You could not call neologd dictionary.") is_neologdn_valid = False STRING_EXCEPTION = set([u('*')]) def denormalize_text(input_text): # type: (text_type)->text_type """* What you can do - It converts text into standard japanese writing way * Note - hankaku-katakana is to zenkaku-katakana - zenkaku-eisu is to hankaku-eisu """ if input_text in STRING_EXCEPTION: return input_text else: return jaconv.z2h(input_text, kana=False, ascii=True, digit=True) def normalize_text(input_text, dictionary_mode='ipadic', new_line_replaced='。', is_replace_eos=True, is_kana=True, is_ascii=True, is_digit=True): # type: (text_type,text_type,text_type,bool,bool,bool,bool)->text_type """* What you can do - It converts input-text into normalized-text which is good for tokenizer input. * Params - new_line_replaced: a string which replaces from \n string. """ if is_replace_eos: without_new_line = input_text.replace('\n', new_line_replaced) else: without_new_line = new_line_replaced if dictionary_mode=='neologd' and is_neologdn_valid: return neologdn.normalize(normalize_text_normal_ipadic(without_new_line)) elif dictionary_mode=='neologd' and is_neologdn_valid == False: raise Exception("You could not call neologd dictionary bacause you do NOT install the package neologdn.") else: return normalize_text_normal_ipadic(without_new_line, kana=is_kana, ascii=is_ascii, digit=is_digit) def normalize_text_normal_ipadic(input_text, kana=True, ascii=True, digit=True): # type: (text_type,bool,bool,bool)->text_type """ * All hankaku Katanaka is converted into Zenkaku Katakana * All hankaku English alphabet and numberc string are converted into Zenkaku one """ return jaconv.h2z(input_text, kana=kana, ascii=ascii, digit=digit) ================================================ FILE: JapaneseTokenizer/common/timeout_handler.py ================================================ #! -*- coding: utf-8 -*- from functools import wraps class TimeoutException(Exception): pass def handler_func(msg): raise TimeoutException() def on_timeout(limit, handler=handler_func, hint=None): """ 指定した実行時間に終了しなかった場合、handlerをhint/limitを引数にして呼び出します @on_timeout(limit=3600, handler=notify_func, hint=u'長い計算') def long_time_function(): """ def notify_handler(signum, frame): handler("'%s' is not finished in %d second(s)." % (hint, limit)) def __decorator(function): def __wrapper(*args, **kwargs): import signal signal.signal(signal.SIGALRM, notify_handler) signal.alarm(limit) result = function(*args, **kwargs) signal.alarm(0) return result return wraps(function)(__wrapper) return __decorator ================================================ FILE: JapaneseTokenizer/datamodels.py ================================================ #! -*- coding: utf-8 -*- # normalize module # from JapaneseTokenizer.common.text_preprocess import normalize_text, denormalize_text # datemodels # from MeCab import Node # typing # from typing import List, Union, Any, Tuple, Dict, Callable, Optional from future.utils import text_type, string_types import sys import six __author__ = 'kensuke-mi' python_version = sys.version_info def __is_sotpwords(token, stopwords): """This function filters out stopwords. If token is in stopwords list, return True; else return False """ if token in stopwords: return True else: return False def __is_valid_pos(pos_tuple, valid_pos): # type: (Tuple[text_type,...],List[Tuple[text_type,...]])->bool """This function checks token's pos is with in POS set that user specified. If token meets all conditions, Return True; else return False """ def is_valid_pos(valid_pos_tuple): # type: (Tuple[text_type,...])->bool length_valid_pos_tuple = len(valid_pos_tuple) if valid_pos_tuple == pos_tuple[:length_valid_pos_tuple]: return True else: return False seq_bool_flags = [is_valid_pos(valid_pos_tuple) for valid_pos_tuple in valid_pos] if True in set(seq_bool_flags): return True else: return False def filter_words(tokenized_obj, valid_pos, stopwords, check_field_name='stem'): # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type],text_type) -> FilteredObject """This function filter token that user don't want to take. Condition is stopword and pos. * Input - valid_pos - List of Tuple which has POS element to keep. - Keep in your mind, each tokenizer has different POS structure. >>> [('名詞', '固有名詞'), ('動詞', )] - stopwords - List of str, which you'd like to remove >>> ['残念', '今日'] """ assert isinstance(tokenized_obj, TokenizedSenetence) assert isinstance(valid_pos, list) assert isinstance(stopwords, list) filtered_tokens = [] for token_obj in tokenized_obj.tokenized_objects: assert isinstance(token_obj, TokenizedResult) if check_field_name=='stem': res_stopwords = __is_sotpwords(token_obj.word_stem, stopwords) else: res_stopwords = __is_sotpwords(token_obj.word_surface, stopwords) res_pos_condition = __is_valid_pos(token_obj.tuple_pos, valid_pos) # case1: only pos filtering is ON if valid_pos != [] and stopwords == []: if res_pos_condition: filtered_tokens.append(token_obj) # case2: only stopwords filtering is ON if valid_pos == [] and stopwords != []: if res_stopwords is False: filtered_tokens.append(token_obj) # case3: both condition is ON if valid_pos != [] and stopwords != []: if res_stopwords is False and res_pos_condition: filtered_tokens.append(token_obj) filtered_object = FilteredObject( sentence=tokenized_obj.sentence, tokenized_objects=filtered_tokens, pos_condition=valid_pos, stopwords=stopwords ) return filtered_object class TokenizedResult(object): def __init__(self, node_obj, tuple_pos, word_stem, word_surface, is_feature=True, is_surface=False, misc_info=None, analyzed_line=None): # type: (Optional[Node], Tuple[text_type, ...], str, str, bool, bool, Optional[Dict[str, Any]], str)->None assert isinstance(node_obj, (Node, type(None))) assert isinstance(tuple_pos, (string_types, tuple)) assert isinstance(word_stem, (string_types)) assert isinstance(word_surface, text_type) assert isinstance(misc_info, (type(None), dict)) self.node_obj = node_obj self.word_stem = word_stem self.word_surface = word_surface self.is_surface = is_surface self.is_feature = is_feature self.misc_info = misc_info self.analyzed_line = analyzed_line if isinstance(tuple_pos, tuple): self.tuple_pos = tuple_pos elif isinstance(tuple_pos, string_types): self.tuple_pos = ('*', ) else: raise Exception('Error while parsing feature object. {}'.format(tuple_pos)) class TokenizedSenetence(object): def __init__(self, sentence, tokenized_objects, string_encoding='utf-8'): # type: (text_type, List[TokenizedResult], text_type)->None """* Parameters - sentence: sentence - tokenized_objects: list of TokenizedResult object - string_encoding: Encoding type of string type. This option is used only under python2.x """ assert isinstance(sentence, text_type) assert isinstance(tokenized_objects, list) self.sentence = sentence self.tokenized_objects = tokenized_objects self.string_encoding = string_encoding def __extend_token_object(self, token_object, is_denormalize=True, func_denormalizer=denormalize_text): # type: (TokenizedResult,bool,Callable[[str],str])->Tuple """This method creates dict object from token object. """ assert isinstance(token_object, TokenizedResult) if is_denormalize: if token_object.is_feature == True: if token_object.is_surface == True: token = (func_denormalizer(token_object.word_surface), token_object.tuple_pos) else: token = (func_denormalizer(token_object.word_stem), token_object.tuple_pos) else: if token_object.is_surface == True: token = func_denormalizer(token_object.word_surface) else: token = func_denormalizer(token_object.word_stem) else: if token_object.is_feature == True: if token_object.is_surface == True: token = (token_object.word_surface, token_object.tuple_pos) else: token = (token_object.word_stem, token_object.tuple_pos) else: if token_object.is_surface == True: token = token_object.word_surface else: token = token_object.word_stem return token def convert_list_object(self, is_denormalize=True, func_denormalizer=denormalize_text): # type: (bool,Callable[[str],str])->List[Union[str, Tuple[str,...]]] """* What you can do - You extract string object from TokenizedResult object * Args - is_denormalize: boolen object. True; it makes denormalize string - func_denormalizer: callable object. de-normalization function. """ sentence_in_list_obj = [ self.__extend_token_object(token_object,is_denormalize,func_denormalizer) for token_object in self.tokenized_objects ] return sentence_in_list_obj def __convert_string_type(self, p_c_tuple): # type: (Tuple[text_type,...])->Tuple[text_type] """* What you can do - it normalizes string types into str """ if not isinstance(p_c_tuple, tuple): raise Exception('Pos condition expects tuple of string. However = {}'.format(p_c_tuple)) converted = [text_type] * len(p_c_tuple) for i, pos_element in enumerate(p_c_tuple): if six.PY2 and isinstance(pos_element, str): """str into unicode if python2.x""" converted[i] = pos_element.decode(self.string_encoding) elif six.PY2 and isinstance(pos_element, text_type): converted[i] = pos_element elif six.PY3: converted[i] = pos_element else: raise Exception() return tuple(converted) def __check_pos_condition(self, pos_condistion): # type: (List[Tuple[text_type, ...]])->List[Tuple[text_type, ...]] """* What you can do - Check your pos condition - It converts character type into unicode if python version is 2.x """ assert isinstance(pos_condistion, list) return [self.__convert_string_type(p_c_tuple) for p_c_tuple in pos_condistion] def filter(self, pos_condition=None, stopwords=None, is_normalize=True, func_normalizer=normalize_text, check_field_name='stem'): # type: (List[Tuple[text_type,...]], List[text_type], bool, Callable[[text_type], text_type],text_type)->FilteredObject """* What you can do - It filters out token which does NOT meet the conditions (stopwords & part-of-speech tag) - Under python2.x, pos_condition & stopwords are converted into unicode type. * Parameters - pos_condition: list of part-of-speech(pos) condition. The pos condition is tuple is variable length. You can specify hierarchical structure of pos condition with variable tuple. The hierarchy of pos condition follows definition of dictionary. - For example, in mecab you can take words with 名詞 if ('名詞',) - For example, in mecab you can take words with 名詞-固有名詞 if ('名詞', '固有名詞') - stopwords: list of word which you would like to remove - is_normalize: Boolean flag for normalize stopwords. - func_normalizer: Function object for normalization. The function object must be the same one as when you use tokenize. - check_field_name: Put field name to check if stopword or NOT. Kytea does not have stem form of word, put 'surface' instead. * Example >>> pos_condition = [('名詞', '一般'), ('形容詞', '自立'), ('助詞', '格助詞', '一般')] >>> stopwords = ['これ', 'それ'] """ assert isinstance(pos_condition, (type(None), list)) assert isinstance(stopwords, (type(None), list)) if stopwords is None: s_words = [] elif six.PY2 and all((isinstance(s, str) for s in stopwords)): """under python2.x, from str into unicode""" if is_normalize: s_words = [func_normalizer(s.decode(self.string_encoding)) for s in stopwords] else: s_words = [s.decode(self.string_encoding) for s in stopwords] else: if is_normalize: s_words = [func_normalizer(s) for s in stopwords] else: s_words = stopwords if pos_condition is None: p_condition = [] else: p_condition = self.__check_pos_condition(pos_condition) filtered_object = filter_words( tokenized_obj=self, valid_pos=p_condition, stopwords=s_words, check_field_name=check_field_name ) assert isinstance(filtered_object, FilteredObject) return filtered_object class FilteredObject(TokenizedSenetence): def __init__(self, sentence, tokenized_objects, pos_condition, stopwords): # type: (str, List[TokenizedResult], List[str, ...], List[str])->None super(FilteredObject, self).__init__( sentence=sentence, tokenized_objects=tokenized_objects ) self.pos_condition=pos_condition self.stopwords=stopwords ================================================ FILE: JapaneseTokenizer/init_logger.py ================================================ LOGGER_NAME = 'JapaneseTokenizer' import logging import sys from logging import getLogger, Formatter, Logger, StreamHandler # Formatter custmoFormatter = Formatter( fmt='[%(asctime)s]%(levelname)s - %(filename)s#%(funcName)s:%(lineno)d: %(message)s', datefmt='Y/%m/%d %H:%M:%S' ) # StreamHandler STREAM_LEVEL = logging.DEBUG STREAM_FORMATTER = custmoFormatter STREAM = sys.stderr st_handler = StreamHandler(stream=STREAM) st_handler.setLevel(STREAM_LEVEL) st_handler.setFormatter(STREAM_FORMATTER) def init_logger(logger): # type: (logging.Logger) -> logging.Logger logger.addHandler(st_handler) logger.propagate = False return logger ================================================ FILE: JapaneseTokenizer/juman_wrapper/__init__.py ================================================ __author__ = 'kensuke-mi' from .juman_wrapper import JumanWrapper ================================================ FILE: JapaneseTokenizer/juman_wrapper/juman_wrapper.py ================================================ # -*- coding: utf-8 -*- # package module from JapaneseTokenizer.object_models import WrapperBase from JapaneseTokenizer.common import text_preprocess from JapaneseTokenizer.datamodels import FilteredObject, TokenizedResult, TokenizedSenetence from JapaneseTokenizer import init_logger from JapaneseTokenizer.common.sever_handler import JumanppHnadler # else from typing import List, Union, Callable, Tuple from six import text_type from pyknp import MList import logging import sys import os import six logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) __author__ = 'kensuke-mi' python_version = sys.version_info try: import pyknp except ImportError: logger.warning(msg='pyknp is not ready to use. Install first if you would like to use pyknp wrapper.') if six.PY3: import socket import re class MonkeyPatchSocket(object): """* Class for overwriting pyknp.Socket because it is only for python2.x""" def __init__(self, hostname, port, option=None): try: self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.sock.connect((hostname, port)) except: raise if option is not None: self.sock.send(option) data = b"" while b"OK" not in data: # while isinstance(data, bytes) and b"OK" not in data: data = self.sock.recv(1024) def __del__(self): if self.sock: self.sock.close() def query(self, sentence, pattern): # type: (str,str)->str assert(isinstance(sentence, six.text_type)) sentence_bytes = sentence.encode('utf-8').strip() pattern_bytes = pattern.encode('utf-8') self.sock.sendall(sentence_bytes + b"\n") data = self.sock.recv(1024) assert isinstance(data, bytes) recv = data while not re.search(pattern_bytes, recv): data = self.sock.recv(1024) recv = recv + data return recv.strip().decode('utf-8') class JumanWrapper(WrapperBase): def __init__(self, command='juman', server=None, port=32000, timeout=30, rcfile=None, option='-e2 -B', pattern='EOS', is_use_pyknp=False, **args): # type: (text_type, text_type, int, int, text_type, Union[bytes, text_type], Union[bytes, text_type], bool, **str)->None """* Class to call Juman tokenizer """ self.timeout = timeout self.pattern = pattern self.option = option self.command = command if not rcfile is None and not os.path.exists(rcfile): raise FileExistsError('rcfile does not exist at {}'.format(rcfile)) if not server is None: # It converts from str into bytes only for sever mode # self.option = self.option.encode('utf-8') # type: Union[str,bytes] self.pattern = self.pattern.encode('utf-8') # type: Union[str,bytes] else: pass # check os # if os.name == 'nt': if not is_use_pyknp: logger.warning(msg='It forces is_use_pyknp = True on Windows.') else: pass self.is_use_pyknp = True else: pass if server is not None: # use server mode # self.juman = pyknp.Juman(command=command, server=server, port=port, timeout=self.timeout, rcfile=rcfile, option=option, pattern=pattern, jumanpp=False, **args) if six.PY3: # It overwrites juman_lines() method # self.juman.juman_lines = self.__monkey_patch_juman_lines elif is_use_pyknp and server is None: # use unix process with pyknp self.juman = pyknp.Juman(command=command, server=server, port=port, timeout=self.timeout, rcfile=rcfile, option=option, pattern=pattern, jumanpp=False, **args) else: # use unix process with pexpect(RECOMMENDED) # self.juman = JumanppHnadler(jumanpp_command=command, option=self.option, pattern=self.pattern, timeout_second=self.timeout) def __del__(self): if hasattr(self, "juman"): if isinstance(self.juman, JumanppHnadler): self.juman.stop_process() def __monkey_patch_juman_lines(self, input_str): # type: (text_type)->text_type """* What you can do - It overwrites juman_line() method because this method causes TypeError in python3 """ assert isinstance(self.juman, pyknp.Juman) if not self.juman.socket and not self.juman.subprocess: if self.juman.server is not None: self.juman.socket = MonkeyPatchSocket(self.juman.server, self.juman.port, b"RUN -e2\n") else: command = "%s %s" % (self.juman.command, self.juman.option) if self.juman.rcfile: command += " -r %s" % self.juman.rcfile self.juman.subprocess = pyknp.Subprocess(command) if self.juman.socket: return self.juman.socket.query(input_str, pattern=self.juman.pattern) return self.juman.subprocess.query(input_str, pattern=self.juman.pattern) def __extract_morphological_information(self, mrph_object, is_feature, is_surface): """This method extracts morphlogical information from token object. """ assert isinstance(mrph_object, pyknp.Morpheme) assert isinstance(is_feature, bool) assert isinstance(is_surface, bool) surface = mrph_object.midasi word_stem = mrph_object.genkei tuple_pos = (mrph_object.hinsi, mrph_object.bunrui) misc_info = { 'katuyou1': mrph_object.katuyou1, 'katuyou2': mrph_object.katuyou2, 'imis': mrph_object.imis, 'repname': mrph_object.repname } token_object = TokenizedResult( node_obj=None, tuple_pos=tuple_pos, word_stem=word_stem, word_surface=surface, is_feature=is_feature, is_surface=is_surface, misc_info=misc_info ) return token_object def call_juman_interface(self, input_str): # type: (text_type)->MList if isinstance(self.juman, pyknp.Juman): result = self.juman.analysis(input_str) return result elif isinstance(self.juman, JumanppHnadler): try: result_analysis = self.juman.query(input_str) except UnicodeDecodeError: logger.warning(msg="Process is down by some reason. It restarts process automatically.") self.juman.restart_process() result_analysis = self.juman.query(input_string=input_str) return MList(result_analysis) else: raise Exception('Not defined.') def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False, return_list=False, func_normalizer=text_preprocess.normalize_text): # type: (text_preprocess, bool, bool, bool, bool, Callable[[str], text_type])->Union[List[text_type], TokenizedSenetence] """This method returns tokenized result. If return_list==True(default), this method returns list whose element is tuple consisted with word_stem and POS. If return_list==False, this method returns TokenizedSenetence object. """ assert isinstance(normalize, bool) assert isinstance(sentence, text_type) normalized_sentence = func_normalizer(sentence) result = self.call_juman_interface(normalized_sentence) token_objects = [ self.__extract_morphological_information( mrph_object=morph_object, is_surface=is_surface, is_feature=is_feature ) for morph_object in result] if return_list: tokenized_objects = TokenizedSenetence( sentence=sentence, tokenized_objects=token_objects ) return tokenized_objects.convert_list_object() else: tokenized_objects = TokenizedSenetence( sentence=sentence, tokenized_objects=token_objects) return tokenized_objects def filter(self, parsed_sentence, pos_condition=None, stopwords=None): # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type])->FilteredObject assert isinstance(parsed_sentence, TokenizedSenetence) assert isinstance(pos_condition, (type(None), list)) assert isinstance(stopwords, (type(None), list)) return parsed_sentence.filter(pos_condition, stopwords) ================================================ FILE: JapaneseTokenizer/jumanpp_wrapper/__init__.py ================================================ from .jumanpp_wrapper import JumanppWrapper ================================================ FILE: JapaneseTokenizer/jumanpp_wrapper/jumanpp_wrapper.py ================================================ #! -*- coding: utf-8 -*- from pyknp import Juman from pyknp import MList # modules from JapaneseTokenizer.object_models import WrapperBase from JapaneseTokenizer.common import text_preprocess, juman_utils from JapaneseTokenizer.common.sever_handler import JumanppHnadler, ProcessDownException from JapaneseTokenizer import init_logger from JapaneseTokenizer.datamodels import FilteredObject, TokenizedSenetence from typing import List, Dict, Tuple, Union, TypeVar, Any, Callable # timeout from JapaneseTokenizer.common.timeout_handler import on_timeout from six import text_type import logging import sys import socket import six import re import os __author__ = 'kensuke-mi' logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) python_version = sys.version_info ContentsTypes = TypeVar('T') try: import pyknp except ImportError: logger.warning(msg='pyknp is not ready to use. Install first if you would like to use pyknp wrapper.') if six.PY2: ConnectionRefusedError = Exception class JumanppClient(object): """Class for receiving data as client""" def __init__(self, hostname, port, timeout=50, option=None): # type: (text_type, int, int, Dict[text_type,Any])->None try: self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) if isinstance(port, text_type): port = int(port) self.sock.connect((hostname, port)) except: raise Exception("There is no jumanpp server hostname={}, port={}".format(hostname, port)) if option is not None: self.sock.send(option) data = '' self.sock.settimeout(timeout) def __del__(self): if self.sock: self.sock.close() def query(self, sentence, pattern): # type: (text_type, bytes) -> text_type assert (isinstance(sentence, six.text_type)) data = '' self.sock.sendall("%s\n" % sentence.encode('utf-8').strip()) data = self.sock.recv(1024) assert isinstance(data, bytes) recv = data while not re.search(pattern, recv): data = self.sock.recv(1024) recv = "%s%s" % (recv, data) return recv.strip().decode('utf-8') else: class JumanppClient(object): """Class for receiving data as client""" def __init__(self, hostname, port, timeout=50, option=None): # type: (text_type, int, int, Dict[text_type,Any])->None try: self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) if isinstance(port, str): port = int(port) self.sock.connect((hostname, port)) except ConnectionRefusedError: raise Exception("There is no jumanpp server hostname={}, port={}".format(hostname, port)) except: raise if option is not None: self.sock.send(option) data = b"" self.sock.settimeout(timeout) def __del__(self): if self.sock: self.sock.close() def query(self, sentence, pattern): # type: (str, Union[str,bytes]) -> str assert (isinstance(sentence, six.text_type)) if isinstance(pattern, str): pattern = pattern.encode('utf-8') self.sock.sendall(b"%s\n" % sentence.encode('utf-8').strip()) data = self.sock.recv(1024) assert isinstance(data, bytes) recv = data while not re.search(pattern, recv): data = self.sock.recv(1024) recv = b"%s%s" % (recv, data) return recv.strip().decode('utf-8') class JumanppWrapper(WrapperBase): """Class for Juman++""" def __init__(self, command='jumanpp', timeout=30, pattern=r'EOS', server=None, port=12000, is_use_pyknp = False, ** args): # type: (text_type,int,text_type,text_type,bool) """* What you can do - You can select backend process of jumanpp. - jumanpp-pexpect: It calls jumanpp on your local machine. It keeps jumanpp process running. - jumanpp-pyknp: It calls jumanpp on your local machine. It launches jumanpp process everytime you call. Thus, this is slower than jumanpp-pexpect - jumanpp-server: It calls jumannpp on somewhere else. Keep mind, you have jumanpp sever process somewhere. * Parameters - timeout: Time to wait from jumanpp process. - is_use_pyknp: bool flag to decide if you use pyknp as backend process. If True; you use pyknp. False; you use pexpect. pexpect is much faster than you use pyknp. You can not use pexpect if you're using it on Windowns - server: hostname where jumanpp is running - port: port number where jumanpp is running """ self.eos_pattern = pattern self.is_use_pyknp = is_use_pyknp if six.PY2: self.dummy_text = 'これはダミーテキストです'.decode('utf-8') elif six.PY3: self.dummy_text = 'これはダミーテキストです' if not server is None: pattern = pattern.encode('utf-8') else: pass if os.name == 'nt': """It forces to use pyknp if it runs on Windows.""" if not self.is_use_pyknp: logger.warning(msg="You're not able to use pexpect in Windows. It forced to set is_use_pyknp = True") else: pass self.is_use_pyknp = True else: pass if server is None and self.is_use_pyknp: # jumanpp-pexpect # logger.debug('jumanpp wrapper is initialized with pyknp package') self.jumanpp_obj = Juman( command=command, timeout=timeout, pattern=pattern, jumanpp=True, **args) elif server is None: # jumanpp-pexpect # logger.debug('jumanpp wrapper is initialized with pexpect unix handler') self.jumanpp_obj = JumanppHnadler(jumanpp_command=command, timeout_second=timeout, pattern=pattern) # type: JumanppHnadler # put dummy sentence to avoid exception just after command initialization # res = self.jumanpp_obj.query(self.dummy_text) else: # jumanpp-server # self.jumanpp_obj = JumanppClient(hostname=server, port=port, timeout=timeout) def __del__(self): if hasattr(self, "jumanpp_obj"): if isinstance(self.jumanpp_obj, JumanppClient): self.jumanpp_obj.sock.close() elif isinstance(self.jumanpp_obj, JumanppHnadler): self.jumanpp_obj.stop_process() else: del self.jumanpp_obj else: pass def call_juman_interface(self, input_str): # type: (text_type) -> MList """* What you can do - You call Juman tokenizer interface. * Output - pyknp.MList """ if isinstance(self.jumanpp_obj, Juman): ml_token_object = self.jumanpp_obj.analysis(input_str=input_str) elif isinstance(self.jumanpp_obj, JumanppHnadler): try: result_token = self.jumanpp_obj.query(input_string=input_str) except ProcessDownException: """Unix process is down by any reason.""" logger.warning("Re-starting unix process because it takes longer time than {} seconds...".format(self.jumanpp_obj.timeout_second)) self.jumanpp_obj.restart_process() self.jumanpp_obj.query(self.dummy_text) result_token = self.jumanpp_obj.query(input_string=input_str) ml_token_object = MList(result_token) except UnicodeDecodeError: logger.warning(msg="Process is down by some reason. It restarts process automatically.") self.jumanpp_obj.restart_process() self.jumanpp_obj.query(self.dummy_text) result_token = self.jumanpp_obj.query(input_string=input_str) ml_token_object = MList(result_token) else: ml_token_object = MList(result_token) elif isinstance(self.jumanpp_obj, JumanppClient): server_response = self.jumanpp_obj.query(sentence=input_str, pattern=self.eos_pattern) ml_token_object = MList(server_response) else: raise Exception('Not defined') return ml_token_object @on_timeout(limit=60) def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False, return_list=False, func_normalizer=text_preprocess.normalize_text): # type: (text_type, bool, bool, bool, bool, Callable[[text_type], text_type]) -> Union[TokenizedSenetence, List[text_type]] """* What you can do - """ if normalize: normalized_sentence = func_normalizer(sentence) else: normalized_sentence = sentence ml_token_object = self.call_juman_interface(normalized_sentence) token_objects = [ juman_utils.extract_morphological_information( mrph_object=morph_object, is_surface=is_surface, is_feature=is_feature ) for morph_object in ml_token_object] if return_list: tokenized_objects = TokenizedSenetence( sentence=sentence, tokenized_objects=token_objects) return tokenized_objects.convert_list_object() else: tokenized_objects = TokenizedSenetence( sentence=sentence, tokenized_objects=token_objects) return tokenized_objects def filter(self, parsed_sentence, pos_condition=None, stopwords=None): # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type]) -> FilteredObject assert isinstance(parsed_sentence, TokenizedSenetence) assert isinstance(pos_condition, (type(None), list)) assert isinstance(stopwords, (type(None), list)) return parsed_sentence.filter(pos_condition, stopwords) ================================================ FILE: JapaneseTokenizer/kytea_wrapper/__init__.py ================================================ __author__ = 'kensuke-mi' from .kytea_wrapper import KyteaWrapper ================================================ FILE: JapaneseTokenizer/kytea_wrapper/kytea_wrapper.py ================================================ # -*- coding: utf-8 -*- from JapaneseTokenizer.object_models import WrapperBase from JapaneseTokenizer.common import text_preprocess from JapaneseTokenizer.datamodels import FilteredObject, TokenizedResult, TokenizedSenetence from JapaneseTokenizer import init_logger from typing import List, Tuple, Any, Union, Callable from six import text_type, string_types import logging import sys import six logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) python_version = sys.version_info try: import Mykytea except ImportError: logger.warning(msg='Mykytea is not ready to use yet. Install first if you would like to use kytea wrapper.') __author__ = 'kensuke-mi' class KyteaWrapper(WrapperBase): def __init__(self, option_string='-deftag UNKNOWN!!'): # type: (string_types)->None # option string is argument of Kytea. assert isinstance(option_string, string_types) self.kytea = Mykytea.Mykytea(option_string) def __list_tags(self, t): def convert(t2): return (t2[0], t2[1]) return [(word.surface, [[convert(t2) for t2 in t1] for t1 in word.tag]) for word in t] def __check_char_set(self, input_char): # type: (text_type) -> text_type if six.PY2 and isinstance(input_char, str): return input_char.decode('utf-8') elif isinstance(input_char, text_type): return input_char else: raise Exception('nor unicode, str') def __extract_morphological_information(self, kytea_tags_tuple, is_feature): # type: (Tuple[text_type,List[Any]], bool) -> TokenizedResult """This method extracts morphlogical information from token object. """ assert isinstance(kytea_tags_tuple, tuple) assert isinstance(is_feature, bool) surface = self.__check_char_set(kytea_tags_tuple[0]) # NOTE: kytea does NOT show word stem. Put blank string instead. if six.PY2: word_stem = ''.decode('utf-8') else: word_stem = '' pos_tuple = kytea_tags_tuple[1][0] pos = self.__check_char_set(pos_tuple[0][0]) pos_score = float(pos_tuple[0][1]) yomi_tuple = kytea_tags_tuple[1][1] yomi = self.__check_char_set(yomi_tuple[0][0]) yomi_score = float(yomi_tuple[0][1]) tuple_pos = (pos, ) misc_info = { 'pos_score': pos_score, 'pos': pos, 'yomi': yomi, 'yomi_score': yomi_score } token_object = TokenizedResult( node_obj=None, tuple_pos=tuple_pos, word_stem=word_stem, word_surface=surface, is_feature=is_feature, is_surface=True, misc_info=misc_info ) return token_object def call_kytea_tokenize_api(self, sentence): """ """ result = self.kytea.getTagsToString(sentence) assert isinstance(result, text_type) return result def tokenize(self, sentence, normalize=True, is_feature=False, is_surface=False, return_list=False, func_normalizer=text_preprocess.normalize_text): # type: (text_type, bool, bool, bool, bool, Callable[[str],str]) -> Union[List[str], TokenizedSenetence] """This method returns tokenized result. If return_list==True(default), this method returns list whose element is tuple consisted with word_stem and POS. If return_list==False, this method returns TokenizedSenetence object. """ assert isinstance(normalize, bool) assert isinstance(sentence, text_type) normalized_sentence = func_normalizer(sentence) if six.PY2: normalized_sentence = normalized_sentence.encode('utf-8') result = self.__list_tags(self.kytea.getTags(normalized_sentence)) token_objects = [ self.__extract_morphological_information( kytea_tags_tuple=kytea_tags, is_feature=is_feature ) for kytea_tags in result] if return_list: tokenized_objects = TokenizedSenetence( sentence=sentence, tokenized_objects=token_objects ) return tokenized_objects.convert_list_object() else: tokenized_objects = TokenizedSenetence( sentence=sentence, tokenized_objects=token_objects) return tokenized_objects def filter(self, parsed_sentence, pos_condition=None, stopwords=None): assert isinstance(parsed_sentence, TokenizedSenetence) assert isinstance(pos_condition, (type(None), list)) assert isinstance(stopwords, (type(None), list)) return parsed_sentence.filter(pos_condition, stopwords, check_field_name='surface') ================================================ FILE: JapaneseTokenizer/mecab_wrapper/__init__.py ================================================ __author__ = 'kensuke-mi' from .mecab_wrapper import MecabWrapper ================================================ FILE: JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py ================================================ #! -*- coding: utf-8 -*- # core module from JapaneseTokenizer.object_models import WrapperBase from JapaneseTokenizer.common.text_preprocess import normalize_text from JapaneseTokenizer import init_logger from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject import MeCab # else import sys import os import logging import subprocess import six from six import text_type # typing from typing import List, Tuple, Union, TypeVar, Callable ContentsTypes = TypeVar('T') __author__ = 'kensuke-mi' logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) python_version = sys.version_info try: import neologdn is_neologdn_valid = True except: logger.warning("neologdn package is not installed yet. You could not call neologd dictionary.") is_neologdn_valid = False class MecabWrapper(WrapperBase): def __init__(self, dictType, pathUserDictCsv=None, path_mecab_config=None, path_dictionary=None, string_encoding='utf-8'): # type: (text_type, text_type, text_type, text_type, text_type)->None """ :param dictType: a dictionary type called by mecab :param pathUserDictCsv: path to your original dictionary file :param path_mecab_config: path to 'mecab_config' command. It's automatically detected if not give :param path_dictionary: path to a dictionary which you want to use. If not given, it's automatically detected :param string_encoding: encoding option to parse command line result. This is mainly used for python2.x """ self.string_encoding = string_encoding self._dictType = dictType self._pathUserDictCsv = pathUserDictCsv self._path_dictionary = path_dictionary if path_mecab_config is None: self._path_mecab_config = self.__get_path_to_mecab_config() else: self._path_mecab_config = path_mecab_config if self._path_dictionary is not None: assert os.path.exists(self._path_dictionary), 'Path dictionary is NOT exist.' self._mecab_dictionary_path = None else: self._mecab_dictionary_path = self.__check_mecab_dict_path() logger.info("mecab dictionary path is detected under {}".format(self._mecab_dictionary_path)) self.mecabObj = self.__CallMecab() assert dictType in ["neologd", "all", "ipadic", "ipaddic", "user", "", "jumandic", "unidic", None], \ 'Dictionary Type Error. Your dict = {} is NOT available.' if dictType == 'all': logger.error('dictionary type "all" is deprecated from version1.6') raise Exception('dictionary type "all" is deprecated from version1.6') if dictType == 'user': logger.error('dictionary type "user" is deprecated from version1.6. You just give path to dictionary csv.') raise Exception('dictionary type "all" is deprecated from version1.6. You just give path to dictionary csv.') if pathUserDictCsv is not None and isinstance(pathUserDictCsv, text_type) and pathUserDictCsv != '': assert os.path.exists(pathUserDictCsv), \ 'Your user dictionary does NOT exist. Path={}'.format(pathUserDictCsv) def __get_path_to_mecab_config(self): """You get path into mecab-config """ if six.PY2: path_mecab_config_dir = subprocess.check_output(['which', 'mecab-config']) path_mecab_config_dir = path_mecab_config_dir.strip().replace('/mecab-config', '') else: path_mecab_config_dir = subprocess.check_output(['which', 'mecab-config']).decode(self.string_encoding) path_mecab_config_dir = path_mecab_config_dir.strip().replace('/mecab-config', '') logger.info(msg='mecab-config is detected at {}'.format(path_mecab_config_dir)) return path_mecab_config_dir def __check_mecab_dict_path(self): """check path to dict of Mecab in system environment """ mecab_dic_cmd = "echo `{} --dicdir`".format(os.path.join(self._path_mecab_config, 'mecab-config')) try: if six.PY2: path_mecab_dict = subprocess.check_output( mecab_dic_cmd, shell=True ).strip('\n') else: path_mecab_dict = subprocess.check_output(mecab_dic_cmd, shell=True).decode(self.string_encoding).strip('\n') except subprocess.CalledProcessError: logger.error("{}".format(mecab_dic_cmd)) raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to execute mecab-config command") if path_mecab_dict == '': raise SystemError("""mecab dictionary path is not found with following command: {} You are not able to use additional dictionary. Still you are able to call mecab default dictionary""".format(mecab_dic_cmd)) return path_mecab_dict def __check_mecab_libexe(self): mecab_libexe_cmd = "echo `{} --libexecdir`".format(os.path.join(self._path_mecab_config, 'mecab-config')) try: if six.PY2: path_mecab_libexe = subprocess.check_output( mecab_libexe_cmd, shell=True ).strip('\n') else: path_mecab_libexe = subprocess.check_output(mecab_libexe_cmd, shell=True).decode(self.string_encoding).strip('\n') except subprocess.CalledProcessError: logger.error("{}".format(mecab_libexe_cmd)) raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to execute mecab-config --libexecdir") if path_mecab_libexe == '': raise SystemError("""Mecab config is not callable with following command: {} You are not able to compile your user dictionary. Still, you are able to use default mecab dictionary.""".format(mecab_libexe_cmd)) return path_mecab_libexe def __CallMecab(self): if self._path_dictionary is not None and self._mecab_dictionary_path is None: logger.debug('Use dictionary you specified.') cmMecabInitialize = '-d {}'.format(self._path_dictionary) elif self._dictType == 'neologd': # use neologd logger.debug('Use neologd additional dictionary') cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "mecab-ipadic-neologd")) elif self._dictType == 'ipadic' or self._dictType == 'ipaddic': # use ipadic logger.debug('Use ipadic dictionary') cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "ipadic")) elif six.PY2 is False and self._dictType == 'jumandic': # use jumandic. This is impossible to call in Python2.x logger.debug('Use jumandic dictionary') cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "jumandic")) elif six.PY2 and self._dictType == 'jumandic': raise Exception('In python2.x, impossible to call jumandic.') else: logger.debug('Use no default dictionary') cmMecabInitialize = '' # execute compile if user dictionary is given if self._pathUserDictCsv is not None: logger.debug('Use User dictionary') pathUserDict = self.__CompileUserdict() cmMecabInitialize += ' -u {}'.format(pathUserDict) if six.PY2: cmMecabCall = "-Ochasen {}".format(cmMecabInitialize) else: cmMecabCall = "{}".format(cmMecabInitialize) logger.debug(msg="mecab initialized with {}".format(cmMecabCall)) try: mecabObj = MeCab.Tagger(cmMecabCall) except Exception as e: logger.error(e.args) logger.error("Possibly Path to userdict is invalid. Check the path") raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to initialize Mecab object") return mecabObj def __CompileUserdict(self): """* What you can do """ path_mecab_dict = self.__check_mecab_dict_path() path_mecab_libexe = self.__check_mecab_libexe() cmCompileDict = u'{0}/mecab-dict-index -d {1}/ipadic -u {2} -f utf-8 -t utf-8 {3} > /dev/null'.format(path_mecab_libexe, path_mecab_dict, self._pathUserDictCsv.replace("csv", "dict"), self._pathUserDictCsv) logger.debug(msg="compiling mecab user dictionary with: {}".format(cmCompileDict)) try: subprocess.call( cmCompileDict , shell=True ) except OSError as e: logger.error('type:' + str(type(e))) logger.error('args:' + str(e.args)) sys.exit('Failed to compile mecab userdict. System ends') return self._pathUserDictCsv.replace("csv", "dict") def __feature_parser(self, uni_feature, word_surface): """ Parse the POS feature output by Mecab :param uni_feature unicode: :return ( (pos1, pos2, pos3), word_stem ): """ list_feature_items = uni_feature.split((',')) # if word has no feature at all if len(list_feature_items)==1: return ('*'), ('*') pos1 = list_feature_items[0] pos2 = list_feature_items[1] pos3 = list_feature_items[2] tuple_pos = ( pos1, pos2, pos3 ) # if without constraint(output is normal mecab dictionary like) if len(list_feature_items) == 9: word_stem = list_feature_items[6] # if with constraint(output format depends on Usedict.txt) else: word_stem = word_surface return tuple_pos, word_stem def __postprocess_analyzed_result(self, string_mecab_parsed_result, is_feature, is_surface): # type: (text_type,bool,bool)->List[TokenizedResult] """Extract surface word and feature from analyzed lines. Extracted results are returned with list, whose elements are TokenizedResult class [TokenizedResult] """ assert isinstance(string_mecab_parsed_result, str) check_tab_separated_line = lambda x: True if '\t' in x else False tokenized_objects = [ self.__result_parser(analyzed_line=analyzed_line, is_feature=is_feature, is_surface=is_surface) for analyzed_line in string_mecab_parsed_result.split('\n') if not analyzed_line=='EOS' and check_tab_separated_line(analyzed_line) ] assert isinstance(tokenized_objects, list) return tokenized_objects def __result_parser(self, analyzed_line, is_feature, is_surface): # type: (text_type,bool,bool)->TokenizedResult """Extract surface word and feature from analyzed line. Extracted elements are returned with TokenizedResult class """ assert isinstance(analyzed_line, str) assert isinstance(is_feature, bool) assert isinstance(is_surface, bool) surface, features = analyzed_line.split('\t', 1) tuple_pos, word_stem = self.__feature_parser(features, surface) tokenized_obj = TokenizedResult( node_obj=None, analyzed_line=analyzed_line, tuple_pos=tuple_pos, word_stem=word_stem, word_surface=surface, is_feature=is_feature, is_surface=is_surface ) return tokenized_obj def tokenize(self, sentence, normalized=True, is_feature=False, is_surface=False, return_list=False, func_normalizer=normalize_text): # type: (text_type, bool, bool, bool, bool, Callable[[str], str])->Union[List[str], TokenizedSenetence] """* What you can do - Call mecab tokenizer, and return tokenized objects """ if six.PY2 and isinstance(sentence, str): sentence = sentence.decode(self.string_encoding) else: pass # decide normalization function depending on dictType if func_normalizer is None and self._dictType == 'neologd' and is_neologdn_valid: normalized_sentence = neologdn.normalize(sentence) elif func_normalizer is None and self._dictType == 'neologd' and is_neologdn_valid == False: raise Exception("You could not call neologd dictionary bacause you do NOT install the package neologdn.") elif func_normalizer == normalize_text: normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType) elif func_normalizer is None: normalized_sentence = sentence else: normalized_sentence = func_normalizer(sentence) # don't delete this variable. The variable "encoded_text" protects sentence from deleting if six.PY2: encoded_text = normalized_sentence.encode(self.string_encoding) else: encoded_text = normalized_sentence if six.PY2: tokenized_objects = [] node = self.mecabObj.parseToNode(encoded_text) node = node.next while node.next is not None: word_surface = node.surface.decode(self.string_encoding) tuple_pos, word_stem = self.__feature_parser(node.feature.decode(self.string_encoding), word_surface) tokenized_obj = TokenizedResult( node_obj=node, tuple_pos=tuple_pos, word_stem=word_stem, word_surface=word_surface, is_feature=is_feature, is_surface=is_surface ) tokenized_objects.append(tokenized_obj) node = node.next tokenized_sentence = TokenizedSenetence( sentence=sentence, tokenized_objects=tokenized_objects) else: parsed_result = self.mecabObj.parse(encoded_text) tokenized_objects = self.__postprocess_analyzed_result( string_mecab_parsed_result=parsed_result, is_feature=is_feature, is_surface=is_surface ) tokenized_sentence = TokenizedSenetence( sentence=sentence, tokenized_objects=tokenized_objects ) # type: TokenizedSenetence if return_list: return tokenized_sentence.convert_list_object() else: return tokenized_sentence def filter(self, parsed_sentence, pos_condition=None, stopwords=None): # type: (TokenizedSenetence, List[Tuple[str,...]], List[str]) -> FilteredObject assert isinstance(parsed_sentence, TokenizedSenetence) assert isinstance(pos_condition, (type(None), list)) assert isinstance(stopwords, (type(None), list)) return parsed_sentence.filter(pos_condition, stopwords) ================================================ FILE: JapaneseTokenizer/object_models.py ================================================ #! -*- coding: utf-8 -*- from typing import Callable from six import text_type class WrapperBase(object): def tokenize(self, sentence, normalize, is_feature, is_surface, return_list, func_normalizer=None): # type: (text_type, bool, bool, bool, bool, Callable[[text_type], text_type])->None """* What you can do""" raise NotImplemented def filter(self, parsed_sentence, pos_condition=None, stopwords=None): raise NotImplemented ================================================ FILE: LICENSE.txt ================================================ Copyright 2017 Kensuke Mitsuzawa Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: MANIFEST.in ================================================ include README.md include README_JP.md include examples include test include install_tokenizers.sh include LICENSE.txt include Makefile ================================================ FILE: Makefile ================================================ install: bash install_tokenizers.sh install_neologd: ## mecab-neologdのインストールを実行 wget --no-check-certificate https://github.com/neologd/mecab-ipadic-neologd/tarball/master -O mecab-ipadic-neologd.tar tar -xvf mecab-ipadic-neologd.tar mv neologd-mecab-ipadic-neologd-* neologd-mecab-ipadic-neologd && cd neologd-mecab-ipadic-neologd && ( echo yes | ./bin/install-mecab-ipadic-neologd ) ================================================ FILE: README.md ================================================ [![MIT License](http://img.shields.io/badge/license-MIT-blue.svg?style=flat)](LICENSE)[![Build Status](https://travis-ci.org/Kensuke-Mitsuzawa/JapaneseTokenizers.svg?branch=master)](https://travis-ci.org/Kensuke-Mitsuzawa/JapaneseTokenizers) # What's this? This is simple python-wrapper for Japanese Tokenizers(A.K.A Tokenizer) This project aims to call tokenizers and split a sentence into tokens as easy as possible. And, this project supports various Tokenization tools common interface. Thus, it's easy to compare output from various tokenizers. This project is available also in [Github](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers). If you find any bugs, please report them to github issues. Or any pull requests are welcomed! # Requirements - Python 2.7 - Python 3.x - checked in 3.5, 3.6, 3.7 # Features * simple/common interface among various tokenizers * simple/common interface for filtering with stopwords or Part-of-Speech condition * simple interface to add user-dictionary(mecab only) ## Supported Tokenizers ### Mecab [Mecab](http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html?sess=3f6a4f9896295ef2480fa2482de521f6) is open source tokenizer system for various language(if you have dictionary for it) See [english documentation](https://github.com/jordwest/mecab-docs-en) for detail ### Juman [Juman](http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?JUMAN) is a tokenizer system developed by Kurohashi laboratory, Kyoto University, Japan. Juman is strong for ambiguous writing style in Japanese, and is strong for new-comming words thanks to Web based huge dictionary. And, Juman tells you semantic meaning of words. ### Juman++ [Juman++](http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?JUMAN++) is a tokenizer system developed by Kurohashi laboratory, Kyoto University, Japan. Juman++ is succeeding system of Juman. It adopts RNN model for tokenization. Juman++ is strong for ambigious writing style in Japanese, and is strong for new-comming words thanks to Web based huge dictionary. And, Juman tells you semantic meaning of words. Note: New Juman++ dev-version(later than 2.x) is available at [Github](https://github.com/ku-nlp/jumanpp) ### Kytea [Kytea](http://www.phontron.com/kytea/) is tokenizer tool developped by Graham Neubig. Kytea has a different algorithm from one of Mecab or Juman. # Setting up ## Tokenizers auto-install ``` make install ``` ### mecab-neologd dictionary auto-install ``` make install_neologd ``` ## Tokenizers manual-install ### MeCab See [here](https://github.com/jordwest/mecab-docs-en) to install MeCab system. ### Mecab Neologd dictionary Mecab-neologd dictionary is a dictionary-extension based on ipadic-dictionary, which is basic dictionary of Mecab. With, Mecab-neologd dictionary, you're able to parse new-coming words make one token. Here, new-coming words is such like, movie actor name or company name..... See [here](https://github.com/neologd/mecab-ipadic-neologd) and install mecab-neologd dictionary. ### Juman ``` wget -O juman7.0.1.tar.bz2 "http://nlp.ist.i.kyoto-u.ac.jp/DLcounter/lime.cgi?down=http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2&name=juman-7.01.tar.bz2" bzip2 -dc juman7.0.1.tar.bz2 | tar xvf - cd juman-7.01 ./configure make [sudo] make install ``` ## Juman++ * GCC version must be >= 5 ``` wget http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz tar xJvf jumanpp-1.02.tar.xz cd jumanpp-1.02/ ./configure make [sudo] make install ``` ## Kytea Install Kytea system ``` wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz tar -xvf kytea-0.4.7.tar cd kytea-0.4.7 ./configure make make install ``` Kytea has [python wrapper](https://github.com/chezou/Mykytea-python) thanks to michiaki ariga. Install Kytea-python wrapper ``` pip install kytea ``` ## install ``` [sudo] python setup.py install ``` ### Note During install, you see warning message when it fails to install `pyknp` or `kytea`. if you see these messages, try to re-install these packages manually. # Usage Tokenization Example(For python3.x. To see exmaple code for Python2.x, plaese see [here](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers/blob/master/examples/examples.py)) ``` import JapaneseTokenizer input_sentence = '10日放送の「中居正広のミになる図書館」(テレビ朝日系)で、SMAPの中居正広が、篠原信一の過去の勘違いを明かす一幕があった。' # ipadic is well-maintained dictionary # mecab_wrapper = JapaneseTokenizer.MecabWrapper(dictType='ipadic') print(mecab_wrapper.tokenize(input_sentence).convert_list_object()) # neologd is automatically-generated dictionary from huge web-corpus # mecab_neologd_wrapper = JapaneseTokenizer.MecabWrapper(dictType='neologd') print(mecab_neologd_wrapper.tokenize(input_sentence).convert_list_object()) ``` ## Filtering example ``` import JapaneseTokenizer # with word filtering by stopword & part-of-speech condition # print(mecab_wrapper.tokenize(input_sentence).filter(stopwords=['テレビ朝日'], pos_condition=[('名詞', '固有名詞')]).convert_list_object()) ``` ## Part-of-speech structure Mecab, Juman, Kytea have different system of Part-of-Speech(POS). You can check tables of Part-of-Speech(POS) [here](http://www.unixuser.org/~euske/doc/postag/) # Similar Package ## natto-py natto-py is sophisticated package for tokenization. It supports following features * easy interface for tokenization * importing additional dictionary * partial parsing mode # LICENSE MIT license # For developers You could build an environment which has dependencies to test this package. Simply, you build docker image and run docker container. ## Dev environment Develop environment is defined with `test/docker-compose-dev.yml`. With the docker-compose.yml file, you could call python2.7 or python3.7 If you're using Pycharm Professional edition, you could set docker-compose.yml as remote interpreter. To call python2.7, set `/opt/conda/envs/p27/bin/python2.7` To call python3.7, set `/opt/conda/envs/p37/bin/python3.7` ## Test environment These commands checks from procedures of package install until test of package. ```bash $ docker-compose build $ docker-compose up ``` ================================================ FILE: examples/examples.py ================================================ #! -*- coding: utf-8 -*- import sys import os from JapaneseTokenizer import JumanWrapper from JapaneseTokenizer import JumanppWrapper from JapaneseTokenizer import MecabWrapper from JapaneseTokenizer import KyteaWrapper from JapaneseTokenizer.datamodels import TokenizedResult from JapaneseTokenizer import init_logger import logging import socket import six logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME)) __author__ = 'kensuke-mi' logger.setLevel(logging.DEBUG) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # for python2.x def basic_example(): # ======================================================== # TOKENIZE # ======================================================== if six.PY2: # input is `unicode` type(in python2x) sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。' elif six.PY3: sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。' else: raise Exception() # make MecabWrapper object # you can choose from "neologd", "all", "ipadic", "user", "", None # "ipadic" and "" is equivalent mecab_wrapper = MecabWrapper(dictType="neologd") juman_wrapper = JumanWrapper() jumanpp_wrapper = JumanppWrapper() #kytea_wrapper = KyteaWrapper() # tokenize sentence into list of token. # with is_feature=True, you get part-of-speech tag also. in this case, you get tuple ( token, (part-of-speech-tags) ) # with is_surface=True, you get surface form of token (in other words, not normalized token) seq_tokens_mecab = mecab_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).convert_list_object() seq_tokens_juman = juman_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).convert_list_object() seq_tokens_jumanpp = jumanpp_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).convert_list_object() #seq_tokens_kytea = kytea_wrapper.tokenize(sentence=sentence, is_feature=True, is_surface=False).convert_list_object() logger.debug(seq_tokens_mecab) logger.debug(seq_tokens_juman) logger.debug(seq_tokens_jumanpp) #logger.debug(seq_tokens_kytea) def filtering_example(): if six.PY2: # input is `unicode` type(in python2x) sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。' stopwords = [u'テヘラン'] pos_condition_ipadic = [(u'名詞', u'固有名詞'), (u'名詞', u'一般')] pos_condition_juman = [(u'名詞', u'固有名詞'), (u'名詞', u'普通名詞')] pos_condition_kytea = [(u'名詞',)] elif six.PY3: sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。' stopwords = ['テヘラン'] pos_condition_ipadic = [('名詞', '固有名詞'), ('名詞', '一般')] pos_condition_juman = [('名詞', '固有名詞'), ('名詞', '普通名詞')] pos_condition_kytea = [('名詞',)] else: raise Exception() # ======================================================== # FILTERING # ======================================================== # you can filter tokens by stopwords or POS conditions # stopword is list objetc mecab_wrapper = MecabWrapper(dictType="neologd") juman_wrapper = JumanWrapper() jumanpp_wrapper = JumanppWrapper() #kytea_wrapper = KyteaWrapper() seq_tokens_mecab = mecab_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).filter(pos_condition=pos_condition_ipadic,stopwords=stopwords).convert_list_object() seq_tokens_juman = juman_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).filter(pos_condition=pos_condition_juman, stopwords=stopwords).convert_list_object() seq_tokens_jumanpp = jumanpp_wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).filter(pos_condition=pos_condition_juman, stopwords=stopwords).convert_list_object() #seq_tokens_kytea = kytea_wrapper.tokenize(sentence=sentence, is_feature=True, is_surface=False).filter(pos_condition=pos_condition_kytea, stopwords=stopwords).convert_list_object() logger.debug(seq_tokens_mecab) logger.debug(seq_tokens_juman) logger.debug(seq_tokens_jumanpp) #logger.debug(seq_tokens_kytea) def advanced_example_mecab(): if six.PY2: sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。' elif six.PY3: sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。' else: raise Exception() # ======================================================== # USE YOUE OWN DICTIONARY # with your own dictionary, you can force Mecab to make some word into one token # ======================================================== # make your own "user dictionary" with CSV file # To know more about this file, see this page(sorry, Japanese only) https://mecab.googlecode.com/svn/trunk/mecab/doc/dic.html example_user_dict = "userdict.csv" # set dictType='user' or dictType='all' and set pathUserDictCsv tokenized_obj = MecabWrapper(dictType='user', pathUserDictCsv=example_user_dict).tokenize(sentence) for token_obj in tokenized_obj.tokenized_objects: assert isinstance(token_obj, TokenizedResult) if six.PY2 and token_obj.word_stem == u'ペルシア語': logger.debug(token_obj.word_stem) elif six.PY3 and token_obj.word_stem == 'ペルシア語': logger.debug(token_obj.word_stem) ## TokenizedResult class has attributes of tokenized result ## token_obj.analyzed_line token_obj.word_surface token_obj.word_stem token_obj.tuple_pos def advanced_example_juman(): if six.PY2: sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。' pos_condition = [(u'名詞',)] elif six.PY3: sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。' pos_condition = [('名詞',)] else: raise Exception() ### You can call juman with server mode. You must start JUMAN as server mode beforehand ### s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) HOST='localhost' PORT = 32000 try: s.connect((HOST, PORT)) s.close() juman_wrapper = JumanWrapper(server=HOST, port=PORT) tokens_list = juman_wrapper.tokenize(sentence, return_list=False).filter(pos_condition=pos_condition).convert_list_object() assert isinstance(tokens_list, list) except: logger.info(msg='Juman server is not running. Skip it.') if __name__ == "__main__": basic_example() filtering_example() advanced_example_mecab() advanced_example_juman() ================================================ FILE: examples/userdict.csv ================================================ ペルシア語,-1,-1,-400,名詞,一般,*,*,*,*,ぺるしあご,*,*,* ================================================ FILE: install_tokenizers.sh ================================================ #!/bin/bash os_type=`uname` echo "os-type is "$os_type if [ `uname` = "Darwin" ]; then #mac用のコード juman_utils_bin="/usr/local/opt/juman/libexec/juman/" if [ -e ${juman_utils_bin} ]; then : else juman_utils_bin="/usr/local/libexec/juman/" fi elif [ `uname` = "Linux" ]; then #Linux用のコード juman_utils_bin="/usr/local/libexec/juman/" else echo "Your platform ($(uname -a)) is not supported." exit 1 fi WORK_DIR=`pwd` echo 'これはテスト' | mecab is_mecab_install=$? if [ $is_mecab_install -eq 127 ]; then ## mecab wget -O mecab-0.996.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE" tar zxvf mecab-0.996.tar.gz cd mecab-0.996 && ./configure && make && make install cd $WORK_DIR ### mecabインストール後にldconfigを実行 ldconfig ## mecab ipadic wget -O mecab-ipadic-2.7.0-20070801.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM" tar zxvf mecab-ipadic-2.7.0-20070801.tar.gz cd mecab-ipadic-2.7.0-20070801 &&./configure --with-charset=utf8 && make && make install # 動作テスト echo 'インストール後のテスト' | mecab else : fi echo 'これはテスト' | juman is_juman_install=$? if [ $is_juman_install -eq 127 ]; then ## juman wget -O juman7.0.1.tar.bz2 "http://nlp.ist.i.kyoto-u.ac.jp/DLcounter/lime.cgi?down=http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2&name=juman-7.01.tar.bz2" bzip2 -dc juman7.0.1.tar.bz2 | tar xvf - cd juman-7.01 && ./configure && make && make install # インストール後のldconfig ldconfig # 動作テスト echo 'インストール後のテスト' | juman else : fi echo 'これはテスト' | jumanpp is_jumanpp_install=$? if [ $is_jumanpp_install -eq 127 ]; then # jumanpp wget http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.01.tar.xz tar xJvf jumanpp-1.01.tar.xz cd jumanpp-1.01/ && ./configure && make && make install # todo jumanppのサーバー起動スクリプト実施 # インストール後のldconfig ldconfig # 動作テスト echo 'インストール後のテスト' | jumanpp else : fi echo 'これはテスト' | kytea is_kytea_install=$? if [ $is_kytea_install -eq 127 ]; then # kytea wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz -O kytea-0.4.7.tar.gz tar -xvf kytea-0.4.7.tar.gz cd kytea-0.4.7 && ./configure && make && make install # インストール後のldconfig ldconfig # 動作テスト echo 'インストール後のテスト' | kytea else : fi if [ -f ./juman7.0.1.tar.bz2 ]; then # juman rm juman7.0.1.tar.bz2 else : fi if [ -f ./mecab-*.tar.gz ]; then # juman rm mecab-*.tar.gz else : fi if [ -f ./mecab-ipadic-*.tar.gz ]; then # mecab-ipadic rm mecab-ipadic-*.tar.gz else : fi if [ -f ./jumanpp-1.01.tar.xz ]; then # jumanpp rm jumanpp-1.01.tar.xz else : fi if [ -f ./kytea-0.4.7.tar ]; then # kytea rm kytea-0.4.7.tar else : fi if [ -d ./juman-7* ]; then # kytea rm -rf juman-7* else : fi if [ -d ./mecab-0* ]; then # kytea rm -rf mecab-0* else : fi if [ -d ./mecab-ipadic-* ]; then rm -rf mecab-ipadic-* else : fi if [ -d ./jumanpp-1.01 ]; then rm -rf jumanpp-1.01 else : fi if [ -d ./kytea-0.4.7 ]; then rm -rf kytea-0.4.7 else : fi ================================================ FILE: setup.py ================================================ #! -*- coding: utf-8 -*- from setuptools import setup, find_packages import sys import logging import codecs logger = logging.getLogger(__file__) python_version = sys.version_info # -------------------------------------------------------------------------------------------------------- # try to install kytea automatically because it usually causes to error during installing try: import Mykytea except ImportError: try: import sys import subprocess subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kytea']) import Mykytea except Exception as e: logger.error('We failed to install mykytea automatically. Try installing kytea manually.') logger.error(e) # -------------------------------------------------------------------------------------------------------- try: import neologdn except ImportError: try: import sys import subprocess subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'neologdn']) import neologdn except Exception as e: logger.error('We failed to install neologdn automatically because of some issues in the package. Try installing pyknp manually.') logger.error(e) # -------------------------------------------------------------------------------------------------------- common_packages = ['pypandoc', 'future', 'six', 'jaconv>=0.2', 'pip>=8.1.0', 'pexpect', 'pyknp>=0.4.1'] if python_version >= (3, 0, 0): if python_version <= (3, 5, 0): common_packages.append('typing') elif python_version > (3, 5, 0): common_packages.append('mecab-python3') elif python_version <= (2, 9, 9): common_packages.append('typing') common_packages.append('mecab-python') else: raise NotImplementedError() version = '1.6' name = 'JapaneseTokenizer' short_description = '`JapaneseTokenizer` is a package for easy Japanese Tokenization' try: import pypandoc long_description = pypandoc.convert('README.md', 'rst') except(IOError, ImportError): long_description = codecs.open('README.md', 'r', 'utf-8').read() classifiers = [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: MIT License", "Programming Language :: Python", "Natural Language :: Japanese", "Topic :: Scientific/Engineering :: Artificial Intelligence", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3.5" ] setup( author='Kensuke Mitsuzawa', author_email='kensuke.mit@gmail.com', name = name, version=version, short_description=short_description, long_description=long_description, keywords=['MeCab', '和布蕪', 'Juman', 'Japanese morphological analyzer', 'NLP', '形態素解析', '自然言語処理'], license="MIT", url = "https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers", test_suite='test.test_all.suite', install_requires=common_packages, tests_require=common_packages, packages=find_packages() ) ================================================ FILE: test/Dockerfile ================================================ FROM frolvlad/alpine-glibc:alpine-3.6 MAINTAINER kensuke-mi # Mecab install ENV MECAB_VERSION 0.996 ENV IPADIC_VERSION 2.7.0-20070801 ENV mecab_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE ENV ipadic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM ENV build_deps 'curl git bash file sudo openssh gcc make build-base' ENV dependencies 'openssl' ENV PATH=/opt/conda/bin:$PATH \ LANG=C.UTF-8 \ MINICONDA=Miniconda3-latest-Linux-x86_64.sh # apk update RUN apk update # mecab RUN apk add --update --no-cache ${build_deps} \ # Install dependencies && apk add --update --no-cache ${dependencies} \ # Install MeCab && curl -SL -o mecab-${MECAB_VERSION}.tar.gz ${mecab_url} \ && tar zxf mecab-${MECAB_VERSION}.tar.gz \ && cd mecab-${MECAB_VERSION} \ && ./configure --enable-utf8-only --with-charset=utf8 \ && make \ && make install \ && cd \ # Install IPA dic && curl -SL -o mecab-ipadic-${IPADIC_VERSION}.tar.gz ${ipadic_url} \ && tar zxf mecab-ipadic-${IPADIC_VERSION}.tar.gz \ && cd mecab-ipadic-${IPADIC_VERSION} \ && ./configure --with-charset=utf8 \ && make \ && make install \ && cd \ # Install Neologd && git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git \ && mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y \ && rm -rf \ mecab-${MECAB_VERSION}* \ mecab-${IPADIC_VERSION}* \ mecab-ipadic-neologd # general RUN apk --no-cache add vim \ wget \ lsof \ curl \ bash \ swig \ gcc \ build-base \ make \ python-dev \ py-pip \ jpeg-dev \ zlib-dev \ git \ linux-headers ENV LIBRARY_PATH=/lib:/usr/lib ENV PLANTUML_VERSION 1.2017.18 ENV PLANTUML_DOWNLOAD_URL https://sourceforge.net/projects/plantuml/files/plantuml.$PLANTUML_VERSION.jar/download ENV PANDOC_VERSION 1.19.2.4 ENV PANDOC_DOWNLOAD_URL https://hackage.haskell.org/package/pandoc-$PANDOC_VERSION/pandoc-$PANDOC_VERSION.tar.gz ENV PANDOC_ROOT /usr/local/pandoc ENV PATH $PATH:$PANDOC_ROOT/bin # Create Pandoc build space RUN mkdir -p /pandoc-build WORKDIR /pandoc-build # Install/Build Packages RUN apk upgrade --update && \ apk add --no-cache --virtual .build-deps $BUILD_DEPS && \ apk add --no-cache --virtual .persistent-deps $PERSISTENT_DEPS && \ curl -fsSL "$PLANTUML_DOWNLOAD_URL" -o /usr/local/plantuml.jar && \ apk add --no-cache --virtual .edge-deps $EDGE_DEPS -X http://dl-cdn.alpinelinux.org/alpine/edge/community && \ curl -fsSL "$PANDOC_DOWNLOAD_URL" | tar -xzf - && \ ( cd pandoc-$PANDOC_VERSION && cabal update && cabal install --only-dependencies && \ cabal configure --prefix=$PANDOC_ROOT && \ cabal build && \ cabal copy && \ cd .. ) && \ rm -Rf pandoc-$PANDOC_VERSION/ && \ rm -Rf /root/.cabal/ /root/.ghc/ && \ rmdir /pandoc-build && \ set -x; \ addgroup -g 82 -S www-data; \ adduser -u 82 -D -S -G www-data www-data && \ mkdir -p /var/docs && \ apk del .build-deps .edge-deps # Juman RUN wget http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2 \ && tar xvf juman-7.01.tar.bz2 \ && cd juman-7.01 \ && ./configure \ && make \ && make install \ && cd .. \ && rm -rf juman-7.01 \ && rm juman-7.01.tar.bz2 # Juman++ RUN apk add --update --no-cache --virtual=build-deps \ boost-dev g++ make \ && wget -q http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz \ && tar Jxfv jumanpp-1.02.tar.xz \ && cd jumanpp-1.02/ \ && ./configure \ && make \ && make install \ && cd .. \ && rm jumanpp-1.02.tar.xz \ && rm -rf /var/cache/* \ && apk del build-deps \ && apk add --update --no-cache boost # kytea RUN wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz \ && tar -xvf kytea-0.4.7.tar.gz \ && cd kytea-0.4.7 \ && ./configure \ && make \ && make install # Python RUN apk add --no-cache bash wget && \ wget -q --no-check-certificate https://repo.continuum.io/miniconda/$MINICONDA && \ bash $MINICONDA -b -p /opt/conda && \ ln -s /opt/conda/bin/* /usr/local/bin/ && \ rm -rf /root/.[acpw]* $MINICONDA /opt/conda/pkgs/* RUN conda config --add channels conda-forge --system RUN conda create -y -n p27 python=2.7 RUN conda create -y -n p36 python=3.6 RUN conda create -y -n p37 python=3.7 #RUN source activate p27 #RUN source deactivate CMD ["/bin/bash"] ================================================ FILE: test/Dockerfile-dev ================================================ FROM frolvlad/alpine-glibc:alpine-3.6 MAINTAINER kensuke-mi # Mecab install ENV MECAB_VERSION 0.996 ENV IPADIC_VERSION 2.7.0-20070801 ENV mecab_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE ENV ipadic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM ENV jumandic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM ENV unidic_url https://unidic.ninjal.ac.jp/unidic_archive/cwj/2.3.0/unidic-cwj-2.3.0.zip ENV build_deps 'curl git bash file sudo openssh gcc make build-base' ENV dependencies 'openssl' ENV PATH=/opt/conda/bin:$PATH \ LANG=C.UTF-8 \ MINICONDA=Miniconda3-latest-Linux-x86_64.sh # apk update RUN apk update # mecab RUN apk add --update --no-cache ${build_deps} \ # Install dependencies && apk add --update --no-cache ${dependencies} \ # Install MeCab && curl -SL -o mecab-${MECAB_VERSION}.tar.gz ${mecab_url} \ && tar zxf mecab-${MECAB_VERSION}.tar.gz \ && cd mecab-${MECAB_VERSION} \ && ./configure --enable-utf8-only --with-charset=utf8 \ && make \ && make install \ && cd \ # Install IPA dic && curl -SL -o mecab-ipadic-${IPADIC_VERSION}.tar.gz ${ipadic_url} \ && tar zxf mecab-ipadic-${IPADIC_VERSION}.tar.gz \ && cd mecab-ipadic-${IPADIC_VERSION} \ && ./configure --with-charset=utf8 \ && make \ && make install \ && cd \ # Install Neologd && git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git \ && mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y \ # Install jumandic && curl -SL -o jumandic.tar.gz ${jumandic_url} \ && tar zxf jumandic.tar.gz \ && cd mecab-jumandic-7.0-20130310 \ && ./configure --with-charset=utf8 \ && make \ && make install \ # delete dictionary files && cd \ && rm -rf \ mecab-${MECAB_VERSION}* \ mecab-${IPADIC_VERSION}* \ mecab-ipadic-neologd \ mecab-jumandic-7.0-20130310 # general RUN apk --no-cache add vim \ wget \ lsof \ curl \ bash \ swig \ gcc \ build-base \ make \ python-dev \ py-pip \ jpeg-dev \ zlib-dev \ git \ linux-headers ENV LIBRARY_PATH=/lib:/usr/lib ENV PLANTUML_VERSION 1.2017.18 ENV PLANTUML_DOWNLOAD_URL https://sourceforge.net/projects/plantuml/files/plantuml.$PLANTUML_VERSION.jar/download ENV PANDOC_VERSION 1.19.2.4 ENV PANDOC_DOWNLOAD_URL https://hackage.haskell.org/package/pandoc-$PANDOC_VERSION/pandoc-$PANDOC_VERSION.tar.gz ENV PANDOC_ROOT /usr/local/pandoc ENV PATH $PATH:$PANDOC_ROOT/bin # Create Pandoc build space RUN mkdir -p /pandoc-build WORKDIR /pandoc-build # Install/Build Packages RUN apk upgrade --update && \ apk add --no-cache --virtual .build-deps $BUILD_DEPS && \ apk add --no-cache --virtual .persistent-deps $PERSISTENT_DEPS && \ curl -fsSL "$PLANTUML_DOWNLOAD_URL" -o /usr/local/plantuml.jar && \ apk add --no-cache --virtual .edge-deps $EDGE_DEPS -X http://dl-cdn.alpinelinux.org/alpine/edge/community && \ curl -fsSL "$PANDOC_DOWNLOAD_URL" | tar -xzf - && \ ( cd pandoc-$PANDOC_VERSION && cabal update && cabal install --only-dependencies && \ cabal configure --prefix=$PANDOC_ROOT && \ cabal build && \ cabal copy && \ cd .. ) && \ rm -Rf pandoc-$PANDOC_VERSION/ && \ rm -Rf /root/.cabal/ /root/.ghc/ && \ rmdir /pandoc-build && \ set -x; \ addgroup -g 82 -S www-data; \ adduser -u 82 -D -S -G www-data www-data && \ mkdir -p /var/docs && \ apk del .build-deps .edge-deps # Juman RUN wget http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2 \ && tar xvf juman-7.01.tar.bz2 \ && cd juman-7.01 \ && ./configure \ && make \ && make install \ && cd .. \ && rm -rf juman-7.01 \ && rm juman-7.01.tar.bz2 # Juman++ RUN apk add --update --no-cache --virtual=build-deps \ boost-dev g++ make \ && wget -q http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz \ && tar Jxfv jumanpp-1.02.tar.xz \ && cd jumanpp-1.02/ \ && ./configure \ && make \ && make install \ && cd .. \ && rm jumanpp-1.02.tar.xz \ && rm -rf /var/cache/* \ && apk del build-deps \ && apk add --update --no-cache boost # kytea RUN wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz \ && tar -xvf kytea-0.4.7.tar.gz \ && cd kytea-0.4.7 \ && ./configure \ && make \ && make install # Python RUN apk add --no-cache bash wget && \ wget -q --no-check-certificate https://repo.continuum.io/miniconda/$MINICONDA && \ bash $MINICONDA -b -p /opt/conda && \ ln -s /opt/conda/bin/* /usr/local/bin/ && \ rm -rf /root/.[acpw]* $MINICONDA /opt/conda/pkgs/* RUN conda config --add channels conda-forge --system RUN conda create -y -n p27 python=2.7 RUN conda create -y -n p37 python=3.7 RUN mkdir /code RUN mkdir /code/dev COPY requirements_py2.txt /code/dev/requirements_py2.txt COPY requirements_py3.txt /code/dev/requirements_py3.txt RUN source activate p27 && pip install -r /code/dev/requirements_py2.txt RUN source deactivate RUN source activate p37 && pip install -r /code/dev/requirements_py3.txt RUN source deactivate CMD ["/bin/bash"] ================================================ FILE: test/__init__.py ================================================ __author__ = 'kensuke-mi' ================================================ FILE: test/common/__init__.py ================================================ ================================================ FILE: test/common/test_server_handler.py ================================================ #! -*- coding: utf-8 -*- # test module from JapaneseTokenizer.common import sever_handler # client module import six if six.PY2: from JapaneseTokenizer.jumanpp_wrapper.__jumanpp_wrapper_python2 import JumanppWrapper else: from JapaneseTokenizer.jumanpp_wrapper.__jumanpp_wrapper_python3 import JumanppWrapper # else import sys import unittest import os import time __author__ = 'kensuke-mi' class TestServerHandler(unittest.TestCase): @classmethod def setUpClass(cls): if six.PY3: cls.test_senetence = '紗倉 まな(さくらまな、1993年3月23日 - )は、日本のAV女優。' else: cls.test_senetence = u'紗倉 まな(さくらまな、1993年3月23日 - )は、日本のAV女優。' cls.jumanpp_command = "/usr/local/bin/jumanpp" def test_jumanpp_process_hanlder_normal(self): """It tests jumanpp process handler""" # normal test # jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command) result_jumanpp_analysis = jumanpp_process_handler.query(input_string=self.test_senetence) self.assertTrue(isinstance(result_jumanpp_analysis,six.text_type)) ## stop process ## jumanpp_process_handler.stop_process() ## delete instance ## del jumanpp_process_handler def test_jumanpp_process_handler_timeout_exception(self): """It tests the case which causes timeout exception""" with self.assertRaises(Exception) as exc: jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command, timeout_second=1) result_jumanpp_analysis = jumanpp_process_handler.query(input_string=self.test_senetence*100) exception_message = exc.exception jumanpp_process_handler.stop_process() def test_jumanpp_process_handler_init_exception(self): with self.assertRaises(Exception) as exc: jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command='hoge', timeout_second=1) exception_message = exc.exception def test_jumanpp_process_handler_huge_request(self): """It tests the case where a user sends too much request""" input_huge_request = [self.test_senetence] * 100 jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command) seq_result_jumanpp_analysis = [jumanpp_process_handler.query(input_string=sentence) for sentence in input_huge_request] self.assertTrue(isinstance(seq_result_jumanpp_analysis, list)) if __name__ == '__main__': unittest.main() ================================================ FILE: test/docker-compose-dev.yml ================================================ # 開発/test環境としてまとめてdocker環境を整えるためのcompose version: '3' services: dev_env_py2: build: context: ./ dockerfile: Dockerfile-dev volumes: - ..:/codes/ stdin_open: true tty: true command: bash -c "source /opt/conda/bin/activate p27 && pip install -r requirements_py2.txt" dev_env_py3: build: context: ./ dockerfile: Dockerfile volumes: - ..:/codes/ stdin_open: true tty: true command: bash -c "source /opt/conda/bin/activate p37 && pip install -r requirements_py3.txt" ================================================ FILE: test/docker-compose.yml ================================================ # 開発/test環境としてまとめてdocker環境を整えるためのcompose version: '3' services: test_env: build: context: ./ dockerfile: Dockerfile volumes: - ..:/codes/ stdin_open: true tty: true command: bash -c "juman -S && source /opt/conda/bin/activate p37 && cd /codes/ && python setup.py test && source deactivate && echo 'Python3 test done' && source /opt/conda/bin/activate p27 && cd /codes/ && python setup.py test && echo 'Python2 test done'" ================================================ FILE: test/requirements_py2.txt ================================================ pypandoc future six jaconv>=0.2 pip>=8.1.0 pexpect pyknp>=0.4.1 mecab-python typing neologdn kytea ================================================ FILE: test/requirements_py3.txt ================================================ pypandoc future six jaconv>=0.2 pip>=8.1.0 pexpect pyknp mecab-python3 neologdn kytea ================================================ FILE: test/resources/test/userdict.csv ================================================ さくらまな,-1,-1,-400,名詞,一般,*,*,*,*,さくらまな,*,*,* ================================================ FILE: test/test_all.py ================================================ __author__ = 'kensuke-mi' import sys import unittest import six python_version = sys.version_info def suite(): suite = unittest.TestSuite() if six.PY3: from .test_filter_python3 import TestFilter from .test_mecab_wrapper_python3 import TestMecabWrapperPython3 from .test_kytea_wrapper_python3 import TestKyteaWrapperPython3 from .test_juman_wrapper_python3 import TestJumanWrapperPython3 suite.addTest(unittest.makeSuite(TestFilter)) suite.addTest(unittest.makeSuite(TestKyteaWrapperPython3)) suite.addTest(unittest.makeSuite(TestMecabWrapperPython3)) suite.addTest(unittest.makeSuite(TestJumanWrapperPython3)) elif six.PY2: from .test_filter_python2 import TestFilter from .test_mecab_wrapper_python2 import TestMecabWrapperPython2 from .test_juman_wrapper_python2 import TestJumanWrapperPython2 from .test_kytea_wrapper_python2 import TestKyteaWrapperPython2 suite.addTest(unittest.makeSuite(TestFilter)) suite.addTest(unittest.makeSuite(TestKyteaWrapperPython2)) suite.addTest(unittest.makeSuite(TestMecabWrapperPython2)) suite.addTest(unittest.makeSuite(TestJumanWrapperPython2)) return suite def suite_with_jumanpp(): suite_obj = suite() if six.PY3: from .test_jumanpp_wrapper_python3 import TestJumanppWrapperPython3 suite_obj.addTest(suite_obj.addTest(unittest.makeSuite(TestJumanppWrapperPython3))) elif six.PY2: from .test_jumanpp_wrapper_python2 import TestJumanppWrapperPython2 suite_obj.addTest(suite_obj.addTest(unittest.makeSuite(TestJumanppWrapperPython2))) return suite_obj ================================================ FILE: test/test_filter_python2.py ================================================ #! -*- coding: utf-8 -*- import sys import unittest from JapaneseTokenizer.mecab_wrapper import MecabWrapper from JapaneseTokenizer.datamodels import TokenizedSenetence, FilteredObject, TokenizedResult import os __author__ = 'kensuke-mi' class TestFilter(unittest.TestCase): def setUp(self): '''紗倉 まな(さくらまな、1993年3月23日 - )は、日本のAV女優みたいだ。''' self.test_senetence = u'紗倉 まなは、日本のAV女優みたいで、うつくしい。\nそこで、ぼくはその1枚のはなやかな作品を見たいと思った。' self.stopword = ['AV'] self.pos_condition = [('名詞', '一般',), ('名詞', '固有名詞'), ('形容詞', '自立',), ('助詞', '格助詞', '引用')] self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv') def test_filtering(self): mecab_obj = MecabWrapper(dictType='ipadic') tokenized_sentence = mecab_obj.tokenize(sentence=self.test_senetence,is_feature=True).\ filter(pos_condition=self.pos_condition, stopwords=self.stopword) assert isinstance(tokenized_sentence, TokenizedSenetence) seq_except_pos = [(u'動詞',), (u'名詞', u'代名詞'), (u'名詞', u'接尾')] seq_match_pos = [(u'名詞',), (u'名詞', u'固有名詞',), (u'形容詞',), (u'形容詞', u'自立'),(u'助詞', u'格助詞', u'引用')] for token_obj in tokenized_sentence.tokenized_objects: assert isinstance(token_obj, TokenizedResult) pos_tuple = token_obj.tuple_pos # 結果に入っているべきではない品詞 # for except_pos in seq_except_pos: self.assertTrue(not set(except_pos).issubset(set(pos_tuple))) # 結果に入っているべき品詞 # bool_any = any(set(match_pos).issubset(set(pos_tuple)) for match_pos in seq_match_pos) self.assertTrue(bool_any) if __name__ == '__main__': unittest.main() ================================================ FILE: test/test_filter_python3.py ================================================ #! -*- coding: utf-8 -*- import sys import unittest from JapaneseTokenizer.mecab_wrapper import MecabWrapper from JapaneseTokenizer.datamodels import TokenizedSenetence, FilteredObject, TokenizedResult import os __author__ = 'kensuke-mi' class TestFilter(unittest.TestCase): def setUp(self): '''紗倉 まな(さくらまな、1993年3月23日 - )は、日本のAV女優みたいだ。''' self.test_senetence = '紗倉 まなは、日本のAV女優みたいで、うつくしい。そこで、ぼくはその1枚のはなやかな作品を見たいと思った。' self.stopword = ['AV', '女優'] self.pos_condition = [('名詞', '一般',), ('名詞', '固有名詞'), ('形容詞', '自立',), ('助詞', '格助詞', '引用')] self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv') def test_filtering(self): mecab_obj = MecabWrapper(dictType='ipadic') tokenized_sentence = mecab_obj.tokenize(sentence=self.test_senetence,is_feature=True).\ filter(pos_condition=self.pos_condition, stopwords=self.stopword) assert isinstance(tokenized_sentence, TokenizedSenetence) seq_except_pos = [('動詞',), ('名詞', '代名詞'), ('名詞', '接尾')] seq_match_pos = [('名詞',), ('名詞', '固有名詞',), ('形容詞',), ('形容詞', '自立'),('助詞', '格助詞', '引用')] for token_obj in tokenized_sentence.tokenized_objects: assert isinstance(token_obj, TokenizedResult) pos_tuple = token_obj.tuple_pos # 結果に入っているべきではない品詞 # for except_pos in seq_except_pos: self.assertTrue(not set(except_pos).issubset(set(pos_tuple))) # 結果に入っているべき品詞 # bool_any = any(set(match_pos).issubset(set(pos_tuple)) for match_pos in seq_match_pos) self.assertTrue(bool_any) # stopwordsのチェック self.assertTrue(token_obj.word_stem not in self.stopword) if __name__ == '__main__': unittest.main() ================================================ FILE: test/test_juman_wrapper_python2.py ================================================ #-*- encoding: utf-8 -*- # this test file does not work under pycharm # do your test with command line from __future__ import absolute_import from __future__ import division from future.utils import string_types, text_type from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject from JapaneseTokenizer.juman_wrapper import JumanWrapper import pyknp import unittest import sys import codecs import logging sys.stdin = codecs.getreader('utf_8')(sys.stdin) sys.stdout = codecs.getwriter('utf_8')(sys.stdout) logger = logging.getLogger(__file__) logger.level = logging.INFO class TestJumanWrapperPython2(unittest.TestCase): def setUp(self): pass def test_juman_wrapper(self): try: from pyknp import Juman juman = Juman(command='juman', jumanpp=False) result = juman.analysis(u"これはペンです。") logger.debug(','.join(mrph.midasi for mrph in result)) for mrph in result.mrph_list(): assert isinstance(mrph, pyknp.Morpheme) logger.debug(u"見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s" \ % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname)) except ImportError: logger.debug('skip test_juman_wrapper') def test_tokenize(self): """This test case checks juman_wrapper.tokenize """ logger.debug (u'Tokenize Test') test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" juman_wrapper = JumanWrapper() token_objects = juman_wrapper.tokenize(sentence=test_sentence, return_list=False, is_feature=True) assert isinstance(token_objects, TokenizedSenetence) for t_obj in token_objects.tokenized_objects: assert isinstance(t_obj, TokenizedResult) logger.debug(u"word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format( t_obj.word_surface, t_obj.word_stem, ' '.join(t_obj.tuple_pos), t_obj.misc_info )) assert isinstance(t_obj.word_surface, string_types) assert isinstance(t_obj.word_stem, string_types) assert isinstance(t_obj.tuple_pos, tuple) assert isinstance(t_obj.misc_info, dict) token_objects_list = token_objects.convert_list_object() assert isinstance(token_objects_list, list) logger.debug('-'*30) for stem_posTuple in token_objects_list: assert isinstance(stem_posTuple, tuple) word_stem = stem_posTuple[0] word_posTuple = stem_posTuple[1] assert isinstance(word_stem, string_types) assert isinstance(word_posTuple, tuple) logger.debug(u'word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple))) def test_filter_pos(self): """ """ logger.debug (u'Filtering Test. POS condition is only 名詞') test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" juman_wrapper = JumanWrapper() token_objects = juman_wrapper.tokenize(sentence=test_sentence, return_list=False, is_feature=True ) pos_condition = [(u'名詞', )] filtered_result = juman_wrapper.filter( parsed_sentence=token_objects, pos_condition=pos_condition ) assert isinstance(filtered_result, FilteredObject) for t_obj in filtered_result.tokenized_objects: assert isinstance(t_obj, TokenizedResult) logger.debug(u"word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format( t_obj.word_surface, t_obj.word_stem, ' '.join(t_obj.tuple_pos), t_obj.misc_info )) assert isinstance(t_obj.word_surface, string_types) assert isinstance(t_obj.word_stem, string_types) assert isinstance(t_obj.tuple_pos, tuple) assert isinstance(t_obj.misc_info, dict) assert t_obj.tuple_pos[0] == u'名詞' logger.debug('-'*30) for stem_posTuple in filtered_result.convert_list_object(): assert isinstance(stem_posTuple, tuple) word_stem = stem_posTuple[0] word_posTuple = stem_posTuple[1] assert isinstance(word_stem, string_types) assert isinstance(word_posTuple, tuple) logger.debug(u'word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple))) def test_stopwords(self): stopword = [u'AV', u'女優'] logger.debug (u'Stopwords Filtering Test. Stopwords is {}'.format(u','.join(stopword))) test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" juman_wrapper = JumanWrapper() token_objects = juman_wrapper.tokenize(sentence=test_sentence, return_list=False, is_feature=True ) filtered_result = juman_wrapper.filter( parsed_sentence=token_objects, stopwords=stopword ) check_flag = True for stem_posTuple in filtered_result.convert_list_object(): assert isinstance(stem_posTuple, tuple) word_stem = stem_posTuple[0] word_posTuple = stem_posTuple[1] assert isinstance(word_stem, string_types) assert isinstance(word_posTuple, tuple) logger.debug(u'word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple))) if word_stem in stopword: check_flag = False assert check_flag def test_juman_server_mode(self): ### test with server mode ### ### Attention: this method causes Error if you don't start JUMAN SERVER mode ### test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" juman_wrapper = JumanWrapper(server='localhost', port=32000) token_objects = juman_wrapper.tokenize(sentence=test_sentence, return_list=False, is_feature=True) self.assertTrue(isinstance(token_objects, TokenizedSenetence)) list_tokens = juman_wrapper.tokenize(sentence=test_sentence, return_list=True, is_feature=True) self.assertTrue(isinstance(list_tokens, list)) if __name__ == '__main__': unittest.main() ================================================ FILE: test/test_juman_wrapper_python3.py ================================================ #-*- encoding: utf-8 -*- # this test file does not work under pycharm # do your test with command line from pyknp import Juman from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject from JapaneseTokenizer.juman_wrapper import JumanWrapper import pyknp import unittest import os import logging import socket logger = logging.getLogger(__file__) logger.level = logging.INFO class TestJumanWrapperPython3(unittest.TestCase): def setUp(self): # this is under MacOSX10 self.path_to_juman_command = '/usr/local/bin/juman' if not os.path.exists(self.path_to_juman_command): self.path_to_juman_command = 'juman' def test_juman_wrapper(self): try: juman = Juman(command=self.path_to_juman_command) result = juman.analysis("これはペンです。") logger.debug(','.join(mrph.midasi for mrph in result)) for mrph in result.mrph_list(): assert isinstance(mrph, pyknp.Morpheme) logger.debug("見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s" \ % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname)) except ImportError: print('skip test_juman_wrapper') def test_tokenize(self): """This test case checks juman_wrapper.tokenize """ logger.debug('Tokenize Test') test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" juman_wrapper = JumanWrapper(command=self.path_to_juman_command) token_objects = juman_wrapper.tokenize(sentence=test_sentence, return_list=False, is_feature=True) assert isinstance(token_objects, TokenizedSenetence) for t_obj in token_objects.tokenized_objects: assert isinstance(t_obj, TokenizedResult) logger.debug("word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format( t_obj.word_surface, t_obj.word_stem, ' '.join(t_obj.tuple_pos), t_obj.misc_info )) assert isinstance(t_obj.word_surface, str) assert isinstance(t_obj.word_stem, str) assert isinstance(t_obj.tuple_pos, tuple) assert isinstance(t_obj.misc_info, dict) token_objects_list = token_objects.convert_list_object() assert isinstance(token_objects_list, list) logger.debug('-'*30) for stem_posTuple in token_objects_list: assert isinstance(stem_posTuple, tuple) word_stem = stem_posTuple[0] word_posTuple = stem_posTuple[1] assert isinstance(word_stem, str) assert isinstance(word_posTuple, tuple) logger.debug('word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple))) def test_filter_pos(self): """POS filteringのテスト """ logger.debug('Filtering Test. POS condition is only 名詞') test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" juman_wrapper = JumanWrapper(command=self.path_to_juman_command) token_objects = juman_wrapper.tokenize(sentence=test_sentence, return_list=False, is_feature=True) pos_condition = [('名詞', )] filtered_result = juman_wrapper.filter( parsed_sentence=token_objects, pos_condition=pos_condition ) assert isinstance(filtered_result, FilteredObject) for t_obj in filtered_result.tokenized_objects: assert isinstance(t_obj, TokenizedResult) logger.debug("word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format( t_obj.word_surface, t_obj.word_stem, ' '.join(t_obj.tuple_pos), t_obj.misc_info )) assert isinstance(t_obj.word_surface, str) assert isinstance(t_obj.word_stem, str) assert isinstance(t_obj.tuple_pos, tuple) assert isinstance(t_obj.misc_info, dict) assert t_obj.tuple_pos[0] == '名詞' logger.debug('-'*30) for stem_posTuple in filtered_result.convert_list_object(): assert isinstance(stem_posTuple, tuple) word_stem = stem_posTuple[0] word_posTuple = stem_posTuple[1] assert isinstance(word_stem, str) assert isinstance(word_posTuple, tuple) logger.debug('word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple))) def test_stopwords(self): """stopword除去のテスト""" stopword = ['AV', '女優'] logger.debug ('Stopwords Filtering Test. Stopwords is {}'.format(','.join(stopword))) test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" juman_wrapper = JumanWrapper(command=self.path_to_juman_command) token_objects = juman_wrapper.tokenize(sentence=test_sentence, return_list=False, is_feature=True ) filtered_result = juman_wrapper.filter( parsed_sentence=token_objects, stopwords=stopword ) check_flag = True for stem_posTuple in filtered_result.convert_list_object(): assert isinstance(stem_posTuple, tuple) word_stem = stem_posTuple[0] word_posTuple = stem_posTuple[1] assert isinstance(word_stem, str) assert isinstance(word_posTuple, tuple) logger.debug('word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple))) if word_stem in stopword: check_flag = False assert check_flag def test_juman_severmode(self): """* What you can do - juman server modeのテストを実施する """ logger.debug('Tokenize test with server mode') test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" # check socket s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) HOST = 'localhost' PORT = 32000 try: s.connect((HOST, PORT)) s.close() except: logger.warning("SKip server mode test because server is not working.") else: juman_wrapper = JumanWrapper(command=self.path_to_juman_command, server=HOST, port=PORT) token_objects = juman_wrapper.tokenize(sentence=test_sentence, return_list=False, is_feature=True) assert isinstance(token_objects, TokenizedSenetence) test_sentence = "ペルシア語(ペルシアご、ペルシア語: فارسی‌‎, پارسی‌; Fārsī, Pārsī)は、イランを中心とする中東地域で話される言語。" juman_wrapper = JumanWrapper(command=self.path_to_juman_command, server=HOST, port=PORT) list_token = juman_wrapper.tokenize(sentence=test_sentence, return_list=True, is_feature=True) assert isinstance(list_token, list) if __name__ == '__main__': unittest.main() ================================================ FILE: test/test_jumanpp_wrapper_python2.py ================================================ #-*- encoding: utf-8 -*- # this test file does not work under pycharm # do your test with command line from pyknp import Juman from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject from JapaneseTokenizer.jumanpp_wrapper.jumanpp_wrapper import JumanppWrapper, JumanppClient from JapaneseTokenizer.common.sever_handler import JumanppHnadler import pyknp import socket import unittest import os import logging logger = logging.getLogger(__file__) logger.level = logging.INFO class TestJumanppWrapperPython2(unittest.TestCase): def setUp(self): # this is under MacOSX10 self.path_to_juman_command = '/usr/local/bin/jumanpp' if not os.path.exists(self.path_to_juman_command): self.path_to_juman_command = 'jumanpp' def test_JumanppClient(self): test_sentence = u'外国人参政権を欲しい。' s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) HOST = 'localhost' PORT = 12000 try: s.connect((HOST, PORT)) s.close() except: logger.warning("SKip server mode test because server is not working.") else: client_obj = JumanppClient(hostname='localhost', port=12000) res = client_obj.query(sentence=test_sentence, pattern=r'EOS') del res def test_jumanpp_servermode(self): ### test with list return object ### test_sentence = u'外国人参政権を欲しい。' s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) HOST = 'localhost' PORT = 12000 try: s.connect((HOST, PORT)) s.close() except: logger.warning("SKip server mode test because server is not working.") else: jumanpp_tokenizer = JumanppWrapper(server='localhost', port=12000) list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True) assert isinstance(list_tokens, list) ### test with TokenizedSenetence return object ### tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False) assert isinstance(tokenized_obj, TokenizedSenetence) ### test with TokenizedSenetence return object and filter by chain expression ### pos_condtion = [('名詞', )] filtered_res = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False).filter(pos_condition=pos_condtion) assert isinstance(filtered_res, FilteredObject) assert isinstance(filtered_res.convert_list_object(), list) def test_jumanpp_servermode_stress(self): ### test with severmode with much stress ### test_sentence = u'外国人参政権を欲しい。' s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) HOST = 'localhost' PORT = 12000 try: s.connect((HOST, PORT)) s.close() except: logger.warning("SKip server mode test because server is not working.") else: jumanpp_tokenizer = JumanppWrapper(server='localhost', port=12000) for i in range(0, 1000): list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True) assert isinstance(list_tokens, list) assert u'外国' in test_sentence del jumanpp_tokenizer def test_jumanpp_localmode_pyexpect(self): test_sentence = u'外国人参政権を欲しい。' jumanpp_tokenizer = JumanppWrapper(command=self.path_to_juman_command, is_use_pyknp=False) self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)) list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True) assert isinstance(list_tokens, list) jumanpp_tokenizer = JumanppWrapper(command=self.path_to_juman_command, is_use_pyknp=False) self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)) tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False) assert isinstance(tokenized_obj, TokenizedSenetence) def test_jumanpp_huge_amount_text(self): """pexpectを利用した大量テキスト処理 & テキスト処理中のプロセス再起動""" logger.info('under testing of processing huge amount of text...') seq_test_sentence = [u'外国人参政権を欲しい。'] * 500 jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command) self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)) for i, test_s in enumerate(seq_test_sentence): tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_s) self.assertTrue(isinstance(tokenized_obj, TokenizedSenetence)) if not i == 0 and i % 100 == 0: """強制的にプロセスを殺して再起動""" logger.info('It forces stop unix process.') jumanpp_tokenizer.jumanpp_obj.restart_process() else: pass if __name__ == '__main__': unittest.main() ================================================ FILE: test/test_jumanpp_wrapper_python3.py ================================================ #-*- encoding: utf-8 -*- # this test file does not work under pycharm # do your test with command line from pyknp import Juman from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject from JapaneseTokenizer.jumanpp_wrapper.jumanpp_wrapper import JumanppWrapper, JumanppClient from JapaneseTokenizer.common.sever_handler import JumanppHnadler import pyknp import unittest import os import logging import socket logger = logging.getLogger(__file__) logger.level = logging.INFO class TestJumanppWrapperPython3(unittest.TestCase): def setUp(self): # this is under MacOSX10 self.path_to_juman_command = '/usr/local/bin/jumanpp' if not os.path.exists(self.path_to_juman_command): self.path_to_juman_command = 'jumanpp' def test_JumanppClient(self): test_sentence = '外国人参政権を欲しい。' # check socket s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) HOST = 'localhost' PORT = 12000 try: s.connect((HOST, PORT)) s.close() except: logger.warning("SKip server mode test because server is not working.") else: jumanpp_tokenizer = JumanppWrapper(server=HOST, port=PORT) client_obj = JumanppClient(hostname='localhost', port=12000) res = client_obj.query(sentence=test_sentence, pattern=rb'EOS') del res def test_jumanpp_servermode(self): ### test with list return object ### test_sentence = '外国人参政権を欲しい。' # check socket s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) HOST = 'localhost' PORT = 12000 try: s.connect((HOST, PORT)) s.close() except: logger.warning(msg='SKip server mode test because server is not working.') else: jumanpp_tokenizer = JumanppWrapper(server=HOST, port=PORT) list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True) assert isinstance(list_tokens, list) ### test with TokenizedSenetence return object ### tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False) assert isinstance(tokenized_obj, TokenizedSenetence) ### test with TokenizedSenetence return object and filter by chain expression ### pos_condtion = [('名詞',)] filtered_res = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False).filter( pos_condition=pos_condtion) assert isinstance(filtered_res, FilteredObject) assert isinstance(filtered_res.convert_list_object(), list) def test_jumanpp_servermode_stress(self): ### test with severmode with much stress ### test_sentence = '外国人参政権を欲しい。' # check socket s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) HOST = 'localhost' PORT = 12000 try: s.connect((HOST, PORT)) s.close() except: logger.warning(msg='SKip server mode test because server is not working.') else: jumanpp_tokenizer = JumanppWrapper(server='localhost', port=12000) for i in range(0, 1000): list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True) assert isinstance(list_tokens, list) assert '外国' in test_sentence del jumanpp_tokenizer def test_jumanpp_localmode_pyexpect(self): """pexpectを使ったプロセス呼び出しのテスト""" test_sentence = '外国人参政権を欲しい。' jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command) self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)) list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True) assert isinstance(list_tokens, list) jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command) self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)) tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False) assert isinstance(tokenized_obj, TokenizedSenetence) def test_jumanpp_huge_amount_text(self): """pexpectを利用した大量テキスト処理 & テキスト処理中のプロセス再起動""" logger.info('under testing of processing huge amount of text...') seq_test_sentence = ['外国人参政権を欲しい。'] * 500 jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command) self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler)) for i, test_s in enumerate(seq_test_sentence): tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_s) self.assertTrue(isinstance(tokenized_obj, TokenizedSenetence)) if not i == 0 and i % 100 == 0: """強制的にプロセスを殺して再起動""" logger.info('It forces stop unix process.') jumanpp_tokenizer.jumanpp_obj.restart_process() else: pass if __name__ == '__main__': unittest.main() ================================================ FILE: test/test_kytea_wrapper_python2.py ================================================ # -*- coding: utf-8 -*- from JapaneseTokenizer.kytea_wrapper import KyteaWrapper from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject import unittest class TestKyteaWrapperPython2(unittest.TestCase): def setUp(self): pass def test_tokenization(self): input_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" kytea_wrapper = KyteaWrapper() tokenized_result = kytea_wrapper.tokenize( sentence=input_sentence, normalize=True, return_list=False, is_feature=True ) assert isinstance(tokenized_result, TokenizedSenetence) for t_obj in tokenized_result.tokenized_objects: assert isinstance(t_obj, TokenizedResult) print('-'*30) tokenized_result_list = tokenized_result.convert_list_object() assert isinstance(tokenized_result_list, list) for t_obj_tuple in tokenized_result_list: assert isinstance(t_obj_tuple, tuple) def test_filter_pos(self): """ """ print (u'Filtering Test. POS condition is only 名詞') test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" kytea_wrapper = KyteaWrapper() tokenized_result = kytea_wrapper.tokenize( sentence=test_sentence, normalize=True, return_list=False, is_feature=True ) pos_condition = [(u'名詞', )] filtered_result = kytea_wrapper.filter( parsed_sentence=tokenized_result, pos_condition=pos_condition ) assert isinstance(filtered_result, FilteredObject) for t_obj in filtered_result.tokenized_objects: assert isinstance(t_obj, TokenizedResult) assert isinstance(t_obj.word_surface, unicode) assert isinstance(t_obj.word_stem, unicode) assert isinstance(t_obj.tuple_pos, tuple) assert isinstance(t_obj.misc_info, dict) assert t_obj.tuple_pos[0] == u'名詞' print('-'*30) for stem_posTuple in filtered_result.convert_list_object(): assert isinstance(stem_posTuple, tuple) word_stem = stem_posTuple[0] word_posTuple = stem_posTuple[1] assert isinstance(word_stem, unicode) assert isinstance(word_posTuple, tuple) def test_stopwords(self): stopword = [u'女優'] print (u'Stopwords Filtering Test. Stopwords is {}'.format(u','.join(stopword))) test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" kytea_wrapper = KyteaWrapper() token_objects = kytea_wrapper.tokenize(sentence=test_sentence, return_list=False, is_feature=True ) filtered_result = kytea_wrapper.filter( parsed_sentence=token_objects, stopwords=stopword ) check_flag = True print('-'*30) for stem_posTuple in filtered_result.convert_list_object(): assert isinstance(stem_posTuple, tuple) word_stem = stem_posTuple[0] word_posTuple = stem_posTuple[1] assert isinstance(word_stem, unicode) assert isinstance(word_posTuple, tuple) if word_stem in stopword: check_flag = False assert check_flag if __name__ == '__main__': unittest.main() ================================================ FILE: test/test_kytea_wrapper_python3.py ================================================ # -*- coding: utf-8 -*- from JapaneseTokenizer.kytea_wrapper import KyteaWrapper from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject import unittest class TestKyteaWrapperPython3(unittest.TestCase): def setUp(self): pass def test_tokenization(self): input_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" kytea_wrapper = KyteaWrapper() tokenized_result = kytea_wrapper.tokenize( sentence=input_sentence, normalize=True, return_list=False, is_feature=True ) assert isinstance(tokenized_result, TokenizedSenetence) for t_obj in tokenized_result.tokenized_objects: assert isinstance(t_obj, TokenizedResult) #print('-'*30) tokenized_result_list = tokenized_result.convert_list_object() assert isinstance(tokenized_result_list, list) for t_obj_tuple in tokenized_result_list: assert isinstance(t_obj_tuple, tuple) def test_filter_pos(self): """ """ # 'Filtering Test. POS condition is only 名詞') test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" kytea_wrapper = KyteaWrapper() tokenized_result = kytea_wrapper.tokenize( sentence=test_sentence, normalize=True, return_list=False, is_feature=True ) pos_condition = [('名詞', )] filtered_result = kytea_wrapper.filter( parsed_sentence=tokenized_result, pos_condition=pos_condition ) assert isinstance(filtered_result, FilteredObject) for t_obj in filtered_result.tokenized_objects: assert isinstance(t_obj, TokenizedResult) assert isinstance(t_obj.word_surface, str) assert isinstance(t_obj.word_stem, str) assert isinstance(t_obj.tuple_pos, tuple) assert isinstance(t_obj.misc_info, dict) assert t_obj.tuple_pos[0] == '名詞' for stem_posTuple in filtered_result.convert_list_object(): assert isinstance(stem_posTuple, tuple) word_stem = stem_posTuple[0] word_posTuple = stem_posTuple[1] assert isinstance(word_stem, str) assert isinstance(word_posTuple, tuple) def test_stopwords(self): stopword = ['女優'] # ('Stopwords Filtering Test. Stopwords is {}'.format(','.join(stopword))) test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。" kytea_wrapper = KyteaWrapper() token_objects = kytea_wrapper.tokenize(sentence=test_sentence, return_list=False, is_feature=True ) filtered_result = kytea_wrapper.filter( parsed_sentence=token_objects, stopwords=stopword ) check_flag = True for stem_posTuple in filtered_result.convert_list_object(): assert isinstance(stem_posTuple, tuple) word_stem = stem_posTuple[0] word_posTuple = stem_posTuple[1] assert isinstance(word_stem, str) assert isinstance(word_posTuple, tuple) if word_stem in stopword: check_flag = False assert check_flag if __name__ == '__main__': unittest.main() ================================================ FILE: test/test_mecab_wrapper_python2.py ================================================ #! -*- coding: utf-8 -*- __author__ = 'kensuke-mi' import sys import unittest from JapaneseTokenizer.mecab_wrapper.mecab_wrapper import MecabWrapper from JapaneseTokenizer.datamodels import TokenizedSenetence from six import string_types import os python_version = sys.version_info class TestMecabWrapperPython2(unittest.TestCase): def setUp(self): self.test_senetence = u'紗倉 まな(さくらまな、1993年3月23日 - )は、日本のAV女優。' self.test_sentence2 = u'午前零時。午前3時。3時。' self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv') def test_neologd_parse(self): """* Test case - neologd辞書で正しく分割できることを確認する """ mecab_obj = MecabWrapper(dictType='neologd') parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence) self.assertTrue(parsed_obj, TokenizedSenetence) self.assertTrue(isinstance(parsed_obj.convert_list_object(), list)) self.assertTrue(all(isinstance(mrph, string_types) for mrph in parsed_obj.convert_list_object())) parsed_obj = mecab_obj.tokenize(sentence=self.test_sentence2) self.assertTrue(parsed_obj, TokenizedSenetence) self.assertTrue(isinstance(parsed_obj.convert_list_object(), list)) self.assertTrue(all(isinstance(mrph, string_types) for mrph in parsed_obj.convert_list_object())) def test_default_parse(self): """* Test case - デフォルトの状態で動作を確認する """ dictType = "ipadic" mecab_obj = MecabWrapper(dictType=dictType) assert isinstance(mecab_obj, MecabWrapper) parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True) assert isinstance(parsed_obj, list) if python_version >= (3, 0, 0): for morph in parsed_obj: assert isinstance(morph, str) else: for morph in parsed_obj: assert isinstance(morph, string_types) def test_init_userdict(self): # test when user dictionary is called mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict) assert isinstance(mecab_obj, MecabWrapper) parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True) is_ok = False for morph in parsed_obj: if u'さくらまな' == morph: is_ok = True else: pass assert is_ok def test_parse_jumandic(self): with self.assertRaises(Exception): mecab_obj = MecabWrapper(dictType='jumandic') assert isinstance(mecab_obj, MecabWrapper) def test_init_alldict(self): """* Test case - すべての辞書を利用した場合の動作を確認する """ with self.assertRaises(Exception): mecab_obj = MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict) assert isinstance(mecab_obj, MecabWrapper) if __name__ == '__main__': unittest.main() ================================================ FILE: test/test_mecab_wrapper_python3.py ================================================ #! -*- coding: utf-8 -*- __author__ = 'kensuke-mi' import sys import unittest from JapaneseTokenizer.mecab_wrapper.mecab_wrapper import MecabWrapper from JapaneseTokenizer.datamodels import TokenizedSenetence import os python_version = sys.version_info class TestMecabWrapperPython3(unittest.TestCase): def setUp(self): self.test_senetence = '紗倉 まな(さくらまな、1993年3月23日 - )は、日本のAV女優。' self.test_sentence2 = '午前零時。午前3時。3時。' self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv') def test_neologd_parse(self): # test using neologd dictionary mecab_obj = MecabWrapper(dictType='neologd') parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence) self.assertTrue(parsed_obj, TokenizedSenetence) self.assertTrue(isinstance(parsed_obj.convert_list_object(), list)) self.assertTrue(all(isinstance(mrph, str) for mrph in parsed_obj.convert_list_object())) parsed_obj = mecab_obj.tokenize(sentence=self.test_sentence2) self.assertTrue(parsed_obj, TokenizedSenetence) self.assertTrue(isinstance(parsed_obj.convert_list_object(), list)) self.assertTrue(all(isinstance(mrph, str) for mrph in parsed_obj.convert_list_object())) def test_default_parse(self): # test default status dictType = "ipadic" mecab_obj = MecabWrapper(dictType=dictType) assert isinstance(mecab_obj, MecabWrapper) parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True) assert isinstance(parsed_obj, list) for morph in parsed_obj: assert isinstance(morph, str) parsed_obj = mecab_obj.tokenize(sentence=self.test_sentence2, return_list=True) assert isinstance(parsed_obj, list) for morph in parsed_obj: assert isinstance(morph, str) def test_parse_jumandic(self): mecab_obj = MecabWrapper(dictType='jumandic') assert isinstance(mecab_obj, MecabWrapper) parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False) assert isinstance(parsed_obj, TokenizedSenetence) for tokenized_obj in parsed_obj.tokenized_objects: if tokenized_obj.word_stem == '女優': # ドメイン:文化・芸術 is special output only in Jumandic assert 'ドメイン:文化・芸術' in tokenized_obj.analyzed_line def test_parse_userdic(self): pass def test_parse_dictionary_path(self): # put path to dictionary and parse sentence. path_default_ipadic = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd' if os.path.exists(path_default_ipadic): mecab_obj = MecabWrapper(dictType=None, path_dictionary=path_default_ipadic) assert mecab_obj._path_dictionary == path_default_ipadic parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False) assert isinstance(parsed_obj, TokenizedSenetence) def test_init_userdict(self): # this test should be error response. mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict) assert isinstance(mecab_obj, MecabWrapper) parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False) assert isinstance(parsed_obj, TokenizedSenetence) is_ok = False for tokenized_obj in parsed_obj.tokenized_objects: if tokenized_obj.word_stem == 'さくらまな': is_ok = True assert is_ok if __name__ == '__main__': unittest.main() ================================================ FILE: travis-mecab-install.sh ================================================ #!/bin/bash # from https://gist.github.com/dtan4/351d031bec0c3d45cd8f # see also http://qiita.com/dtan4/items/c6a087666296fbd5fffb base_dir=`pwd` wget -O mecab-0.996.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE' tar zxfv mecab-0.996.tar.gz cd mecab-0.996 ./configure --enable-utf8-only make make check sudo make install sudo ldconfig cd $base_dir wget -O mecab-ipadic-2.7.0-20070801.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM' tar zxfv mecab-ipadic-2.7.0-20070801.tar.gz cd mecab-ipadic-2.7.0-20070801 ./configure --with-charset=utf8 make sudo make install sudo ldconfig wget -O jumandic.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM' tar zxfv jumandic.tar.gz cd mecab-jumandic-7.0-20130310 ./configure --with-charset=utf8 make sudo make install sudo ldconfig cd $base_dir rm -rf mecab-0.996 mecab-ipadic-2.7.0-20070801 mecab-jumandic-7.0-20130310