Repository: Kensuke-Mitsuzawa/JapaneseTokenizers
Branch: master
Commit: 3bdfb6be73de
Files: 51
Total size: 140.3 KB
Directory structure:
gitextract_jdhxzz3y/
├── .gitignore
├── .travis.yml
├── JapaneseTokenizer/
│ ├── __init__.py
│ ├── common/
│ │ ├── __init__.py
│ │ ├── juman_utils.py
│ │ ├── sever_handler.py
│ │ ├── text_preprocess.py
│ │ └── timeout_handler.py
│ ├── datamodels.py
│ ├── init_logger.py
│ ├── juman_wrapper/
│ │ ├── __init__.py
│ │ └── juman_wrapper.py
│ ├── jumanpp_wrapper/
│ │ ├── __init__.py
│ │ └── jumanpp_wrapper.py
│ ├── kytea_wrapper/
│ │ ├── __init__.py
│ │ └── kytea_wrapper.py
│ ├── mecab_wrapper/
│ │ ├── __init__.py
│ │ └── mecab_wrapper.py
│ └── object_models.py
├── LICENSE.txt
├── MANIFEST.in
├── Makefile
├── README.md
├── examples/
│ ├── examples.py
│ ├── userdict.csv
│ └── userdict.dict
├── install_tokenizers.sh
├── setup.py
├── test/
│ ├── Dockerfile
│ ├── Dockerfile-dev
│ ├── __init__.py
│ ├── common/
│ │ ├── __init__.py
│ │ └── test_server_handler.py
│ ├── docker-compose-dev.yml
│ ├── docker-compose.yml
│ ├── requirements_py2.txt
│ ├── requirements_py3.txt
│ ├── resources/
│ │ └── test/
│ │ ├── userdict.csv
│ │ └── userdict.dict
│ ├── test_all.py
│ ├── test_filter_python2.py
│ ├── test_filter_python3.py
│ ├── test_juman_wrapper_python2.py
│ ├── test_juman_wrapper_python3.py
│ ├── test_jumanpp_wrapper_python2.py
│ ├── test_jumanpp_wrapper_python3.py
│ ├── test_kytea_wrapper_python2.py
│ ├── test_kytea_wrapper_python3.py
│ ├── test_mecab_wrapper_python2.py
│ └── test_mecab_wrapper_python3.py
└── travis-mecab-install.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.idea/
JapaneseTokenizer.egg-info/
build/
dist/
*eggs/
pyknp.egg-info/
.python-version
*pyc
morphogySplitters/
Mykytea-python/
.DS_Store
*tox
.cache/
python/
python2/
================================================
FILE: .travis.yml
================================================
language: python
python:
- 2.7
- 3.5
addons:
apt:
packages:
- git
- make
- curl
- xz-utils
- file
- pandoc
- libboost-all-dev
- language-pack-ja-base
- language-pack-ja
- ibus-mozc
- gcc-5
- g++-5
- build-essential
- swig
sources:
- ubuntu-toolchain-r-test
before_install:
- sudo apt-get update -qq
- sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1
- sudo update-locale LANG=ja_JP.UTF-8 LANGUAGE="ja_JP:ja"
- mkdir ./target
- export CC="gcc-5"
- export CXX="g++-5"
- export CFLAGS=-std=c++11
- export CXXFLAGS=-std=c++11
- sudo bash travis-mecab-install.sh
- which mecab-config
- sudo make install
- git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git
- cd mecab-ipadic-neologd && echo yes | sudo ./bin/install-mecab-ipadic-neologd && cd ../
- sudo juman -S
install:
- python --version
- python setup.py install
- pip install coveralls coverage nose
script:
- coverage run --source=JapaneseTokenizer setup.py test
after_success:
- coveralls
notifications:
email:
recipients:
- kensuke.mit@gmail.com
on_success: always
on_failure: always
================================================
FILE: JapaneseTokenizer/__init__.py
================================================
from JapaneseTokenizer.mecab_wrapper import MecabWrapper
from JapaneseTokenizer.juman_wrapper import JumanWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence
from JapaneseTokenizer.datamodels import FilteredObject
from JapaneseTokenizer.kytea_wrapper import KyteaWrapper
from JapaneseTokenizer.jumanpp_wrapper import JumanppWrapper
================================================
FILE: JapaneseTokenizer/common/__init__.py
================================================
__author__ = 'kensuke-mi'
================================================
FILE: JapaneseTokenizer/common/juman_utils.py
================================================
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence
from typing import Tuple
import pyknp
from six import text_type
"""These functions are for utilization of Juman"""
def extract_morphological_information(mrph_object, is_feature, is_surface):
    # type: (pyknp.Morpheme, bool, bool) -> TokenizedResult
    """Build a TokenizedResult from one pyknp morpheme.

    :param mrph_object: morpheme object produced by pyknp.
    :param is_feature: flag stored unchanged on the returned token.
    :param is_surface: flag stored unchanged on the returned token.
    :return: TokenizedResult carrying the surface form (``midasi``), the stem
        (``genkei``), the POS pair ``(hinsi, bunrui)`` and a dict of the
        remaining morpheme attributes.
    """
    assert isinstance(mrph_object, pyknp.Morpheme)
    assert isinstance(is_feature, bool)
    assert isinstance(is_surface, bool)

    # Attributes without a dedicated TokenizedResult field travel in misc_info.
    extra_attributes = {
        'katuyou1': mrph_object.katuyou1,
        'katuyou2': mrph_object.katuyou2,
        'imis': mrph_object.imis,
        'repname': mrph_object.repname
    }
    return TokenizedResult(
        node_obj=None,
        tuple_pos=(mrph_object.hinsi, mrph_object.bunrui),
        word_stem=mrph_object.genkei,
        word_surface=mrph_object.midasi,
        is_feature=is_feature,
        is_surface=is_surface,
        misc_info=extra_attributes
    )
def feature_parser(uni_feature, word_surface):
    # type: (text_type, text_type) -> Tuple[Tuple[text_type, text_type, text_type], text_type]
    """Parse the comma-separated POS feature string output by Mecab.

    :param uni_feature: feature string, fields separated by ``,``.
    :param word_surface: surface form of the word; used as the stem whenever
        the feature string does not have exactly 9 fields.
    :return: ``((pos1, pos2, pos3), word_stem)``. When the feature string
        contains no comma at all the pair ``('*', '*')`` (two plain strings)
        is returned instead.
    """
    list_feature_items = uni_feature.split(',')
    # if word has no feature at all
    if len(list_feature_items) == 1:
        return '*', '*'

    # A feature string with only two fields used to raise IndexError on the
    # third POS level; missing levels are filled with the '*' placeholder.
    padding = ['*'] * (3 - len(list_feature_items))
    tuple_pos = tuple((list_feature_items + padding)[:3])

    if len(list_feature_items) == 9:
        # without constraint (output looks like the normal mecab dictionary)
        word_stem = list_feature_items[6]
    else:
        # with constraint (output format depends on the user dictionary)
        word_stem = word_surface
    return tuple_pos, word_stem
================================================
FILE: JapaneseTokenizer/common/sever_handler.py
================================================
#! -*- coding: utf-8 -*-
import subprocess
from subprocess import Popen, PIPE, STDOUT
import multiprocessing
# socket object
import socket
# logger
from JapaneseTokenizer import init_logger
import logging
# typing
from typing import Union
# else
from six import text_type
import six
import pexpect
import shutil
import signal
import os
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
class ProcessDownException(Exception):
    """Error signalling that the handled process did not answer in time."""
class UnixProcessHandler(object):
    """Keep a command-line process alive (via pexpect) and query it line by line."""

    def __init__(self,
                 command,
                 option=None,
                 pattern='EOS',
                 timeout_second=10):
        # type: (text_type,text_type,text_type,int)->None
        """* Get communication with unix process using pexpect module.

        * Parameters
        - command: executable to launch.
        - option: command-line options appended after the command, or None.
        - pattern: line which marks the end of one answer of the process.
        - timeout_second: seconds to wait for one answer before giving up.
        """
        self.command = command
        self.timeout_second = timeout_second
        self.pattern = pattern
        self.option = option
        self.launch_process(command)

    def __del__(self):
        if hasattr(self, "process_analyzer"):
            self.process_analyzer.kill(sig=9)

    def _build_command_line(self):
        # type: ()->text_type
        """Return the command joined with its options (when options are given)."""
        if self.option is not None:
            return self.command + " " + self.option
        return self.command

    def launch_process(self, command):
        # type: (Union[bytes,text_type])->None
        """* What you can do
        - It starts process and keep it.
        * Raises
        - Exception: when the command is not available.
        """
        command_plus_option = self._build_command_line()
        if six.PY3:
            if shutil.which(command) is None:
                raise Exception("No command at {}".format(command))
        else:
            # python2 has no shutil.which; probe the command by running it once.
            doc_command_string = "echo '' | {}".format(command)
            command_check = os.system(doc_command_string)
            if not command_check == 0:
                raise Exception("No command at {}".format(command))
        self.process_analyzer = pexpect.spawnu(command_plus_option)
        self.process_id = self.process_analyzer.pid

    def restart_process(self):
        # type: ()->None
        """Kill the current process and spawn a fresh one with the same command line."""
        command_plus_option = self._build_command_line()
        self.process_analyzer.kill(sig=9)
        self.process_analyzer = pexpect.spawnu(command_plus_option)
        self.process_id = self.process_analyzer.pid

    def stop_process(self):
        # type: ()->bool
        """* What you can do
        - You're able to stop the process which this instance has now.
        """
        if hasattr(self, "process_analyzer"):
            self.process_analyzer.kill(sig=9)
        return True

    def __query(self, input_string):
        # type: (text_type)->text_type
        """* What you can do
        - It sends one line to the process and collects the answer up to the end pattern.
        - This function monitors time which takes for getting the result.
        * Raises
        - ProcessDownException: on timeout, or when the process closed its output.
        """
        signal.signal(signal.SIGALRM, self.__notify_handler)
        signal.alarm(self.timeout_second)
        try:
            self.process_analyzer.sendline(input_string)
            buffer = ""
            while True:
                line_string = self.process_analyzer.readline()  # type: text_type
                if line_string == "":
                    # An empty read means end-of-file: the process is gone, so
                    # fail now instead of spinning until the alarm fires.
                    raise ProcessDownException(
                        "The process closed its output before sending the end pattern.")
                stripped_line = line_string.strip()
                if stripped_line == input_string:
                    # Skip if process returns the same input string
                    continue
                buffer += line_string
                if stripped_line == self.pattern:
                    return buffer
        finally:
            # Always cancel the timer; a pending alarm would otherwise raise
            # ProcessDownException later, in unrelated code.
            signal.alarm(0)

    def __notify_handler(self, signum, frame):
        raise ProcessDownException("""It takes longer time than {time} seconds. You're able to try,
        1. Change your setting of 'timeout_second' parameter
        2. Run restart_process() method when the exception happens.""".format(**{"time": self.timeout_second}))

    def query(self, input_string):
        # type: (text_type)->text_type
        """Send input_string to the process and return its answer text."""
        return self.__query(input_string=input_string)
class JumanppHnadler(UnixProcessHandler):
    """UnixProcessHandler whose constructor names the executable ``jumanpp_command``."""

    def __init__(self,
                 jumanpp_command,
                 option=None,
                 pattern='EOS',
                 timeout_second=10):
        # type: (text_type,text_type,text_type,int)->None
        parent_arguments = dict(
            command=jumanpp_command,
            option=option,
            pattern=pattern,
            timeout_second=timeout_second,
        )
        super(JumanppHnadler, self).__init__(**parent_arguments)

    def launch_jumanpp_process(self, command):
        # type: (text_type)->None
        """Alias of launch_process()."""
        launched = self.launch_process(command)
        return launched
================================================
FILE: JapaneseTokenizer/common/text_preprocess.py
================================================
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from six import text_type
import jaconv
import six
import re
import unicodedata
from JapaneseTokenizer import init_logger
import logging
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
__author__ = 'kensuke-mi'
if six.PY2:
    def u(str):
        """Decode a utf-8 byte string into unicode (python2)."""
        return str.decode("utf-8")

    def b(str):
        """Return the byte string unchanged (python2)."""
        return str
else:  # python3
    def u(str):
        """Return the text unchanged (python3)."""
        return str

    def b(str):
        """Encode the text into utf-8 bytes (python3)."""
        return str.encode("utf-8")
# neologdn is optional: remember whether it could be imported so that
# normalize_text() can refuse the 'neologd' mode when it is missing.
try:
    import neologdn
except Exception:
    # Catch Exception rather than a bare except so that KeyboardInterrupt and
    # SystemExit raised during the import are not swallowed.
    logger.warning("neologdn package is not installed yet. You could not call neologd dictionary.")
    is_neologdn_valid = False
else:
    is_neologdn_valid = True
STRING_EXCEPTION = set([u('*')])
def denormalize_text(input_text):
    # type: (text_type)->text_type
    """* What you can do
    - It converts zenkaku (full-width) text back with jaconv.z2h
    * Note
    - ascii characters and digits are converted (ascii=True, digit=True)
    - kana is left as it is (kana=False)
    - a text contained in STRING_EXCEPTION is returned unchanged
    """
    if input_text not in STRING_EXCEPTION:
        return jaconv.z2h(input_text, kana=False, ascii=True, digit=True)
    return input_text
def normalize_text(input_text,
                   dictionary_mode='ipadic',
                   new_line_replaced='。',
                   is_replace_eos=True,
                   is_kana=True,
                   is_ascii=True,
                   is_digit=True):
    # type: (text_type,text_type,text_type,bool,bool,bool,bool)->text_type
    """* What you can do
    - It converts input-text into normalized-text which is good for tokenizer input.
    * Params
    - dictionary_mode: 'neologd' additionally applies neologdn.normalize; any other value uses the plain conversion.
    - new_line_replaced: a string which replaces from \\n string.
    - is_replace_eos: if False, new lines are kept and the text is only normalized.
    - is_kana, is_ascii, is_digit: passed to the plain conversion (not used in 'neologd' mode).
    * Raises
    - Exception: when dictionary_mode is 'neologd' but neologdn is not installed.
    """
    if is_replace_eos:
        without_new_line = input_text.replace('\n', new_line_replaced)
    else:
        # Keep the text itself. The previous code assigned new_line_replaced
        # here, which discarded the whole input.
        without_new_line = input_text

    if dictionary_mode == 'neologd':
        if not is_neologdn_valid:
            raise Exception("You could not call neologd dictionary bacause you do NOT install the package neologdn.")
        return neologdn.normalize(normalize_text_normal_ipadic(without_new_line))
    return normalize_text_normal_ipadic(without_new_line, kana=is_kana, ascii=is_ascii, digit=is_digit)
def normalize_text_normal_ipadic(input_text, kana=True, ascii=True, digit=True):
    # type: (text_type,bool,bool,bool)->text_type
    """
    * Converts the text with jaconv.h2z (hankaku into zenkaku)
    * kana / ascii / digit select which character classes are converted
    """
    zenkaku_text = jaconv.h2z(input_text, kana=kana, ascii=ascii, digit=digit)
    return zenkaku_text
================================================
FILE: JapaneseTokenizer/common/timeout_handler.py
================================================
#! -*- coding: utf-8 -*-
from functools import wraps
class TimeoutException(Exception):
    """Raised by the default timeout handler."""


def handler_func(msg):
    """Default timeout handler: raise TimeoutException carrying ``msg``."""
    raise TimeoutException(msg)


def on_timeout(limit, handler=handler_func, hint=None):
    """
    Decorator factory. If the decorated function has not finished within
    ``limit`` seconds, ``handler`` is called with a message built from
    hint/limit.

    @on_timeout(limit=3600, handler=notify_func, hint=u'long calculation')
    def long_time_function():

    :param limit: time limit in seconds, passed to signal.alarm.
    :param handler: callable taking one message string.
    :param hint: label of the decorated work, embedded in the message.
    """
    def notify_handler(signum, frame):
        handler("'%s' is not finished in %d second(s)." % (hint, limit))

    def __decorator(function):
        def __wrapper(*args, **kwargs):
            import signal
            signal.signal(signal.SIGALRM, notify_handler)
            signal.alarm(limit)
            try:
                return function(*args, **kwargs)
            finally:
                # Cancel the timer even when the function raises, so the
                # alarm cannot fire later in unrelated code.
                signal.alarm(0)
        return wraps(function)(__wrapper)
    return __decorator
================================================
FILE: JapaneseTokenizer/datamodels.py
================================================
#! -*- coding: utf-8 -*-
# normalize module #
from JapaneseTokenizer.common.text_preprocess import normalize_text, denormalize_text
# datemodels #
from MeCab import Node
# typing #
from typing import List, Union, Any, Tuple, Dict, Callable, Optional
from future.utils import text_type, string_types
import sys
import six
__author__ = 'kensuke-mi'
python_version = sys.version_info
def __is_sotpwords(token, stopwords):
    """Return True when token is contained in stopwords, otherwise False.
    """
    return token in stopwords
def __is_valid_pos(pos_tuple, valid_pos):
    # type: (Tuple[text_type,...],List[Tuple[text_type,...]])->bool
    """Check whether the token's POS matches one of the user-specified POS conditions.

    A condition matches when it equals the leading elements of pos_tuple
    (as many elements as the condition has). Returns True if any condition
    matches; else returns False.
    """
    # Every condition is evaluated before the verdict is taken.
    match_flags = [
        bool(condition == pos_tuple[:len(condition)])
        for condition in valid_pos
    ]
    return any(match_flags)
def filter_words(tokenized_obj, valid_pos, stopwords, check_field_name='stem'):
    # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type],text_type) -> FilteredObject
    """Keep only the tokens which pass the POS condition and the stopword condition.

    * Input
    - valid_pos
        - List of Tuple which has POS element to keep.
        - Keep in your mind, each tokenizer has different POS structure.
        - e.g. [('名詞', '固有名詞'), ('動詞', )]
    - stopwords
        - List of str, which you'd like to remove
        - e.g. ['残念', '今日']
    - check_field_name
        - 'stem' compares the stopwords against word_stem; any other value
          compares against word_surface.
    * Note
    - When both valid_pos and stopwords are empty lists, no token is kept.
    """
    assert isinstance(tokenized_obj, TokenizedSenetence)
    assert isinstance(valid_pos, list)
    assert isinstance(stopwords, list)

    pos_filter_on = valid_pos != []
    stopword_filter_on = stopwords != []

    filtered_tokens = []
    for token_obj in tokenized_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        if check_field_name == 'stem':
            word_to_check = token_obj.word_stem
        else:
            word_to_check = token_obj.word_surface
        is_stopword = __is_sotpwords(word_to_check, stopwords)
        pos_is_accepted = __is_valid_pos(token_obj.tuple_pos, valid_pos)

        if pos_filter_on and stopword_filter_on:
            # both conditions are ON
            keep_token = is_stopword is False and pos_is_accepted
        elif pos_filter_on:
            # only pos filtering is ON
            keep_token = pos_is_accepted
        elif stopword_filter_on:
            # only stopwords filtering is ON
            keep_token = is_stopword is False
        else:
            keep_token = False

        if keep_token:
            filtered_tokens.append(token_obj)

    return FilteredObject(
        sentence=tokenized_obj.sentence,
        tokenized_objects=filtered_tokens,
        pos_condition=valid_pos,
        stopwords=stopwords
    )
class TokenizedResult(object):
    """Value object holding the analysis result of one token."""

    def __init__(self,
                 node_obj,
                 tuple_pos,
                 word_stem,
                 word_surface,
                 is_feature=True,
                 is_surface=False,
                 misc_info=None,
                 analyzed_line=None):
        # type: (Optional[Node], Tuple[text_type, ...], str, str, bool, bool, Optional[Dict[str, Any]], str)->None
        """* Parameters
        - node_obj: MeCab Node object or None
        - tuple_pos: tuple of POS elements; a plain string is replaced by ('*', )
        - word_stem: stem form of the token
        - word_surface: surface form of the token
        - is_feature, is_surface, misc_info, analyzed_line: stored as given
        """
        assert isinstance(node_obj, (Node, type(None)))
        assert isinstance(tuple_pos, (string_types, tuple))
        assert isinstance(word_stem, (string_types))
        assert isinstance(word_surface, text_type)
        assert isinstance(misc_info, (type(None), dict))

        self.node_obj = node_obj
        self.word_stem = word_stem
        self.word_surface = word_surface
        self.is_surface = is_surface
        self.is_feature = is_feature
        self.misc_info = misc_info
        self.analyzed_line = analyzed_line

        if isinstance(tuple_pos, tuple):
            pos_to_store = tuple_pos
        elif isinstance(tuple_pos, string_types):
            # A bare string carries no POS hierarchy; store the placeholder.
            pos_to_store = ('*', )
        else:
            raise Exception('Error while parsing feature object. {}'.format(tuple_pos))
        self.tuple_pos = pos_to_store
class TokenizedSenetence(object):
    """A sentence together with the TokenizedResult objects obtained from it."""

    def __init__(self, sentence, tokenized_objects, string_encoding='utf-8'):
        # type: (text_type, List[TokenizedResult], text_type)->None
        """* Parameters
        - sentence: sentence
        - tokenized_objects: list of TokenizedResult object
        - string_encoding: Encoding type of string type. This option is used only under python2.x
        """
        assert isinstance(sentence, text_type)
        assert isinstance(tokenized_objects, list)
        self.sentence = sentence
        self.tokenized_objects = tokenized_objects
        self.string_encoding = string_encoding

    def __extend_token_object(self, token_object,
                              is_denormalize=True,
                              func_denormalizer=denormalize_text):
        # type: (TokenizedResult,bool,Callable[[str],str])->Tuple
        """Turn one token object into its output form.

        The word is the surface form when ``is_surface`` is set, else the stem;
        it is passed through func_denormalizer when is_denormalize is true.
        With ``is_feature`` set the result is ``(word, tuple_pos)``, otherwise
        the word alone.
        """
        assert isinstance(token_object, TokenizedResult)

        if token_object.is_surface == True:
            word = token_object.word_surface
        else:
            word = token_object.word_stem

        if is_denormalize:
            word = func_denormalizer(word)

        if token_object.is_feature == True:
            return (word, token_object.tuple_pos)
        return word

    def convert_list_object(self,
                            is_denormalize=True,
                            func_denormalizer=denormalize_text):
        # type: (bool,Callable[[str],str])->List[Union[str, Tuple[str,...]]]
        """* What you can do
        - You extract string object from TokenizedResult object
        * Args
        - is_denormalize: boolean object. True; it makes denormalize string
        - func_denormalizer: callable object. de-normalization function.
        """
        sentence_in_list_obj = []
        for token_object in self.tokenized_objects:
            sentence_in_list_obj.append(
                self.__extend_token_object(token_object, is_denormalize, func_denormalizer))
        return sentence_in_list_obj

    def __convert_string_type(self, p_c_tuple):
        # type: (Tuple[text_type,...])->Tuple[text_type]
        """* What you can do
        - it normalizes string types into str
        - under python2.x a str element is decoded with self.string_encoding
        """
        if not isinstance(p_c_tuple, tuple):
            raise Exception('Pos condition expects tuple of string. However = {}'.format(p_c_tuple))

        converted = []
        for pos_element in p_c_tuple:
            if six.PY2 and isinstance(pos_element, str):
                # str into unicode if python2.x
                converted.append(pos_element.decode(self.string_encoding))
            elif (six.PY2 and isinstance(pos_element, text_type)) or six.PY3:
                converted.append(pos_element)
            else:
                raise Exception()
        return tuple(converted)

    def __check_pos_condition(self, pos_condistion):
        # type: (List[Tuple[text_type, ...]])->List[Tuple[text_type, ...]]
        """* What you can do
        - Check your pos condition
        - It converts character type into unicode if python version is 2.x
        """
        assert isinstance(pos_condistion, list)
        checked_conditions = []
        for p_c_tuple in pos_condistion:
            checked_conditions.append(self.__convert_string_type(p_c_tuple))
        return checked_conditions

    def filter(self,
               pos_condition=None,
               stopwords=None,
               is_normalize=True,
               func_normalizer=normalize_text,
               check_field_name='stem'):
        # type: (List[Tuple[text_type,...]], List[text_type], bool, Callable[[text_type], text_type],text_type)->FilteredObject
        """* What you can do
        - It filters out token which does NOT meet the conditions (stopwords & part-of-speech tag)
        - Under python2.x, pos_condition & stopwords are converted into unicode type.
        * Parameters
        - pos_condition: list of part-of-speech(pos) conditions. Each condition is a tuple of variable length.
            A longer tuple specifies a deeper level of the pos hierarchy.
            The hierarchy of pos condition follows definition of dictionary.
            - For example, in mecab you can take words with 名詞 if ('名詞',)
            - For example, in mecab you can take words with 名詞-固有名詞 if ('名詞', '固有名詞')
        - stopwords: list of word which you would like to remove
        - is_normalize: Boolean flag for normalize stopwords.
        - func_normalizer: Function object for normalization. The function object must be the same one as when you use tokenize.
        - check_field_name: Put field name to check if stopword or NOT. Kytea does not have stem form of word, put 'surface' instead.
        * Example
        - pos_condition = [('名詞', '一般'), ('形容詞', '自立'), ('助詞', '格助詞', '一般')]
        - stopwords = ['これ', 'それ']
        """
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))

        if stopwords is None:
            s_words = []
        else:
            if six.PY2 and all((isinstance(s, str) for s in stopwords)):
                # under python2.x, from str into unicode
                unified_stopwords = [s.decode(self.string_encoding) for s in stopwords]
            else:
                unified_stopwords = stopwords
            if is_normalize:
                s_words = [func_normalizer(s) for s in unified_stopwords]
            else:
                s_words = unified_stopwords

        p_condition = [] if pos_condition is None else self.__check_pos_condition(pos_condition)

        filtered_object = filter_words(
            tokenized_obj=self,
            valid_pos=p_condition,
            stopwords=s_words,
            check_field_name=check_field_name
        )
        assert isinstance(filtered_object, FilteredObject)
        return filtered_object
class FilteredObject(TokenizedSenetence):
    """TokenizedSenetence which also remembers the filter conditions applied to it."""

    def __init__(self, sentence, tokenized_objects, pos_condition, stopwords):
        # type: (str, List[TokenizedResult], List[Tuple[str, ...]], List[str])->None
        """* Parameters
        - sentence, tokenized_objects: handed to TokenizedSenetence
        - pos_condition: pos conditions used for filtering
        - stopwords: stopwords used for filtering
        """
        parent = super(FilteredObject, self)
        parent.__init__(sentence=sentence, tokenized_objects=tokenized_objects)
        self.pos_condition = pos_condition
        self.stopwords = stopwords
================================================
FILE: JapaneseTokenizer/init_logger.py
================================================
LOGGER_NAME = 'JapaneseTokenizer'
import logging
import sys
from logging import getLogger, Formatter, Logger, StreamHandler
# Formatter
custmoFormatter = Formatter(
    fmt='[%(asctime)s]%(levelname)s - %(filename)s#%(funcName)s:%(lineno)d: %(message)s',
    # '%Y' needs its leading '%'; a bare 'Y' was printed literally instead of the year.
    datefmt='%Y/%m/%d %H:%M:%S'
)
# StreamHandler: messages of every level from DEBUG upwards go to stderr.
STREAM_LEVEL = logging.DEBUG
STREAM_FORMATTER = custmoFormatter
STREAM = sys.stderr
st_handler = StreamHandler(stream=STREAM)
st_handler.setLevel(STREAM_LEVEL)
st_handler.setFormatter(STREAM_FORMATTER)
def init_logger(logger):
    # type: (logging.Logger) -> logging.Logger
    """Attach the module's stream handler to ``logger``, switch off propagation
    to ancestor loggers and return the same logger object."""
    logger.propagate = False
    logger.addHandler(st_handler)
    return logger
================================================
FILE: JapaneseTokenizer/juman_wrapper/__init__.py
================================================
__author__ = 'kensuke-mi'
from .juman_wrapper import JumanWrapper
================================================
FILE: JapaneseTokenizer/juman_wrapper/juman_wrapper.py
================================================
# -*- coding: utf-8 -*-
# package module
from JapaneseTokenizer.object_models import WrapperBase
from JapaneseTokenizer.common import text_preprocess
from JapaneseTokenizer.datamodels import FilteredObject, TokenizedResult, TokenizedSenetence
from JapaneseTokenizer import init_logger
from JapaneseTokenizer.common.sever_handler import JumanppHnadler
# else
from typing import List, Union, Callable, Tuple
from six import text_type
from pyknp import MList
import logging
import sys
import os
import six
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
__author__ = 'kensuke-mi'
python_version = sys.version_info
try:
import pyknp
except ImportError:
logger.warning(msg='pyknp is not ready to use. Install first if you would like to use pyknp wrapper.')
if six.PY3:
    import socket
    import re

    class MonkeyPatchSocket(object):
        """* Class for overwriting pyknp.Socket because it is only for python2.x"""

        def __init__(self, hostname, port, option=None):
            """Connect to hostname:port, optionally send ``option`` (bytes),
            then wait until the server has answered with b"OK".

            * Raises
            - ConnectionError: when the server closes the connection before answering.
            """
            self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self.sock.connect((hostname, port))
            if option is not None:
                self.sock.send(option)
            data = b""
            while b"OK" not in data:
                chunk = self.sock.recv(1024)
                if not chunk:
                    # recv() returns b"" once the peer has closed the
                    # connection; without this check the loop never ends.
                    raise ConnectionError("Server closed the connection before sending OK.")
                # Accumulate so that an "OK" split over two chunks is still seen.
                data += chunk

        def __del__(self):
            # self.sock is missing when socket creation itself failed.
            sock = getattr(self, 'sock', None)
            if sock:
                sock.close()

        def query(self, sentence, pattern):
            # type: (str,str)->str
            """Send one sentence and return the decoded answer.

            Data is read until the accumulated bytes match ``pattern``
            (used as a regular expression).

            * Raises
            - ConnectionError: when the server closes the connection before the pattern arrives.
            """
            assert(isinstance(sentence, six.text_type))
            sentence_bytes = sentence.encode('utf-8').strip()
            # Accept a pattern that is already bytes as well as text.
            pattern_bytes = pattern if isinstance(pattern, bytes) else pattern.encode('utf-8')
            self.sock.sendall(sentence_bytes + b"\n")
            recv = self.sock.recv(1024)
            assert isinstance(recv, bytes)
            while not re.search(pattern_bytes, recv):
                data = self.sock.recv(1024)
                if not data:
                    raise ConnectionError("Server closed the connection before the end pattern was received.")
                recv = recv + data
            return recv.strip().decode('utf-8')
class JumanWrapper(WrapperBase):
    """Tokenizer wrapper for Juman.

    The analyzer is reached in one of three ways, chosen in __init__:
    a Juman server through pyknp, a subprocess managed by pyknp, or a
    subprocess managed by pexpect (JumanppHnadler).
    """

    def __init__(self,
                 command='juman',
                 server=None,
                 port=32000,
                 timeout=30,
                 rcfile=None,
                 option='-e2 -B',
                 pattern='EOS',
                 is_use_pyknp=False,
                 **args):
        # type: (text_type, text_type, int, int, text_type, Union[bytes, text_type], Union[bytes, text_type], bool, **str)->None
        """* Class to call Juman tokenizer

        * Parameters
        - command: juman executable.
        - server, port: address of a juman server; server=None means no server mode.
        - timeout: timeout handed to the underlying handler.
        - rcfile: path to a juman rc file; must exist when given.
        - option, pattern: command options and the end-of-answer pattern.
        - is_use_pyknp: use pyknp's subprocess instead of pexpect. Forced to True on Windows.
        """
        self.timeout = timeout
        self.pattern = pattern
        self.option = option
        self.command = command
        if not rcfile is None and not os.path.exists(rcfile):
            # NOTE(review): the exception type is kept for callers that catch it,
            # although the condition is "file missing".
            raise FileExistsError('rcfile does not exist at {}'.format(rcfile))
        if not server is None:
            # It converts from str into bytes only for sever mode #
            self.option = self.option.encode('utf-8')  # type: Union[str,bytes]
            self.pattern = self.pattern.encode('utf-8')  # type: Union[str,bytes]

        # check os #
        if os.name == 'nt':
            if not is_use_pyknp:
                logger.warning(msg='It forces is_use_pyknp = True on Windows.')
            # Update the local flag too: the branch below reads it, so setting
            # only the attribute left the forced value without effect.
            is_use_pyknp = True
        self.is_use_pyknp = is_use_pyknp

        if server is not None:
            # use server mode #
            self.juman = pyknp.Juman(command=command, server=server, port=port,
                                     timeout=self.timeout, rcfile=rcfile, option=option,
                                     pattern=pattern, jumanpp=False, **args)
            if six.PY3:
                # It overwrites juman_lines() method #
                self.juman.juman_lines = self.__monkey_patch_juman_lines
        elif is_use_pyknp and server is None:
            # use unix process with pyknp
            self.juman = pyknp.Juman(command=command, server=server, port=port,
                                     timeout=self.timeout, rcfile=rcfile, option=option,
                                     pattern=pattern, jumanpp=False, **args)
        else:
            # use unix process with pexpect(RECOMMENDED) #
            self.juman = JumanppHnadler(jumanpp_command=command,
                                        option=self.option,
                                        pattern=self.pattern,
                                        timeout_second=self.timeout)

    def __del__(self):
        if hasattr(self, "juman"):
            if isinstance(self.juman, JumanppHnadler):
                self.juman.stop_process()

    def __monkey_patch_juman_lines(self, input_str):
        # type: (text_type)->text_type
        """* What you can do
        - It overwrites juman_line() method because this method causes TypeError in python3
        """
        assert isinstance(self.juman, pyknp.Juman)
        if not self.juman.socket and not self.juman.subprocess:
            if self.juman.server is not None:
                self.juman.socket = MonkeyPatchSocket(self.juman.server, self.juman.port, b"RUN -e2\n")
            else:
                command = "%s %s" % (self.juman.command, self.juman.option)
                if self.juman.rcfile:
                    command += " -r %s" % self.juman.rcfile
                self.juman.subprocess = pyknp.Subprocess(command)
        if self.juman.socket:
            return self.juman.socket.query(input_str, pattern=self.juman.pattern)
        return self.juman.subprocess.query(input_str, pattern=self.juman.pattern)

    def __extract_morphological_information(self, mrph_object, is_feature, is_surface):
        """This method extracts morphological information from token object.
        """
        assert isinstance(mrph_object, pyknp.Morpheme)
        assert isinstance(is_feature, bool)
        assert isinstance(is_surface, bool)
        surface = mrph_object.midasi
        word_stem = mrph_object.genkei
        tuple_pos = (mrph_object.hinsi, mrph_object.bunrui)
        misc_info = {
            'katuyou1': mrph_object.katuyou1,
            'katuyou2': mrph_object.katuyou2,
            'imis': mrph_object.imis,
            'repname': mrph_object.repname
        }
        token_object = TokenizedResult(
            node_obj=None,
            tuple_pos=tuple_pos,
            word_stem=word_stem,
            word_surface=surface,
            is_feature=is_feature,
            is_surface=is_surface,
            misc_info=misc_info
        )
        return token_object

    def call_juman_interface(self, input_str):
        # type: (text_type)->MList
        """Send input_str to the configured backend and return the analysis as MList.

        With the pexpect backend, a UnicodeDecodeError triggers one restart of
        the process followed by one retry.
        """
        if isinstance(self.juman, pyknp.Juman):
            result = self.juman.analysis(input_str)
            return result
        elif isinstance(self.juman, JumanppHnadler):
            try:
                result_analysis = self.juman.query(input_str)
            except UnicodeDecodeError:
                logger.warning(msg="Process is down by some reason. It restarts process automatically.")
                self.juman.restart_process()
                result_analysis = self.juman.query(input_string=input_str)
            return MList(result_analysis)
        else:
            raise Exception('Not defined.')

    def tokenize(self,
                 sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[str], text_type])->Union[List[text_type], TokenizedSenetence]
        """This method returns tokenized result.

        - normalize: if True (default) the sentence is passed through
          func_normalizer before analysis; if False it is analyzed as given.
        - If return_list==True, this method returns the list produced by
          TokenizedSenetence.convert_list_object().
        - If return_list==False (default), this method returns TokenizedSenetence object.
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, text_type)
        # Honour the normalize flag; it used to be validated but then ignored.
        if normalize:
            normalized_sentence = func_normalizer(sentence)
        else:
            normalized_sentence = sentence
        result = self.call_juman_interface(normalized_sentence)
        token_objects = [
            self.__extract_morphological_information(
                mrph_object=morph_object,
                is_surface=is_surface,
                is_feature=is_feature
            )
            for morph_object in result]
        tokenized_objects = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=token_objects
        )
        if return_list:
            return tokenized_objects.convert_list_object()
        return tokenized_objects

    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
        # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type])->FilteredObject
        """Apply parsed_sentence.filter() with the given pos conditions and stopwords."""
        assert isinstance(parsed_sentence, TokenizedSenetence)
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))
        return parsed_sentence.filter(pos_condition, stopwords)
================================================
FILE: JapaneseTokenizer/jumanpp_wrapper/__init__.py
================================================
from .jumanpp_wrapper import JumanppWrapper
================================================
FILE: JapaneseTokenizer/jumanpp_wrapper/jumanpp_wrapper.py
================================================
#! -*- coding: utf-8 -*-
from pyknp import Juman
from pyknp import MList
# modules
from JapaneseTokenizer.object_models import WrapperBase
from JapaneseTokenizer.common import text_preprocess, juman_utils
from JapaneseTokenizer.common.sever_handler import JumanppHnadler, ProcessDownException
from JapaneseTokenizer import init_logger
from JapaneseTokenizer.datamodels import FilteredObject, TokenizedSenetence
from typing import List, Dict, Tuple, Union, TypeVar, Any, Callable
# timeout
from JapaneseTokenizer.common.timeout_handler import on_timeout
from six import text_type
import logging
import sys
import socket
import six
import re
import os
__author__ = 'kensuke-mi'
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
python_version = sys.version_info
ContentsTypes = TypeVar('T')
try:
import pyknp
except ImportError:
logger.warning(msg='pyknp is not ready to use. Install first if you would like to use pyknp wrapper.')
if six.PY2:
    ConnectionRefusedError = Exception

    class JumanppClient(object):
        """Class for receiving data as client"""

        def __init__(self, hostname, port, timeout=50, option=None):
            # type: (text_type, int, int, Dict[text_type,Any])->None
            """Connect to hostname:port; ``option`` (when given) is sent right
            after connecting and ``timeout`` becomes the socket timeout.

            * Raises
            - Exception: when the connection can not be established.
            """
            try:
                self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                # Accept the port as any string type, not only unicode.
                if isinstance(port, six.string_types):
                    port = int(port)
                self.sock.connect((hostname, port))
            except Exception:
                raise Exception("There is no jumanpp server hostname={}, port={}".format(hostname, port))
            if option is not None:
                self.sock.send(option)
            self.sock.settimeout(timeout)

        def __del__(self):
            # self.sock is missing when socket creation itself failed.
            sock = getattr(self, 'sock', None)
            if sock:
                sock.close()

        def query(self, sentence, pattern):
            # type: (text_type, bytes) -> text_type
            """Send one sentence and return the decoded answer.

            Data is read until the accumulated bytes match ``pattern``
            (used as a regular expression).

            * Raises
            - socket.error: when the server closes the connection before the pattern arrives.
            """
            assert (isinstance(sentence, six.text_type))
            self.sock.sendall("%s\n" % sentence.encode('utf-8').strip())
            recv = self.sock.recv(1024)
            assert isinstance(recv, bytes)
            while not re.search(pattern, recv):
                data = self.sock.recv(1024)
                if not data:
                    # An empty read means the peer closed the connection;
                    # without this check the loop never ends.
                    raise socket.error("jumanpp server closed the connection before the end pattern was received.")
                recv = "%s%s" % (recv, data)
            return recv.strip().decode('utf-8')
else:
    class JumanppClient(object):
        """Class for receiving data as client"""

        def __init__(self, hostname, port, timeout=50, option=None):
            # type: (text_type, int, int, Dict[text_type,Any])->None
            """Connect to hostname:port; ``option`` (when given) is sent right
            after connecting and ``timeout`` becomes the socket timeout.

            * Raises
            - Exception: when the server refuses the connection.
            """
            try:
                self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                if isinstance(port, str):
                    port = int(port)
                self.sock.connect((hostname, port))
            except ConnectionRefusedError:
                raise Exception("There is no jumanpp server hostname={}, port={}".format(hostname, port))
            if option is not None:
                self.sock.send(option)
            self.sock.settimeout(timeout)

        def __del__(self):
            # self.sock is missing when socket creation itself failed.
            sock = getattr(self, 'sock', None)
            if sock:
                sock.close()

        def query(self, sentence, pattern):
            # type: (str, Union[str,bytes]) -> str
            """Send one sentence and return the decoded answer.

            Data is read until the accumulated bytes match ``pattern``
            (used as a regular expression; text patterns are utf-8 encoded).

            * Raises
            - socket.error: when the server closes the connection before the pattern arrives.
            """
            assert (isinstance(sentence, six.text_type))
            if isinstance(pattern, str):
                pattern = pattern.encode('utf-8')
            # Plain concatenation instead of bytes %-formatting, which only
            # exists from python 3.5 on.
            self.sock.sendall(sentence.encode('utf-8').strip() + b"\n")
            recv = self.sock.recv(1024)
            assert isinstance(recv, bytes)
            while not re.search(pattern, recv):
                data = self.sock.recv(1024)
                if not data:
                    # An empty read means the peer closed the connection;
                    # without this check the loop never ends.
                    raise socket.error("jumanpp server closed the connection before the end pattern was received.")
                recv = recv + data
            return recv.strip().decode('utf-8')
class JumanppWrapper(WrapperBase):
    """Class for Juman++"""

    def __init__(self,
                 command='jumanpp',
                 timeout=30,
                 pattern=r'EOS',
                 server=None,
                 port=12000,
                 is_use_pyknp=False,
                 **args):
        # type: (text_type, int, text_type, text_type, int, bool, **Any) -> None
        """* What you can do
        - You can select backend process of jumanpp.
            - jumanpp-pexpect: It calls jumanpp on your local machine. It keeps jumanpp process running.
            - jumanpp-pyknp: It calls jumanpp on your local machine. It launches jumanpp process every time you call.
              Thus, this is slower than jumanpp-pexpect
            - jumanpp-server: It calls jumanpp running somewhere else. Keep in mind, you need a jumanpp server process somewhere.

        * Parameters
        - command: command name (or path) of jumanpp on your local machine.
        - timeout: Time to wait for the jumanpp process.
        - pattern: pattern which marks the end of jumanpp output.
        - is_use_pyknp: bool flag to decide if you use pyknp as backend process. If True; you use pyknp. False; you use pexpect.
          pexpect is much faster than pyknp. You can not use pexpect on Windows.
        - server: hostname where jumanpp is running
        - port: port number where jumanpp is running
        - args: extra keyword arguments, passed to pyknp.Juman only.
        """
        self.eos_pattern = pattern
        self.is_use_pyknp = is_use_pyknp
        if six.PY2:
            self.dummy_text = 'これはダミーテキストです'.decode('utf-8')
        elif six.PY3:
            self.dummy_text = 'これはダミーテキストです'

        if os.name == 'nt':
            # It forces to use pyknp if it runs on Windows.
            if not self.is_use_pyknp:
                logger.warning(msg="You're not able to use pexpect in Windows. It forced to set is_use_pyknp = True")
            self.is_use_pyknp = True

        if server is not None:
            # jumanpp-server #
            # The end pattern is not needed here; it is given to JumanppClient.query() on every call.
            self.jumanpp_obj = JumanppClient(hostname=server, port=port, timeout=timeout)
        elif self.is_use_pyknp:
            # jumanpp-pyknp #
            logger.debug('jumanpp wrapper is initialized with pyknp package')
            self.jumanpp_obj = Juman(
                command=command,
                timeout=timeout,
                pattern=pattern,
                jumanpp=True,
                **args)
        else:
            # jumanpp-pexpect #
            logger.debug('jumanpp wrapper is initialized with pexpect unix handler')
            self.jumanpp_obj = JumanppHnadler(jumanpp_command=command, timeout_second=timeout, pattern=pattern)  # type: JumanppHnadler
            # put dummy sentence to avoid exception just after command initialization #
            self.jumanpp_obj.query(self.dummy_text)

    def __del__(self):
        # jumanpp_obj does not exist when __init__ failed before a backend was created.
        if not hasattr(self, "jumanpp_obj"):
            return
        if isinstance(self.jumanpp_obj, JumanppClient):
            sock = getattr(self.jumanpp_obj, 'sock', None)
            if sock:
                sock.close()
        elif isinstance(self.jumanpp_obj, JumanppHnadler):
            self.jumanpp_obj.stop_process()
        else:
            del self.jumanpp_obj

    def _restart_and_query(self, input_str):
        # type: (text_type) -> text_type
        """Restart the local jumanpp process, warm it up with the dummy text and send input_str again."""
        self.jumanpp_obj.restart_process()
        self.jumanpp_obj.query(self.dummy_text)
        return self.jumanpp_obj.query(input_string=input_str)

    def call_juman_interface(self, input_str):
        # type: (text_type) -> MList
        """* What you can do
        - You call Juman tokenizer interface.

        * Output
        - pyknp.MList
        """
        if isinstance(self.jumanpp_obj, Juman):
            ml_token_object = self.jumanpp_obj.analysis(input_str=input_str)
        elif isinstance(self.jumanpp_obj, JumanppHnadler):
            try:
                result_token = self.jumanpp_obj.query(input_string=input_str)
            except ProcessDownException:
                # Unix process is down by any reason.
                logger.warning("Re-starting unix process because it takes longer time than {} seconds...".format(self.jumanpp_obj.timeout_second))
                result_token = self._restart_and_query(input_str)
            except UnicodeDecodeError:
                logger.warning(msg="Process is down by some reason. It restarts process automatically.")
                result_token = self._restart_and_query(input_str)
            ml_token_object = MList(result_token)
        elif isinstance(self.jumanpp_obj, JumanppClient):
            server_response = self.jumanpp_obj.query(sentence=input_str, pattern=self.eos_pattern)
            ml_token_object = MList(server_response)
        else:
            raise Exception('Not defined')
        return ml_token_object

    @on_timeout(limit=60)
    def tokenize(self, sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[text_type], text_type]) -> Union[TokenizedSenetence, List[text_type]]
        """* What you can do
        - It tokenizes a sentence with Juman++ and returns tokenized objects.

        * Parameters
        - sentence: input text.
        - normalize: if True, func_normalizer is applied to the sentence before tokenizing.
        - is_feature, is_surface: flags handed to juman_utils.extract_morphological_information for every morpheme.
        - return_list: if True, it returns the result of TokenizedSenetence.convert_list_object().
          If False, it returns the TokenizedSenetence object.
        - func_normalizer: normalization function. None means "no normalization".
        """
        if normalize and func_normalizer is not None:
            normalized_sentence = func_normalizer(sentence)
        else:
            normalized_sentence = sentence

        ml_token_object = self.call_juman_interface(normalized_sentence)
        token_objects = [
            juman_utils.extract_morphological_information(
                mrph_object=morph_object,
                is_surface=is_surface,
                is_feature=is_feature
            )
            for morph_object in ml_token_object]

        # The original (not normalized) sentence is kept in the result object.
        tokenized_objects = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=token_objects)
        if return_list:
            return tokenized_objects.convert_list_object()
        return tokenized_objects

    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
        # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type]) -> FilteredObject
        """Check argument types and delegate the filtering to parsed_sentence.filter()."""
        assert isinstance(parsed_sentence, TokenizedSenetence)
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))
        return parsed_sentence.filter(pos_condition, stopwords)
================================================
FILE: JapaneseTokenizer/kytea_wrapper/__init__.py
================================================
__author__ = 'kensuke-mi'
from .kytea_wrapper import KyteaWrapper
================================================
FILE: JapaneseTokenizer/kytea_wrapper/kytea_wrapper.py
================================================
# -*- coding: utf-8 -*-
from JapaneseTokenizer.object_models import WrapperBase
from JapaneseTokenizer.common import text_preprocess
from JapaneseTokenizer.datamodels import FilteredObject, TokenizedResult, TokenizedSenetence
from JapaneseTokenizer import init_logger
from typing import List, Tuple, Any, Union, Callable
from six import text_type, string_types
import logging
import sys
import six
# Logger shared by the package; it is set up through the project's init_logger helper.
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
# Version information of the running interpreter.
python_version = sys.version_info
# Mykytea is optional at import time: only a warning is logged when it is missing.
# KyteaWrapper.__init__ references Mykytea, so creating a wrapper still needs the package.
try:
    import Mykytea
except ImportError:
    logger.warning(msg='Mykytea is not ready to use yet. Install first if you would like to use kytea wrapper.')
__author__ = 'kensuke-mi'
class KyteaWrapper(WrapperBase):
    """Wrapper class to call Kytea through the Mykytea binding."""

    def __init__(self,
                 option_string='-deftag UNKNOWN!!'):
        # type: (string_types)->None
        """
        :param option_string: argument string handed to Mykytea.Mykytea as it is
        """
        # option string is argument of Kytea.
        assert isinstance(option_string, string_types)
        self.kytea = Mykytea.Mykytea(option_string)

    def __list_tags(self, t):
        """Convert the result of getTags() into a list of (surface, tags) tuples.

        Every tag candidate is reduced to a 2-tuple made of its first two elements.
        """
        def convert(t2): return (t2[0], t2[1])
        return [(word.surface, [[convert(t2) for t2 in t1] for t1 in word.tag]) for word in t]

    def __check_char_set(self, input_char):
        # type: (text_type) -> text_type
        """Return input_char as unicode text. A Python2 byte-string is decoded as utf-8.

        :raises Exception: when input_char is neither text nor a Python2 str
        """
        if six.PY2 and isinstance(input_char, str):
            return input_char.decode('utf-8')
        elif isinstance(input_char, text_type):
            return input_char
        else:
            raise Exception('nor unicode, str')

    def __extract_morphological_information(self, kytea_tags_tuple, is_feature):
        # type: (Tuple[text_type,List[Any]], bool) -> TokenizedResult
        """This method extracts morphological information from token object.

        :param kytea_tags_tuple: one element of the list made by __list_tags(); (surface, tags)
        :param is_feature: flag stored in the TokenizedResult object
        """
        assert isinstance(kytea_tags_tuple, tuple)
        assert isinstance(is_feature, bool)

        surface = self.__check_char_set(kytea_tags_tuple[0])
        # NOTE: kytea does NOT show word stem. Put blank string instead.
        if six.PY2:
            word_stem = ''.decode('utf-8')
        else:
            word_stem = ''

        # The first tag is read as part-of-speech and the second one as yomi (reading).
        # Only the first candidate of each tag is used.
        pos_tuple = kytea_tags_tuple[1][0]
        pos = self.__check_char_set(pos_tuple[0][0])
        pos_score = float(pos_tuple[0][1])

        yomi_tuple = kytea_tags_tuple[1][1]
        yomi = self.__check_char_set(yomi_tuple[0][0])
        yomi_score = float(yomi_tuple[0][1])

        tuple_pos = (pos, )
        misc_info = {
            'pos_score': pos_score,
            'pos': pos,
            'yomi': yomi,
            'yomi_score': yomi_score
        }

        token_object = TokenizedResult(
            node_obj=None,
            tuple_pos=tuple_pos,
            word_stem=word_stem,
            word_surface=surface,
            is_feature=is_feature,
            # Always the surface form, because the word stem is blank (see NOTE above).
            is_surface=True,
            misc_info=misc_info
        )
        return token_object

    def call_kytea_tokenize_api(self, sentence):
        """Call Mykytea getTagsToString() and return its result, which must be text."""
        result = self.kytea.getTagsToString(sentence)
        assert isinstance(result, text_type)
        return result

    def tokenize(self, sentence,
                 normalize=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=text_preprocess.normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[str],str]) -> Union[List[str], TokenizedSenetence]
        """This method returns tokenized result.
        If return_list==True, this method returns the result of TokenizedSenetence.convert_list_object().
        If return_list==False(default), this method returns TokenizedSenetence object.

        :param sentence: input text (unicode)
        :param normalize: if True, func_normalizer is applied to the sentence before tokenizing
        :param is_feature: flag stored in each TokenizedResult object
        :param is_surface: not used by this wrapper; kytea does not give word stem, so tokens are always surface forms
        :param func_normalizer: normalization function. None means "no normalization".
        """
        assert isinstance(normalize, bool)
        assert isinstance(sentence, text_type)
        if normalize and func_normalizer is not None:
            normalized_sentence = func_normalizer(sentence)
        else:
            normalized_sentence = sentence
        if six.PY2:
            # In Python2 the sentence is given to Mykytea as a utf-8 byte-string.
            normalized_sentence = normalized_sentence.encode('utf-8')

        result = self.__list_tags(self.kytea.getTags(normalized_sentence))

        token_objects = [
            self.__extract_morphological_information(
                kytea_tags_tuple=kytea_tags,
                is_feature=is_feature
            )
            for kytea_tags in result]

        # The original (not normalized) sentence is kept in the result object.
        tokenized_objects = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=token_objects)
        if return_list:
            return tokenized_objects.convert_list_object()
        return tokenized_objects

    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
        # type: (TokenizedSenetence, List[Tuple[text_type,...]], List[text_type]) -> FilteredObject
        """Check argument types and delegate to parsed_sentence.filter() with check_field_name='surface'."""
        assert isinstance(parsed_sentence, TokenizedSenetence)
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))
        return parsed_sentence.filter(pos_condition, stopwords, check_field_name='surface')
================================================
FILE: JapaneseTokenizer/mecab_wrapper/__init__.py
================================================
__author__ = 'kensuke-mi'
from .mecab_wrapper import MecabWrapper
================================================
FILE: JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py
================================================
#! -*- coding: utf-8 -*-
# core module
from JapaneseTokenizer.object_models import WrapperBase
from JapaneseTokenizer.common.text_preprocess import normalize_text
from JapaneseTokenizer import init_logger
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
import MeCab
# else
import sys
import os
import logging
import subprocess
import six
from six import text_type
# typing
from typing import List, Tuple, Union, TypeVar, Callable
ContentsTypes = TypeVar('T')
__author__ = 'kensuke-mi'
# Logger shared by the package; it is set up through the project's init_logger helper.
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
# Version information of the running interpreter.
python_version = sys.version_info
# neologdn is optional. is_neologdn_valid records whether it could be imported;
# MecabWrapper.tokenize() checks the flag before calling neologdn.normalize().
try:
    import neologdn
    is_neologdn_valid = True
except ImportError:
    # Only a missing package is tolerated here; any other error is not hidden.
    logger.warning("neologdn package is not installed yet. You could not call neologd dictionary.")
    is_neologdn_valid = False
class MecabWrapper(WrapperBase):
    """Wrapper class to call MeCab with a system dictionary and, optionally, a user dictionary."""

    def __init__(self,
                 dictType,
                 pathUserDictCsv=None,
                 path_mecab_config=None,
                 path_dictionary=None,
                 string_encoding='utf-8'):
        # type: (text_type, text_type, text_type, text_type, text_type)->None
        """
        :param dictType: a dictionary type called by mecab
        :param pathUserDictCsv: path to your original dictionary file
        :param path_mecab_config: path to 'mecab_config' command. It's automatically detected if not give
        :param path_dictionary: path to a dictionary which you want to use. If not given, it's automatically detected
        :param string_encoding: encoding option to parse command line result. This is mainly used for python2.x
        """
        self.string_encoding = string_encoding
        self._dictType = dictType
        self._pathUserDictCsv = pathUserDictCsv
        self._path_dictionary = path_dictionary

        # Arguments are validated first, so that invalid input is rejected before
        # any external command is executed or a user dictionary is compiled.
        assert dictType in ["neologd", "all", "ipadic", "ipaddic", "user", "", "jumandic", "unidic", None], \
            'Dictionary Type Error. Your dict = {} is NOT available.'.format(dictType)
        if dictType == 'all':
            logger.error('dictionary type "all" is deprecated from version1.6')
            raise Exception('dictionary type "all" is deprecated from version1.6')
        if dictType == 'user':
            logger.error('dictionary type "user" is deprecated from version1.6. You just give path to dictionary csv.')
            raise Exception('dictionary type "user" is deprecated from version1.6. You just give path to dictionary csv.')
        if pathUserDictCsv is not None and isinstance(pathUserDictCsv, text_type) and pathUserDictCsv != '':
            assert os.path.exists(pathUserDictCsv), \
                'Your user dictionary does NOT exist. Path={}'.format(pathUserDictCsv)

        if path_mecab_config is None:
            self._path_mecab_config = self.__get_path_to_mecab_config()
        else:
            self._path_mecab_config = path_mecab_config

        if self._path_dictionary is not None:
            assert os.path.exists(self._path_dictionary), 'Path dictionary is NOT exist.'
            # None tells __CallMecab() to use the dictionary given by the user.
            self._mecab_dictionary_path = None
        else:
            self._mecab_dictionary_path = self.__check_mecab_dict_path()
            logger.info("mecab dictionary path is detected under {}".format(self._mecab_dictionary_path))

        self.mecabObj = self.__CallMecab()

    def __get_path_to_mecab_config(self):
        """You get path into mecab-config

        :return: the directory where 'which' finds the mecab-config command
        """
        command_output = subprocess.check_output(['which', 'mecab-config'])
        if not isinstance(command_output, str):
            # Python3 returns bytes; Python2 returns str, which is used as it is.
            command_output = command_output.decode(self.string_encoding)
        # dirname() drops only the last path element. Replacing the string
        # '/mecab-config' would also damage a directory of the same name.
        path_mecab_config_dir = os.path.dirname(command_output.strip())
        logger.info(msg='mecab-config is detected at {}'.format(path_mecab_config_dir))
        return path_mecab_config_dir

    def __ask_mecab_config(self, option, failure_message):
        # type: (text_type, text_type) -> Tuple[text_type, text_type]
        """Run 'mecab-config <option>' through the shell.

        :return: tuple of (executed command line, command output without newlines)
        :raises subprocess.CalledProcessError: when the command line fails
        """
        command_line = "echo `{} {}`".format(os.path.join(self._path_mecab_config, 'mecab-config'), option)
        try:
            command_output = subprocess.check_output(command_line, shell=True)
        except subprocess.CalledProcessError:
            logger.error("{}".format(command_line))
            raise subprocess.CalledProcessError(returncode=-1, cmd=failure_message)
        if not isinstance(command_output, str):
            # Python3 returns bytes; Python2 returns str, which is used as it is.
            command_output = command_output.decode(self.string_encoding)
        return command_line, command_output.strip('\n')

    def __check_mecab_dict_path(self):
        """check path to dict of Mecab in system environment

        :raises SystemError: when mecab-config prints an empty dictionary path
        """
        mecab_dic_cmd, path_mecab_dict = self.__ask_mecab_config(
            '--dicdir', "Failed to execute mecab-config command")
        if path_mecab_dict == '':
            raise SystemError("""mecab dictionary path is not found with following command: {}
            You are not able to use additional dictionary.
            Still you are able to call mecab default dictionary""".format(mecab_dic_cmd))
        return path_mecab_dict

    def __check_mecab_libexe(self):
        """check path to libexec directory of Mecab, where mecab-dict-index is looked up

        :raises SystemError: when mecab-config prints an empty path
        """
        mecab_libexe_cmd, path_mecab_libexe = self.__ask_mecab_config(
            '--libexecdir', "Failed to execute mecab-config --libexecdir")
        if path_mecab_libexe == '':
            raise SystemError("""Mecab config is not callable with following command: {}
            You are not able to compile your user dictionary.
            Still, you are able to use default mecab dictionary.""".format(mecab_libexe_cmd))
        return path_mecab_libexe

    def __CallMecab(self):
        """Build the option string from the dictionary settings and create MeCab.Tagger.

        :raises subprocess.CalledProcessError: when MeCab.Tagger can not be initialized
        """
        if self._path_dictionary is not None and self._mecab_dictionary_path is None:
            logger.debug('Use dictionary you specified.')
            cmMecabInitialize = '-d {}'.format(self._path_dictionary)
        elif self._dictType == 'neologd':
            # use neologd
            logger.debug('Use neologd additional dictionary')
            cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "mecab-ipadic-neologd"))
        elif self._dictType == 'ipadic' or self._dictType == 'ipaddic':
            # use ipadic
            logger.debug('Use ipadic dictionary')
            cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "ipadic"))
        elif six.PY2 is False and self._dictType == 'jumandic':
            # use jumandic. This is impossible to call in Python2.x
            logger.debug('Use jumandic dictionary')
            cmMecabInitialize = '-d {}'.format(os.path.join(self._mecab_dictionary_path, "jumandic"))
        elif six.PY2 and self._dictType == 'jumandic':
            raise Exception('In python2.x, impossible to call jumandic.')
        else:
            # Every other dictionary type runs without the '-d' option.
            logger.debug('Use no default dictionary')
            cmMecabInitialize = ''

        # execute compile if user dictionary is given
        if self._pathUserDictCsv is not None:
            logger.debug('Use User dictionary')
            pathUserDict = self.__CompileUserdict()
            cmMecabInitialize += ' -u {}'.format(pathUserDict)

        if six.PY2:
            cmMecabCall = "-Ochasen {}".format(cmMecabInitialize)
        else:
            cmMecabCall = "{}".format(cmMecabInitialize)
        logger.debug(msg="mecab initialized with {}".format(cmMecabCall))

        try:
            mecabObj = MeCab.Tagger(cmMecabCall)
        except Exception as e:
            logger.error(e.args)
            logger.error("Possibly Path to userdict is invalid. Check the path")
            raise subprocess.CalledProcessError(returncode=-1, cmd="Failed to initialize Mecab object")
        return mecabObj

    def __CompileUserdict(self):
        """* What you can do
        - It compiles the user dictionary csv with mecab-dict-index.

        * Output
        - path to the compiled dictionary; the csv path with its extension replaced by '.dict'
        """
        path_mecab_dict = self.__check_mecab_dict_path()
        path_mecab_libexe = self.__check_mecab_libexe()
        # Only the file extension is exchanged. Replacing the string "csv" would
        # also rewrite directory names which contain "csv".
        path_compiled_dict = os.path.splitext(self._pathUserDictCsv)[0] + '.dict'

        cmCompileDict = u'{0}/mecab-dict-index -d {1}/ipadic -u {2} -f utf-8 -t utf-8 {3} > /dev/null'.format(
            path_mecab_libexe,
            path_mecab_dict,
            path_compiled_dict,
            self._pathUserDictCsv)
        logger.debug(msg="compiling mecab user dictionary with: {}".format(cmCompileDict))
        try:
            return_code = subprocess.call(cmCompileDict, shell=True)
        except OSError as e:
            logger.error('type:' + str(type(e)))
            logger.error('args:' + str(e.args))
            sys.exit('Failed to compile mecab userdict. System ends')
        if return_code != 0:
            # subprocess.call() does not raise on failure, so the failure is reported here.
            logger.error('mecab-dict-index exited with return code {}'.format(return_code))

        return path_compiled_dict

    def __feature_parser(self, uni_feature, word_surface):
        """
        Parse the POS feature output by Mecab
        :param uni_feature unicode:
        :return ( (pos1, pos2, pos3), word_stem ):
        """
        list_feature_items = uni_feature.split((','))
        # if word has no feature at all
        # NOTE: ('*') is a plain string, not a 1-element tuple. It is kept as it is.
        if len(list_feature_items)==1: return ('*'), ('*')

        pos1 = list_feature_items[0]
        pos2 = list_feature_items[1]
        pos3 = list_feature_items[2]
        tuple_pos = ( pos1, pos2, pos3 )

        # if without constraint(output is normal mecab dictionary like)
        if len(list_feature_items) == 9:
            word_stem = list_feature_items[6]
        # if with constraint(output format depends on Usedict.txt)
        else:
            word_stem = word_surface

        return tuple_pos, word_stem

    def __postprocess_analyzed_result(self, string_mecab_parsed_result, is_feature, is_surface):
        # type: (text_type,bool,bool)->List[TokenizedResult]
        """Extract surface word and feature from analyzed lines.
        Extracted results are returned with list, whose elements are TokenizedResult class
        [TokenizedResult]
        """
        assert isinstance(string_mecab_parsed_result, str)
        # 'EOS' lines and lines without a tab are not token lines.
        tokenized_objects = [
            self.__result_parser(analyzed_line=analyzed_line,
                                 is_feature=is_feature,
                                 is_surface=is_surface)
            for analyzed_line in string_mecab_parsed_result.split('\n')
            if analyzed_line != 'EOS' and '\t' in analyzed_line
        ]

        assert isinstance(tokenized_objects, list)
        return tokenized_objects

    def __result_parser(self, analyzed_line, is_feature, is_surface):
        # type: (text_type,bool,bool)->TokenizedResult
        """Extract surface word and feature from analyzed line.
        Extracted elements are returned with TokenizedResult class
        """
        assert isinstance(analyzed_line, str)
        assert isinstance(is_feature, bool)
        assert isinstance(is_surface, bool)

        # A token line is "<surface>\t<comma separated features>".
        surface, features = analyzed_line.split('\t', 1)
        tuple_pos, word_stem = self.__feature_parser(features, surface)
        tokenized_obj = TokenizedResult(
            node_obj=None,
            analyzed_line=analyzed_line,
            tuple_pos=tuple_pos,
            word_stem=word_stem,
            word_surface=surface,
            is_feature=is_feature,
            is_surface=is_surface
        )
        return tokenized_obj

    def tokenize(self, sentence,
                 normalized=True,
                 is_feature=False,
                 is_surface=False,
                 return_list=False,
                 func_normalizer=normalize_text):
        # type: (text_type, bool, bool, bool, bool, Callable[[str], str])->Union[List[str], TokenizedSenetence]
        """* What you can do
        - Call mecab tokenizer, and return tokenized objects

        * Parameters
        - sentence: input text. A Python2 byte-string is decoded with string_encoding.
        - normalized: NOTE(review) this flag is not read by this method.
          Normalization is controlled by func_normalizer only -- confirm the intended behaviour.
        - is_feature, is_surface: flags stored in each TokenizedResult object.
        - return_list: if True, it returns the result of TokenizedSenetence.convert_list_object().
        - func_normalizer: normalization function. With None, neologdn.normalize() is used
          for dictType 'neologd' and no normalization is done for other dictionary types.
        """
        if six.PY2 and isinstance(sentence, str):
            sentence = sentence.decode(self.string_encoding)

        # decide normalization function depending on dictType
        if func_normalizer is None and self._dictType == 'neologd':
            if not is_neologdn_valid:
                raise Exception("You could not call neologd dictionary bacause you do NOT install the package neologdn.")
            normalized_sentence = neologdn.normalize(sentence)
        elif func_normalizer == normalize_text:
            normalized_sentence = normalize_text(sentence, dictionary_mode=self._dictType)
        elif func_normalizer is None:
            normalized_sentence = sentence
        else:
            normalized_sentence = func_normalizer(sentence)

        # don't delete this variable. The variable "encoded_text" protects sentence from deleting
        if six.PY2:
            encoded_text = normalized_sentence.encode(self.string_encoding)
        else:
            encoded_text = normalized_sentence

        if six.PY2:
            # Python2 walks through the node list of MeCab.
            tokenized_objects = []
            node = self.mecabObj.parseToNode(encoded_text)
            # The first node is skipped and the loop stops at the last node.
            node = node.next
            while node.next is not None:
                word_surface = node.surface.decode(self.string_encoding)
                tuple_pos, word_stem = self.__feature_parser(node.feature.decode(self.string_encoding), word_surface)
                tokenized_obj = TokenizedResult(
                    node_obj=node,
                    tuple_pos=tuple_pos,
                    word_stem=word_stem,
                    word_surface=word_surface,
                    is_feature=is_feature,
                    is_surface=is_surface
                )
                tokenized_objects.append(tokenized_obj)
                node = node.next
        else:
            # Python3 parses the text output of MeCab.
            parsed_result = self.mecabObj.parse(encoded_text)
            tokenized_objects = self.__postprocess_analyzed_result(
                string_mecab_parsed_result=parsed_result,
                is_feature=is_feature,
                is_surface=is_surface
            )

        tokenized_sentence = TokenizedSenetence(
            sentence=sentence,
            tokenized_objects=tokenized_objects
        )  # type: TokenizedSenetence

        if return_list:
            return tokenized_sentence.convert_list_object()
        return tokenized_sentence

    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
        # type: (TokenizedSenetence, List[Tuple[str,...]], List[str]) -> FilteredObject
        """Check argument types and delegate the filtering to parsed_sentence.filter()."""
        assert isinstance(parsed_sentence, TokenizedSenetence)
        assert isinstance(pos_condition, (type(None), list))
        assert isinstance(stopwords, (type(None), list))
        return parsed_sentence.filter(pos_condition, stopwords)
================================================
FILE: JapaneseTokenizer/object_models.py
================================================
#! -*- coding: utf-8 -*-
from typing import Callable
from six import text_type
class WrapperBase(object):
    """Base class which defines the common interface of tokenizer wrappers."""

    def tokenize(self,
                 sentence,
                 normalize,
                 is_feature,
                 is_surface,
                 return_list,
                 func_normalizer=None):
        # type: (text_type, bool, bool, bool, bool, Callable[[text_type], text_type])->None
        """* What you can do
        - Subclasses override this method to tokenize a sentence.

        :raises NotImplementedError: always, in this base class
        """
        # NotImplemented is a constant for rich comparisons and can not be raised;
        # NotImplementedError is the exception for a method which is not overridden.
        raise NotImplementedError

    def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
        """Subclasses override this method to filter a tokenized sentence.

        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError
================================================
FILE: LICENSE.txt
================================================
Copyright 2017 Kensuke Mitsuzawa
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
================================================
FILE: MANIFEST.in
================================================
include README.md
include README_JP.md
include examples
include test
include install_tokenizers.sh
include LICENSE.txt
include Makefile
================================================
FILE: Makefile
================================================
install:
bash install_tokenizers.sh
install_neologd:
## mecab-neologdのインストールを実行
wget --no-check-certificate https://github.com/neologd/mecab-ipadic-neologd/tarball/master -O mecab-ipadic-neologd.tar
tar -xvf mecab-ipadic-neologd.tar
mv neologd-mecab-ipadic-neologd-* neologd-mecab-ipadic-neologd && cd neologd-mecab-ipadic-neologd && ( echo yes | ./bin/install-mecab-ipadic-neologd )
================================================
FILE: README.md
================================================
[](LICENSE)[](https://travis-ci.org/Kensuke-Mitsuzawa/JapaneseTokenizers)
# What's this?
This is a simple Python wrapper for Japanese tokenizers (a.k.a. Tokenizer).
This project aims to make calling tokenizers and splitting a sentence into tokens as easy as possible.
It also provides a common interface across various tokenization tools, so it is easy to compare the output of different tokenizers.
This project is available also in [Github](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers).
If you find any bugs, please report them via GitHub issues. Pull requests are also welcome!
# Requirements
- Python 2.7
- Python 3.x
- checked in 3.5, 3.6, 3.7
# Features
* simple/common interface among various tokenizers
* simple/common interface for filtering with stopwords or Part-of-Speech condition
* simple interface to add user-dictionary(mecab only)
## Supported Tokenizers
### Mecab
[Mecab](http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html?sess=3f6a4f9896295ef2480fa2482de521f6) is an open-source tokenizer system for various languages (provided you have a dictionary for the language).
See [english documentation](https://github.com/jordwest/mecab-docs-en) for detail
### Juman
[Juman](http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?JUMAN) is a tokenizer system developed by Kurohashi laboratory, Kyoto University, Japan.
Juman is robust against ambiguous writing styles in Japanese, and handles newly coined words well thanks to its huge Web-based dictionary.
And, Juman tells you semantic meaning of words.
### Juman++
[Juman++](http://nlp.ist.i.kyoto-u.ac.jp/EN/index.php?JUMAN++) is a tokenizer system developed by Kurohashi laboratory, Kyoto University, Japan.
Juman++ is succeeding system of Juman. It adopts RNN model for tokenization.
Juman++ is robust against ambiguous writing styles in Japanese, and handles newly coined words well thanks to its huge Web-based dictionary.
And, Juman tells you semantic meaning of words.
Note: New Juman++ dev-version(later than 2.x) is available at [Github](https://github.com/ku-nlp/jumanpp)
### Kytea
[Kytea](http://www.phontron.com/kytea/) is a tokenizer tool developed by Graham Neubig.
Kytea has a different algorithm from one of Mecab or Juman.
# Setting up
## Tokenizers auto-install
```
make install
```
### mecab-neologd dictionary auto-install
```
make install_neologd
```
## Tokenizers manual-install
### MeCab
See [here](https://github.com/jordwest/mecab-docs-en) to install MeCab system.
### Mecab Neologd dictionary
Mecab-neologd dictionary is a dictionary-extension based on ipadic-dictionary, which is basic dictionary of Mecab.
With the Mecab-neologd dictionary, newly coined words are parsed as a single token.
Here, newly coined words are things such as movie actor names or company names.
See [here](https://github.com/neologd/mecab-ipadic-neologd) and install mecab-neologd dictionary.
### Juman
```
wget -O juman7.0.1.tar.bz2 "http://nlp.ist.i.kyoto-u.ac.jp/DLcounter/lime.cgi?down=http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2&name=juman-7.01.tar.bz2"
bzip2 -dc juman7.0.1.tar.bz2 | tar xvf -
cd juman-7.01
./configure
make
[sudo] make install
```
## Juman++
* GCC version must be >= 5
```
wget http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz
tar xJvf jumanpp-1.02.tar.xz
cd jumanpp-1.02/
./configure
make
[sudo] make install
```
## Kytea
Install Kytea system
```
wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz
tar -xvf kytea-0.4.7.tar.gz
cd kytea-0.4.7
./configure
make
make install
```
Kytea has [python wrapper](https://github.com/chezou/Mykytea-python) thanks to michiaki ariga.
Install Kytea-python wrapper
```
pip install kytea
```
## install
```
[sudo] python setup.py install
```
### Note
During installation, you will see a warning message if `pyknp` or `kytea` fails to install.
If you see these messages, try re-installing these packages manually.
# Usage
Tokenization example (for Python 3.x; for example code for Python 2.x, please see [here](https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers/blob/master/examples/examples.py))
```
import JapaneseTokenizer
input_sentence = '10日放送の「中居正広のミになる図書館」(テレビ朝日系)で、SMAPの中居正広が、篠原信一の過去の勘違いを明かす一幕があった。'
# ipadic is well-maintained dictionary #
mecab_wrapper = JapaneseTokenizer.MecabWrapper(dictType='ipadic')
print(mecab_wrapper.tokenize(input_sentence).convert_list_object())
# neologd is automatically-generated dictionary from huge web-corpus #
mecab_neologd_wrapper = JapaneseTokenizer.MecabWrapper(dictType='neologd')
print(mecab_neologd_wrapper.tokenize(input_sentence).convert_list_object())
```
## Filtering example
```
import JapaneseTokenizer
# with word filtering by stopword & part-of-speech condition #
print(mecab_wrapper.tokenize(input_sentence).filter(stopwords=['テレビ朝日'], pos_condition=[('名詞', '固有名詞')]).convert_list_object())
```
## Part-of-speech structure
Mecab, Juman, Kytea have different system of Part-of-Speech(POS).
You can check tables of Part-of-Speech(POS) [here](http://www.unixuser.org/~euske/doc/postag/)
# Similar Package
## natto-py
natto-py is sophisticated package for tokenization. It supports following features
* easy interface for tokenization
* importing additional dictionary
* partial parsing mode
# LICENSE
MIT license
# For developers
You could build an environment which has dependencies to test this package.
Simply, you build docker image and run docker container.
## Dev environment
Develop environment is defined with `test/docker-compose-dev.yml`.
With the docker-compose.yml file, you could call python2.7 or python3.7
If you're using Pycharm Professional edition, you could set docker-compose.yml as remote interpreter.
To call python2.7, set `/opt/conda/envs/p27/bin/python2.7`
To call python3.7, set `/opt/conda/envs/p37/bin/python3.7`
## Test environment
These commands run the whole procedure, from installing the package through to running its tests.
```bash
$ docker-compose build
$ docker-compose up
```
================================================
FILE: examples/examples.py
================================================
#! -*- coding: utf-8 -*-
import sys
import os
from JapaneseTokenizer import JumanWrapper
from JapaneseTokenizer import JumanppWrapper
from JapaneseTokenizer import MecabWrapper
from JapaneseTokenizer import KyteaWrapper
from JapaneseTokenizer.datamodels import TokenizedResult
from JapaneseTokenizer import init_logger
import logging
import socket
import six
# Shared package logger, set up through JapaneseTokenizer.init_logger.
logger = init_logger.init_logger(logging.getLogger(init_logger.LOGGER_NAME))
__author__ = 'kensuke-mi'
# Lower the logger level to DEBUG so the logger.debug() calls in the examples
# below are not filtered out by the logger itself.
logger.setLevel(logging.DEBUG)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# for python2.x
def basic_example():
    """Tokenize one sentence with MeCab, Juman and Juman++ and log each token list."""
    # ========================================================
    # TOKENIZE
    # ========================================================
    if not (six.PY2 or six.PY3):
        raise Exception()
    # The `u` prefix gives a `unicode` object on Python 2 and a plain `str` on
    # Python 3, so one literal serves both interpreters.
    sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'

    # MecabWrapper accepts dictType "neologd", "all", "ipadic", "user", "" or None;
    # "ipadic" and "" are equivalent.
    # KyTea is left disabled in this example.
    wrappers = [
        MecabWrapper(dictType="neologd"),
        JumanWrapper(),
        JumanppWrapper(),
    ]

    # Tokenize the sentence into a list of tokens.
    # With is_feature=True each item becomes (token, (part-of-speech-tags)).
    # With is_surface=True the surface form is returned instead of the normalized token.
    token_lists = [
        wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False).convert_list_object()
        for wrapper in wrappers
    ]

    for token_list in token_lists:
        logger.debug(token_list)
def filtering_example():
    """Tokenize one sentence and filter the tokens by stopwords and POS conditions."""
    if not (six.PY2 or six.PY3):
        raise Exception()
    # `u` literals are `unicode` on Python 2 and `str` on Python 3, so the same
    # values serve both interpreters.
    sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
    stopwords = [u'テヘラン']
    pos_condition_ipadic = [(u'名詞', u'固有名詞'), (u'名詞', u'一般')]
    pos_condition_juman = [(u'名詞', u'固有名詞'), (u'名詞', u'普通名詞')]
    # Only needed when the (currently disabled) KyTea part of the example is enabled.
    pos_condition_kytea = [(u'名詞',)]

    # ========================================================
    # FILTERING
    # ========================================================
    # Tokens can be filtered by stopwords or POS conditions; stopwords is a list object.
    # Juman and Juman++ share the same POS condition.
    wrappers_with_condition = [
        (MecabWrapper(dictType="neologd"), pos_condition_ipadic),
        (JumanWrapper(), pos_condition_juman),
        (JumanppWrapper(), pos_condition_juman),
    ]

    filtered_token_lists = []
    for wrapper, pos_condition in wrappers_with_condition:
        tokenized = wrapper.tokenize(sentence=sentence, is_feature=False, is_surface=False)
        filtered = tokenized.filter(pos_condition=pos_condition, stopwords=stopwords)
        filtered_token_lists.append(filtered.convert_list_object())

    for token_list in filtered_token_lists:
        logger.debug(token_list)
def advanced_example_mecab():
    """Tokenize with a MeCab user dictionary and log the attributes of each token.

    The user dictionary is the ``userdict.csv`` stored next to this script; it
    is located through ``__file__`` so the example does not depend on the
    directory it is launched from.
    """
    if six.PY2:
        sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
    elif six.PY3:
        sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
    else:
        raise Exception()
    # ========================================================
    # USE YOUR OWN DICTIONARY
    # with your own dictionary, you can force Mecab to make some word into one token
    # ========================================================
    # make your own "user dictionary" with CSV file
    # To know more about this file, see the MeCab dictionary documentation (Japanese only).
    example_user_dict = os.path.join(os.path.dirname(os.path.abspath(__file__)), "userdict.csv")

    # set dictType='user' or dictType='all' and set pathUserDictCsv
    tokenized_obj = MecabWrapper(dictType='user', pathUserDictCsv=example_user_dict).tokenize(sentence)
    for token_obj in tokenized_obj.tokenized_objects:
        assert isinstance(token_obj, TokenizedResult)
        # A `u` literal compares correctly on both Python 2 and Python 3, so a
        # single comparison replaces the duplicated PY2/PY3 branches.
        if token_obj.word_stem == u'ペルシア語':
            logger.debug(token_obj.word_stem)

        ## TokenizedResult class has attributes of tokenized result ##
        # Log them instead of evaluating and discarding the expressions; doing it
        # inside the loop also guarantees `token_obj` is bound.
        logger.debug(token_obj.analyzed_line)
        logger.debug(token_obj.word_surface)
        logger.debug(token_obj.word_stem)
        logger.debug(token_obj.tuple_pos)
def advanced_example_juman():
    """Tokenize through a JUMAN server and keep only nouns.

    JUMAN must already be running in server mode on localhost:32000. The
    function first probes that port; when the connection cannot be opened it
    logs a message and returns without tokenizing.
    """
    if six.PY2:
        sentence = u'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
        pos_condition = [(u'名詞',)]
    elif six.PY3:
        sentence = 'テヘランは、西アジア、イランの首都でありかつテヘラン州の州都。人口12,223,598人。都市圏人口は13,413,348人に達する。'
        pos_condition = [('名詞',)]
    else:
        raise Exception()

    ### You can call juman with server mode. You must start JUMAN as server mode beforehand ###
    HOST = 'localhost'
    PORT = 32000

    # Probe the server. Only connection errors mean "not running"; the probe
    # socket is closed whether or not the connection succeeded.
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.connect((HOST, PORT))
    except socket.error:
        logger.info(msg='Juman server is not running. Skip it.')
        return
    finally:
        probe.close()

    # Errors raised from here on are real tokenizer failures, so they are no
    # longer swallowed and misreported as a missing server.
    juman_wrapper = JumanWrapper(server=HOST, port=PORT)
    tokens_list = juman_wrapper.tokenize(sentence, return_list=False).filter(pos_condition=pos_condition).convert_list_object()
    assert isinstance(tokens_list, list)
if __name__ == "__main__":
    # Run every example in the order in which it is defined above.
    for run_example in (
        basic_example,
        filtering_example,
        advanced_example_mecab,
        advanced_example_juman,
    ):
        run_example()
================================================
FILE: examples/userdict.csv
================================================
ペルシア語,-1,-1,-400,名詞,一般,*,*,*,*,ぺるしあご,*,*,*
================================================
FILE: install_tokenizers.sh
================================================
#!/bin/bash
# Install the tokenizers used by JapaneseTokenizer (MeCab + IPA dictionary,
# JUMAN, JUMAN++ and KyTea) from source.
#
# A tool is built only when running it fails with exit status 127
# ("command not found"). Every build runs in a subshell so the working
# directory of this script never changes; the downloaded archives and the
# extracted source trees are removed from the starting directory at the end.

os_type=`uname`
echo "os-type is "$os_type

# NOTE: juman_utils_bin is only assigned here; nothing below reads it.
if [ "$os_type" = "Darwin" ]; then
    # macOS
    juman_utils_bin="/usr/local/opt/juman/libexec/juman/"
    if [ ! -e "${juman_utils_bin}" ]; then
        juman_utils_bin="/usr/local/libexec/juman/"
    fi
elif [ "$os_type" = "Linux" ]; then
    # Linux
    juman_utils_bin="/usr/local/libexec/juman/"
else
    echo "Your platform ($(uname -a)) is not supported."
    exit 1
fi

WORK_DIR=`pwd`

echo 'これはテスト' | mecab
is_mecab_install=$?
if [ $is_mecab_install -eq 127 ]; then
    ## mecab
    wget -O mecab-0.996.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE"
    tar zxvf mecab-0.996.tar.gz
    (cd mecab-0.996 && ./configure && make && make install)
    ### run ldconfig after installing mecab
    ldconfig
    ## mecab ipadic
    wget -O mecab-ipadic-2.7.0-20070801.tar.gz "https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM"
    tar zxvf mecab-ipadic-2.7.0-20070801.tar.gz
    (cd mecab-ipadic-2.7.0-20070801 && ./configure --with-charset=utf8 && make && make install)
    # smoke test
    echo 'インストール後のテスト' | mecab
fi

echo 'これはテスト' | juman
is_juman_install=$?
if [ $is_juman_install -eq 127 ]; then
    ## juman
    wget -O juman7.0.1.tar.bz2 "http://nlp.ist.i.kyoto-u.ac.jp/DLcounter/lime.cgi?down=http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2&name=juman-7.01.tar.bz2"
    bzip2 -dc juman7.0.1.tar.bz2 | tar xvf -
    (cd juman-7.01 && ./configure && make && make install)
    # ldconfig after install
    ldconfig
    # smoke test
    echo 'インストール後のテスト' | juman
fi

echo 'これはテスト' | jumanpp
is_jumanpp_install=$?
if [ $is_jumanpp_install -eq 127 ]; then
    # jumanpp
    wget http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.01.tar.xz
    tar xJvf jumanpp-1.01.tar.xz
    (cd jumanpp-1.01/ && ./configure && make && make install)
    # todo: run the jumanpp server start-up script
    # ldconfig after install
    ldconfig
    # smoke test
    echo 'インストール後のテスト' | jumanpp
fi

echo 'これはテスト' | kytea
is_kytea_install=$?
if [ $is_kytea_install -eq 127 ]; then
    # kytea
    wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz -O kytea-0.4.7.tar.gz
    tar -xvf kytea-0.4.7.tar.gz
    (cd kytea-0.4.7 && ./configure && make && make install)
    # ldconfig after install
    ldconfig
    # smoke test
    echo 'インストール後のテスト' | kytea
fi

# ---- clean up ----
# Safety net: make sure the cleanup runs where the downloads were made.
cd "$WORK_DIR"

# `rm -f` / `rm -rf` ignore patterns that match nothing, so no existence test is
# needed. (A `[ -f glob ]` test breaks as soon as the glob matches two files,
# e.g. both the mecab and the mecab-ipadic archives.)
# downloaded archives: juman, mecab + mecab-ipadic, jumanpp, kytea
rm -f juman7.0.1.tar.bz2 mecab-*.tar.gz jumanpp-1.01.tar.xz kytea-0.4.7.tar.gz kytea-0.4.7.tar
# extracted source trees: juman, mecab, mecab-ipadic, jumanpp, kytea
rm -rf juman-7* mecab-0* mecab-ipadic-* jumanpp-1.01 kytea-0.4.7
================================================
FILE: setup.py
================================================
#! -*- coding: utf-8 -*-
"""Packaging script for JapaneseTokenizer.

Before calling ``setup()`` the script makes a best-effort attempt to
pip-install ``kytea`` and ``neologdn`` when they cannot be imported; a failed
attempt is logged and does not abort the installation.
"""
from setuptools import setup, find_packages
import sys
import logging
import codecs

logger = logging.getLogger(__file__)
python_version = sys.version_info

# --------------------------------------------------------------------------------------------------------
# try to install kytea automatically because it usually causes to error during installing
try:
    import Mykytea
except ImportError:
    try:
        import subprocess
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'kytea'])
        import Mykytea
    except Exception as e:
        logger.error('We failed to install mykytea automatically. Try installing kytea manually.')
        logger.error(e)

# --------------------------------------------------------------------------------------------------------
try:
    import neologdn
except ImportError:
    try:
        import subprocess
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'neologdn'])
        import neologdn
    except Exception as e:
        # The package that failed here is neologdn (the message used to name pyknp).
        logger.error('We failed to install neologdn automatically because of some issues in the package. Try installing neologdn manually.')
        logger.error(e)

# --------------------------------------------------------------------------------------------------------
common_packages = ['pypandoc', 'future', 'six', 'jaconv>=0.2', 'pip>=8.1.0', 'pexpect', 'pyknp>=0.4.1']
if python_version >= (3, 0, 0):
    # `typing` is part of the standard library from Python 3.5 on; older 3.x
    # releases need the backport.
    if python_version < (3, 5, 0):
        common_packages.append('typing')
    # The MeCab binding is required on every Python 3 release, not only on > 3.5.
    common_packages.append('mecab-python3')
elif python_version <= (2, 9, 9):
    common_packages.append('typing')
    common_packages.append('mecab-python')
else:
    raise NotImplementedError()

version = '1.6'
name = 'JapaneseTokenizer'
short_description = '`JapaneseTokenizer` is a package for easy Japanese Tokenization'

try:
    import pypandoc
    # Newer pypandoc releases expose `convert_file`; fall back to the legacy
    # `convert` entry point when it is absent.
    _convert_readme = getattr(pypandoc, 'convert_file', None) or pypandoc.convert
    long_description = _convert_readme('README.md', 'rst')
except (IOError, OSError, ImportError, AttributeError, RuntimeError):
    # pypandoc (or the pandoc binary) is unavailable: ship the raw markdown.
    with codecs.open('README.md', 'r', 'utf-8') as readme_file:
        long_description = readme_file.read()

classifiers = [
    "Development Status :: 5 - Production/Stable",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python",
    "Natural Language :: Japanese",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Programming Language :: Python :: 2.7",
    "Programming Language :: Python :: 3.5"
]

setup(
    author='Kensuke Mitsuzawa',
    author_email='kensuke.mit@gmail.com',
    name=name,
    version=version,
    # `description` is the keyword setup() understands; `short_description` is
    # not a recognized option.
    description=short_description,
    long_description=long_description,
    keywords=['MeCab', '和布蕪', 'Juman',
              'Japanese morphological analyzer', 'NLP', '形態素解析', '自然言語処理'],
    license="MIT",
    url="https://github.com/Kensuke-Mitsuzawa/JapaneseTokenizers",
    test_suite='test.test_all.suite',
    install_requires=common_packages,
    tests_require=common_packages,
    packages=find_packages()
)
================================================
FILE: test/Dockerfile
================================================
# Test image: Alpine 3.6 with glibc; the steps below add MeCab, JUMAN, JUMAN++, KyTea and Miniconda.
FROM frolvlad/alpine-glibc:alpine-3.6
MAINTAINER kensuke-mi <kensuke.mit@gmail.com>
# Mecab install
ENV MECAB_VERSION 0.996
ENV IPADIC_VERSION 2.7.0-20070801
# Download locations of the MeCab source archive and the IPA dictionary archive.
ENV mecab_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE
ENV ipadic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM
# Package lists handed to `apk add` in the MeCab build step.
ENV build_deps 'curl git bash file sudo openssh gcc make build-base'
ENV dependencies 'openssl'
# Put the conda binaries first on PATH and name the Miniconda installer fetched in the Python step.
ENV PATH=/opt/conda/bin:$PATH \
LANG=C.UTF-8 \
MINICONDA=Miniconda3-latest-Linux-x86_64.sh
# apk update
RUN apk update
# mecab
# Everything is downloaded and built under /tmp. A bare `cd` jumps to $HOME, so
# unless the build happened to start there the final `rm -rf` would run in a
# different directory from the one holding the MeCab archive and source tree.
RUN apk add --update --no-cache ${build_deps} \
    # Install dependencies
    && apk add --update --no-cache ${dependencies} \
    # Install MeCab
    && cd /tmp \
    && curl -SL -o mecab-${MECAB_VERSION}.tar.gz ${mecab_url} \
    && tar zxf mecab-${MECAB_VERSION}.tar.gz \
    && cd mecab-${MECAB_VERSION} \
    && ./configure --enable-utf8-only --with-charset=utf8 \
    && make \
    && make install \
    && cd /tmp \
    # Install IPA dic
    && curl -SL -o mecab-ipadic-${IPADIC_VERSION}.tar.gz ${ipadic_url} \
    && tar zxf mecab-ipadic-${IPADIC_VERSION}.tar.gz \
    && cd mecab-ipadic-${IPADIC_VERSION} \
    && ./configure --with-charset=utf8 \
    && make \
    && make install \
    && cd /tmp \
    # Install Neologd
    && git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git \
    && mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y \
    # Remove archives and source trees. The IPA dictionary files are named
    # mecab-ipadic-<version>, so the pattern must include the "ipadic-" part.
    && rm -rf \
        mecab-${MECAB_VERSION}* \
        mecab-ipadic-${IPADIC_VERSION}* \
        mecab-ipadic-neologd
# general
# Command-line tools plus the compilers and headers used by the later build steps.
RUN apk --no-cache add vim \
wget \
lsof \
curl \
bash \
swig \
gcc \
build-base \
make \
python-dev \
py-pip \
jpeg-dev \
zlib-dev \
git \
linux-headers
ENV LIBRARY_PATH=/lib:/usr/lib
# Versions and download URLs for PlantUML and Pandoc; Pandoc is installed under PANDOC_ROOT.
ENV PLANTUML_VERSION 1.2017.18
ENV PLANTUML_DOWNLOAD_URL https://sourceforge.net/projects/plantuml/files/plantuml.$PLANTUML_VERSION.jar/download
ENV PANDOC_VERSION 1.19.2.4
ENV PANDOC_DOWNLOAD_URL https://hackage.haskell.org/package/pandoc-$PANDOC_VERSION/pandoc-$PANDOC_VERSION.tar.gz
ENV PANDOC_ROOT /usr/local/pandoc
ENV PATH $PATH:$PANDOC_ROOT/bin
# Create Pandoc build space
RUN mkdir -p /pandoc-build
WORKDIR /pandoc-build
# Install/Build Packages
# NOTE(review): $BUILD_DEPS, $PERSISTENT_DEPS and $EDGE_DEPS are not defined anywhere in this
# Dockerfile (only the lower-case build_deps / dependencies are) - confirm where they should come from.
# NOTE(review): because of the `;` after `set -x`, a failure in the preceding `&&` chain does not
# fail this RUN step; its exit status comes from the final `adduser ... && apk del` list.
RUN apk upgrade --update && \
apk add --no-cache --virtual .build-deps $BUILD_DEPS && \
apk add --no-cache --virtual .persistent-deps $PERSISTENT_DEPS && \
curl -fsSL "$PLANTUML_DOWNLOAD_URL" -o /usr/local/plantuml.jar && \
apk add --no-cache --virtual .edge-deps $EDGE_DEPS -X http://dl-cdn.alpinelinux.org/alpine/edge/community && \
curl -fsSL "$PANDOC_DOWNLOAD_URL" | tar -xzf - && \
( cd pandoc-$PANDOC_VERSION && cabal update && cabal install --only-dependencies && \
cabal configure --prefix=$PANDOC_ROOT && \
cabal build && \
cabal copy && \
cd .. ) && \
rm -Rf pandoc-$PANDOC_VERSION/ && \
rm -Rf /root/.cabal/ /root/.ghc/ && \
rmdir /pandoc-build && \
set -x; \
addgroup -g 82 -S www-data; \
adduser -u 82 -D -S -G www-data www-data && \
mkdir -p /var/docs && \
apk del .build-deps .edge-deps
# Juman
# Build JUMAN 7.01 from source, then delete the source tree and the archive.
RUN wget http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2 \
&& tar xvf juman-7.01.tar.bz2 \
&& cd juman-7.01 \
&& ./configure \
&& make \
&& make install \
&& cd .. \
&& rm -rf juman-7.01 \
&& rm juman-7.01.tar.bz2
# Juman++
# Build JUMAN++ 1.02 with temporary build packages, remove them afterwards and add the boost package.
RUN apk add --update --no-cache --virtual=build-deps \
boost-dev g++ make \
&& wget -q http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz \
&& tar Jxfv jumanpp-1.02.tar.xz \
&& cd jumanpp-1.02/ \
&& ./configure \
&& make \
&& make install \
&& cd .. \
&& rm jumanpp-1.02.tar.xz \
&& rm -rf /var/cache/* \
&& apk del build-deps \
&& apk add --update --no-cache boost
# kytea
# Build KyTea 0.4.7 from source (the archive and source tree are left in place).
RUN wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz \
&& tar -xvf kytea-0.4.7.tar.gz \
&& cd kytea-0.4.7 \
&& ./configure \
&& make \
&& make install
# Python
# Install Miniconda into /opt/conda and link its executables into /usr/local/bin.
# NOTE(review): --no-check-certificate disables TLS certificate verification for the installer download.
RUN apk add --no-cache bash wget && \
wget -q --no-check-certificate https://repo.continuum.io/miniconda/$MINICONDA && \
bash $MINICONDA -b -p /opt/conda && \
ln -s /opt/conda/bin/* /usr/local/bin/ && \
rm -rf /root/.[acpw]* $MINICONDA /opt/conda/pkgs/*
RUN conda config --add channels conda-forge --system
# One conda environment per Python version; the compose commands activate p27 / p37 by name.
RUN conda create -y -n p27 python=2.7
RUN conda create -y -n p36 python=3.6
RUN conda create -y -n p37 python=3.7
#RUN source activate p27
#RUN source deactivate
CMD ["/bin/bash"]
================================================
FILE: test/Dockerfile-dev
================================================
# Development image: Alpine 3.6 with glibc; the steps below add MeCab, JUMAN, JUMAN++, KyTea and Miniconda.
FROM frolvlad/alpine-glibc:alpine-3.6
MAINTAINER kensuke-mi <kensuke.mit@gmail.com>
# Mecab install
ENV MECAB_VERSION 0.996
ENV IPADIC_VERSION 2.7.0-20070801
# Download locations of the MeCab source archive and the dictionary archives.
# NOTE(review): unidic_url is not referenced by any instruction in this Dockerfile.
ENV mecab_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE
ENV ipadic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM
ENV jumandic_url https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM
ENV unidic_url https://unidic.ninjal.ac.jp/unidic_archive/cwj/2.3.0/unidic-cwj-2.3.0.zip
# Package lists handed to `apk add` in the MeCab build step.
ENV build_deps 'curl git bash file sudo openssh gcc make build-base'
ENV dependencies 'openssl'
# Put the conda binaries first on PATH and name the Miniconda installer fetched in the Python step.
ENV PATH=/opt/conda/bin:$PATH \
LANG=C.UTF-8 \
MINICONDA=Miniconda3-latest-Linux-x86_64.sh
# apk update
RUN apk update
# mecab
# Everything is downloaded and built under /tmp. A bare `cd` jumps to $HOME, so
# unless the build happened to start there the final `rm -rf` would run in a
# different directory from the one holding the MeCab archive and source tree.
RUN apk add --update --no-cache ${build_deps} \
    # Install dependencies
    && apk add --update --no-cache ${dependencies} \
    # Install MeCab
    && cd /tmp \
    && curl -SL -o mecab-${MECAB_VERSION}.tar.gz ${mecab_url} \
    && tar zxf mecab-${MECAB_VERSION}.tar.gz \
    && cd mecab-${MECAB_VERSION} \
    && ./configure --enable-utf8-only --with-charset=utf8 \
    && make \
    && make install \
    && cd /tmp \
    # Install IPA dic
    && curl -SL -o mecab-ipadic-${IPADIC_VERSION}.tar.gz ${ipadic_url} \
    && tar zxf mecab-ipadic-${IPADIC_VERSION}.tar.gz \
    && cd mecab-ipadic-${IPADIC_VERSION} \
    && ./configure --with-charset=utf8 \
    && make \
    && make install \
    && cd /tmp \
    # Install Neologd
    && git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git \
    && mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n -y \
    # Install jumandic
    && curl -SL -o jumandic.tar.gz ${jumandic_url} \
    && tar zxf jumandic.tar.gz \
    && cd mecab-jumandic-7.0-20130310 \
    && ./configure --with-charset=utf8 \
    && make \
    && make install \
    # delete archives and source trees
    # The IPA dictionary files are named mecab-ipadic-<version>, so the pattern
    # must include the "ipadic-" part; the jumandic archive is removed as well.
    && cd /tmp \
    && rm -rf \
        mecab-${MECAB_VERSION}* \
        mecab-ipadic-${IPADIC_VERSION}* \
        mecab-ipadic-neologd \
        mecab-jumandic-7.0-20130310 \
        jumandic.tar.gz
# general
# Command-line tools plus the compilers and headers used by the later build steps.
RUN apk --no-cache add vim \
wget \
lsof \
curl \
bash \
swig \
gcc \
build-base \
make \
python-dev \
py-pip \
jpeg-dev \
zlib-dev \
git \
linux-headers
ENV LIBRARY_PATH=/lib:/usr/lib
# Versions and download URLs for PlantUML and Pandoc; Pandoc is installed under PANDOC_ROOT.
ENV PLANTUML_VERSION 1.2017.18
ENV PLANTUML_DOWNLOAD_URL https://sourceforge.net/projects/plantuml/files/plantuml.$PLANTUML_VERSION.jar/download
ENV PANDOC_VERSION 1.19.2.4
ENV PANDOC_DOWNLOAD_URL https://hackage.haskell.org/package/pandoc-$PANDOC_VERSION/pandoc-$PANDOC_VERSION.tar.gz
ENV PANDOC_ROOT /usr/local/pandoc
ENV PATH $PATH:$PANDOC_ROOT/bin
# Create Pandoc build space
RUN mkdir -p /pandoc-build
WORKDIR /pandoc-build
# Install/Build Packages
# NOTE(review): $BUILD_DEPS, $PERSISTENT_DEPS and $EDGE_DEPS are not defined anywhere in this
# Dockerfile (only the lower-case build_deps / dependencies are) - confirm where they should come from.
# NOTE(review): because of the `;` after `set -x`, a failure in the preceding `&&` chain does not
# fail this RUN step; its exit status comes from the final `adduser ... && apk del` list.
RUN apk upgrade --update && \
apk add --no-cache --virtual .build-deps $BUILD_DEPS && \
apk add --no-cache --virtual .persistent-deps $PERSISTENT_DEPS && \
curl -fsSL "$PLANTUML_DOWNLOAD_URL" -o /usr/local/plantuml.jar && \
apk add --no-cache --virtual .edge-deps $EDGE_DEPS -X http://dl-cdn.alpinelinux.org/alpine/edge/community && \
curl -fsSL "$PANDOC_DOWNLOAD_URL" | tar -xzf - && \
( cd pandoc-$PANDOC_VERSION && cabal update && cabal install --only-dependencies && \
cabal configure --prefix=$PANDOC_ROOT && \
cabal build && \
cabal copy && \
cd .. ) && \
rm -Rf pandoc-$PANDOC_VERSION/ && \
rm -Rf /root/.cabal/ /root/.ghc/ && \
rmdir /pandoc-build && \
set -x; \
addgroup -g 82 -S www-data; \
adduser -u 82 -D -S -G www-data www-data && \
mkdir -p /var/docs && \
apk del .build-deps .edge-deps
# Juman
# Build JUMAN 7.01 from source, then delete the source tree and the archive.
RUN wget http://nlp.ist.i.kyoto-u.ac.jp/nl-resource/juman/juman-7.01.tar.bz2 \
&& tar xvf juman-7.01.tar.bz2 \
&& cd juman-7.01 \
&& ./configure \
&& make \
&& make install \
&& cd .. \
&& rm -rf juman-7.01 \
&& rm juman-7.01.tar.bz2
# Juman++
# Build JUMAN++ 1.02 with temporary build packages, remove them afterwards and add the boost package.
RUN apk add --update --no-cache --virtual=build-deps \
boost-dev g++ make \
&& wget -q http://lotus.kuee.kyoto-u.ac.jp/nl-resource/jumanpp/jumanpp-1.02.tar.xz \
&& tar Jxfv jumanpp-1.02.tar.xz \
&& cd jumanpp-1.02/ \
&& ./configure \
&& make \
&& make install \
&& cd .. \
&& rm jumanpp-1.02.tar.xz \
&& rm -rf /var/cache/* \
&& apk del build-deps \
&& apk add --update --no-cache boost
# kytea
# Build KyTea 0.4.7 from source (the archive and source tree are left in place).
RUN wget http://www.phontron.com/kytea/download/kytea-0.4.7.tar.gz \
&& tar -xvf kytea-0.4.7.tar.gz \
&& cd kytea-0.4.7 \
&& ./configure \
&& make \
&& make install
# Python
# Install Miniconda into /opt/conda and link its executables into /usr/local/bin.
# NOTE(review): --no-check-certificate disables TLS certificate verification for the installer download.
RUN apk add --no-cache bash wget && \
wget -q --no-check-certificate https://repo.continuum.io/miniconda/$MINICONDA && \
bash $MINICONDA -b -p /opt/conda && \
ln -s /opt/conda/bin/* /usr/local/bin/ && \
rm -rf /root/.[acpw]* $MINICONDA /opt/conda/pkgs/*
RUN conda config --add channels conda-forge --system
# One conda environment per Python version used for development.
RUN conda create -y -n p27 python=2.7
RUN conda create -y -n p37 python=3.7
# Copy the requirement lists into the image and install them into the matching environment.
RUN mkdir /code
RUN mkdir /code/dev
COPY requirements_py2.txt /code/dev/requirements_py2.txt
COPY requirements_py3.txt /code/dev/requirements_py3.txt
# NOTE(review): every RUN starts a fresh shell, so the stand-alone `RUN source deactivate`
# steps do not affect any other instruction.
RUN source activate p27 && pip install -r /code/dev/requirements_py2.txt
RUN source deactivate
RUN source activate p37 && pip install -r /code/dev/requirements_py3.txt
RUN source deactivate
CMD ["/bin/bash"]
================================================
FILE: test/__init__.py
================================================
__author__ = 'kensuke-mi'
================================================
FILE: test/common/__init__.py
================================================
================================================
FILE: test/common/test_server_handler.py
================================================
#! -*- coding: utf-8 -*-
# test module
from JapaneseTokenizer.common import sever_handler
# client module
import six
if six.PY2:
from JapaneseTokenizer.jumanpp_wrapper.__jumanpp_wrapper_python2 import JumanppWrapper
else:
from JapaneseTokenizer.jumanpp_wrapper.__jumanpp_wrapper_python3 import JumanppWrapper
# else
import sys
import unittest
import os
import time
__author__ = 'kensuke-mi'
class TestServerHandler(unittest.TestCase):
    """Tests for the Juman++ process handler provided by ``sever_handler``.

    Every test that starts a Juman++ process stops it again in a ``finally``
    block, so a failing assertion does not leave the process behind.
    """

    @classmethod
    def setUpClass(cls):
        """Prepare the sample sentence and the path of the jumanpp executable."""
        if six.PY3:
            cls.test_senetence = '紗倉 まな(さくらまな、1993年3月23日 - )は、日本のAV女優。'
        else:
            cls.test_senetence = u'紗倉 まな(さくらまな、1993年3月23日 - )は、日本のAV女優。'
        cls.jumanpp_command = "/usr/local/bin/jumanpp"

    def test_jumanpp_process_hanlder_normal(self):
        """It tests jumanpp process handler"""
        # normal test #
        jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command)
        try:
            result_jumanpp_analysis = jumanpp_process_handler.query(input_string=self.test_senetence)
            self.assertIsInstance(result_jumanpp_analysis, six.text_type)
        finally:
            ## stop process ##
            jumanpp_process_handler.stop_process()

    def test_jumanpp_process_handler_timeout_exception(self):
        """It tests the case which causes timeout exception"""
        jumanpp_process_handler = None
        try:
            with self.assertRaises(Exception):
                jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command,
                                                                       timeout_second=1)
                jumanpp_process_handler.query(input_string=self.test_senetence*100)
        finally:
            # The handler is unbound when the constructor itself raised.
            if jumanpp_process_handler is not None:
                jumanpp_process_handler.stop_process()

    def test_jumanpp_process_handler_init_exception(self):
        """It tests that a non-existing command makes the constructor raise"""
        with self.assertRaises(Exception):
            sever_handler.JumanppHnadler(jumanpp_command='hoge',
                                         timeout_second=1)

    def test_jumanpp_process_handler_huge_request(self):
        """It tests the case where a user sends too much request"""
        input_huge_request = [self.test_senetence] * 100
        jumanpp_process_handler = sever_handler.JumanppHnadler(jumanpp_command=self.jumanpp_command)
        try:
            seq_result_jumanpp_analysis = [jumanpp_process_handler.query(input_string=sentence)
                                           for sentence in input_huge_request]
            self.assertIsInstance(seq_result_jumanpp_analysis, list)
        finally:
            # This test used to leave the Juman++ process running.
            jumanpp_process_handler.stop_process()
if __name__ == '__main__':
unittest.main()
================================================
FILE: test/docker-compose-dev.yml
================================================
# Compose file that sets up the Docker environments for development/testing in one go.
version: '3'
services:
  dev_env_py2:
    build:
      context: ./
      dockerfile: Dockerfile-dev
    volumes:
      - ..:/codes/
    stdin_open: true
    tty: true
    # The repository is mounted at /codes/, so the requirements file is given by
    # its absolute path; a bare file name would be looked up in the image's
    # working directory, where it does not exist.
    command: bash -c "source /opt/conda/bin/activate p27 && pip install -r /codes/test/requirements_py2.txt"
  dev_env_py3:
    build:
      context: ./
      dockerfile: Dockerfile
    volumes:
      - ..:/codes/
    stdin_open: true
    tty: true
    command: bash -c "source /opt/conda/bin/activate p37 && pip install -r /codes/test/requirements_py3.txt"
================================================
FILE: test/docker-compose.yml
================================================
# Compose file that sets up the Docker environment for development/testing in one go.
version: '3'
services:
test_env:
build:
context: ./
dockerfile: Dockerfile
volumes:
# Mount the repository root at /codes/ inside the container.
- ..:/codes/
stdin_open: true
tty: true
# Run `juman -S`, then `python setup.py test` in the p37 environment followed by the p27 environment.
command: bash -c "juman -S && source /opt/conda/bin/activate p37 && cd /codes/ && python setup.py test && source deactivate && echo 'Python3 test done' && source /opt/conda/bin/activate p27 && cd /codes/ && python setup.py test && echo 'Python2 test done'"
================================================
FILE: test/requirements_py2.txt
================================================
pypandoc
future
six
jaconv>=0.2
pip>=8.1.0
pexpect
pyknp>=0.4.1
mecab-python
typing
neologdn
kytea
================================================
FILE: test/requirements_py3.txt
================================================
pypandoc
future
six
jaconv>=0.2
pip>=8.1.0
pexpect
pyknp
mecab-python3
neologdn
kytea
================================================
FILE: test/resources/test/userdict.csv
================================================
さくらまな,-1,-1,-400,名詞,一般,*,*,*,*,さくらまな,*,*,*
================================================
FILE: test/test_all.py
================================================
__author__ = 'kensuke-mi'
import sys
import unittest
import six
python_version = sys.version_info
def suite():
    """Build the default test suite for the running Python major version.

    The suite holds the filter, KyTea, MeCab and Juman test cases (in that
    order). Test modules are imported lazily so that only the modules written
    for the current interpreter are loaded.

    :return: unittest.TestSuite containing the collected tests
    """
    # `unittest.makeSuite` is deprecated and was removed in Python 3.13; the
    # default loader collects the same `test*` methods on every version.
    load_tests = unittest.defaultTestLoader.loadTestsFromTestCase
    test_suite = unittest.TestSuite()
    if six.PY3:
        from .test_filter_python3 import TestFilter
        from .test_mecab_wrapper_python3 import TestMecabWrapperPython3
        from .test_kytea_wrapper_python3 import TestKyteaWrapperPython3
        from .test_juman_wrapper_python3 import TestJumanWrapperPython3
        test_cases = [TestFilter, TestKyteaWrapperPython3, TestMecabWrapperPython3, TestJumanWrapperPython3]
    elif six.PY2:
        from .test_filter_python2 import TestFilter
        from .test_mecab_wrapper_python2 import TestMecabWrapperPython2
        from .test_juman_wrapper_python2 import TestJumanWrapperPython2
        from .test_kytea_wrapper_python2 import TestKyteaWrapperPython2
        test_cases = [TestFilter, TestKyteaWrapperPython2, TestMecabWrapperPython2, TestJumanWrapperPython2]
    else:
        test_cases = []

    for test_case in test_cases:
        test_suite.addTest(load_tests(test_case))
    return test_suite
def suite_with_jumanpp():
    """Build the default suite and add the Juman++ wrapper tests to it.

    :return: unittest.TestSuite containing the default tests plus the Juman++ tests
    """
    load_tests = unittest.defaultTestLoader.loadTestsFromTestCase
    suite_obj = suite()
    # `addTest` returns None, so it must be called exactly once per test case;
    # nesting the call handed None to the outer `addTest`, which raises TypeError.
    if six.PY3:
        from .test_jumanpp_wrapper_python3 import TestJumanppWrapperPython3
        suite_obj.addTest(load_tests(TestJumanppWrapperPython3))
    elif six.PY2:
        from .test_jumanpp_wrapper_python2 import TestJumanppWrapperPython2
        suite_obj.addTest(load_tests(TestJumanppWrapperPython2))
    return suite_obj
================================================
FILE: test/test_filter_python2.py
================================================
#! -*- coding: utf-8 -*-
import sys
import unittest
from JapaneseTokenizer.mecab_wrapper import MecabWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence, FilteredObject, TokenizedResult
import os
__author__ = 'kensuke-mi'
class TestFilter(unittest.TestCase):
    """Checks that filtering MeCab output by POS condition and stopwords keeps only expected tokens."""

    def setUp(self):
        """Prepare the sample sentence, stopword list, POS conditions and user-dictionary path."""
        self.test_senetence = u'紗倉 まなは、日本のAV女優みたいで、うつくしい。\nそこで、ぼくはその1枚のはなやかな作品を見たいと思った。'
        # NOTE(review): the literals below carry no `u` prefix, so under Python 2 they are
        # byte strings while the sentence is unicode - confirm the filter normalizes them.
        self.stopword = ['AV']
        self.pos_condition = [('名詞', '一般',), ('名詞', '固有名詞'), ('形容詞', '自立',), ('助詞', '格助詞', '引用')]
        self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')

    def test_filtering(self):
        """Every token left after filtering must match an allowed POS and none of the excluded ones."""
        mecab_obj = MecabWrapper(dictType='ipadic')
        tokenized_sentence = mecab_obj.tokenize(sentence=self.test_senetence, is_feature=True).\
            filter(pos_condition=self.pos_condition, stopwords=self.stopword)
        # unittest assertions are used instead of bare `assert`, which is stripped under `python -O`.
        self.assertIsInstance(tokenized_sentence, TokenizedSenetence)
        seq_except_pos = [(u'動詞',), (u'名詞', u'代名詞'), (u'名詞', u'接尾')]
        seq_match_pos = [(u'名詞',), (u'名詞', u'固有名詞',), (u'形容詞',), (u'形容詞', u'自立'), (u'助詞', u'格助詞', u'引用')]
        for token_obj in tokenized_sentence.tokenized_objects:
            self.assertIsInstance(token_obj, TokenizedResult)
            pos_tuple = token_obj.tuple_pos
            # POS tags that must NOT appear in the result #
            for except_pos in seq_except_pos:
                self.assertFalse(set(except_pos).issubset(set(pos_tuple)))
            # POS tags of which at least one must appear in the result #
            bool_any = any(set(match_pos).issubset(set(pos_tuple)) for match_pos in seq_match_pos)
            self.assertTrue(bool_any)
if __name__ == '__main__':
unittest.main()
================================================
FILE: test/test_filter_python3.py
================================================
#! -*- coding: utf-8 -*-
import sys
import unittest
from JapaneseTokenizer.mecab_wrapper import MecabWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence, FilteredObject, TokenizedResult
import os
__author__ = 'kensuke-mi'
class TestFilter(unittest.TestCase):
    """Checks that filtering MeCab output by POS condition and stopwords keeps only expected tokens."""

    def setUp(self):
        """Prepare the sample sentence, stopword list, POS conditions and user-dictionary path."""
        self.test_senetence = '紗倉 まなは、日本のAV女優みたいで、うつくしい。そこで、ぼくはその1枚のはなやかな作品を見たいと思った。'
        self.stopword = ['AV', '女優']
        self.pos_condition = [('名詞', '一般',), ('名詞', '固有名詞'), ('形容詞', '自立',), ('助詞', '格助詞', '引用')]
        self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')

    def test_filtering(self):
        """Every remaining token must match an allowed POS, no excluded POS, and no stopword."""
        mecab_obj = MecabWrapper(dictType='ipadic')
        tokenized_sentence = mecab_obj.tokenize(sentence=self.test_senetence, is_feature=True).\
            filter(pos_condition=self.pos_condition, stopwords=self.stopword)
        # unittest assertions are used instead of bare `assert`, which is stripped under `python -O`.
        self.assertIsInstance(tokenized_sentence, TokenizedSenetence)
        seq_except_pos = [('動詞',), ('名詞', '代名詞'), ('名詞', '接尾')]
        seq_match_pos = [('名詞',), ('名詞', '固有名詞',), ('形容詞',), ('形容詞', '自立'), ('助詞', '格助詞', '引用')]
        for token_obj in tokenized_sentence.tokenized_objects:
            self.assertIsInstance(token_obj, TokenizedResult)
            pos_tuple = token_obj.tuple_pos
            # POS tags that must NOT appear in the result #
            for except_pos in seq_except_pos:
                self.assertFalse(set(except_pos).issubset(set(pos_tuple)))
            # POS tags of which at least one must appear in the result #
            bool_any = any(set(match_pos).issubset(set(pos_tuple)) for match_pos in seq_match_pos)
            self.assertTrue(bool_any)
            # stopword check
            self.assertNotIn(token_obj.word_stem, self.stopword)
if __name__ == '__main__':
unittest.main()
================================================
FILE: test/test_juman_wrapper_python2.py
================================================
#-*- encoding: utf-8 -*-
# this test file does not work under pycharm
# do your test with command line
from __future__ import absolute_import
from __future__ import division
from future.utils import string_types, text_type
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
from JapaneseTokenizer.juman_wrapper import JumanWrapper
import pyknp
import unittest
import sys
import codecs
import logging
# Wrap stdin/stdout with UTF-8 codecs for this Python 2 test module.
sys.stdin = codecs.getreader('utf_8')(sys.stdin)
sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
logger = logging.getLogger(__file__)
# With the level at INFO, the logger.debug() calls in the tests below are
# filtered out by this logger.
logger.level = logging.INFO
class TestJumanWrapperPython2(unittest.TestCase):
def setUp(self):
    """No shared fixture is prepared; each test creates the objects it needs."""
    return None
def test_juman_wrapper(self):
    """Smoke-test pyknp's Juman binding directly.

    The test is reported as skipped (instead of silently passing) when
    ``pyknp.Juman`` cannot be imported.
    """
    # Keep the `try` body minimal: only the import can raise ImportError.
    try:
        from pyknp import Juman
    except ImportError:
        logger.debug('skip test_juman_wrapper')
        self.skipTest('pyknp is not available')

    juman = Juman(command='juman', jumanpp=False)
    result = juman.analysis(u"これはペンです。")
    logger.debug(','.join(mrph.midasi for mrph in result))

    for mrph in result.mrph_list():
        # unittest assertion instead of bare `assert`, which is stripped under `python -O`.
        self.assertIsInstance(mrph, pyknp.Morpheme)
        logger.debug(u"見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s" \
        % (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
def test_tokenize(self):
"""This test case checks juman_wrapper.tokenize
"""
logger.debug (u'Tokenize Test')
test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
juman_wrapper = JumanWrapper()
token_objects = juman_wrapper.tokenize(sentence=test_sentence,
return_list=False,
is_feature=True)
assert isinstance(token_objects, TokenizedSenetence)
for t_obj in token_objects.tokenized_objects:
assert isinstance(t_obj, TokenizedResult)
logger.debug(u"word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format(
t_obj.word_surface,
t_obj.word_stem,
' '.join(t_obj.tuple_pos),
t_obj.misc_info
))
assert isinstance(t_obj.word_surface, string_types)
assert isinstance(t_obj.word_stem, string_types)
assert isinstance(t_obj.tuple_pos, tuple)
assert isinstance(t_obj.misc_info, dict)
token_objects_list = token_objects.convert_list_object()
assert isinstance(token_objects_list, list)
logger.debug('-'*30)
for stem_posTuple in token_objects_list:
assert isinstance(stem_posTuple, tuple)
word_stem = stem_posTuple[0]
word_posTuple = stem_posTuple[1]
assert isinstance(word_stem, string_types)
assert isinstance(word_posTuple, tuple)
logger.debug(u'word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))
def test_filter_pos(self):
"""
"""
logger.debug (u'Filtering Test. POS condition is only 名詞')
test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
juman_wrapper = JumanWrapper()
token_objects = juman_wrapper.tokenize(sentence=test_sentence,
return_list=False,
is_feature=True
)
pos_condition = [(u'名詞', )]
filtered_result = juman_wrapper.filter(
parsed_sentence=token_objects,
pos_condition=pos_condition
)
assert isinstance(filtered_result, FilteredObject)
for t_obj in filtered_result.tokenized_objects:
assert isinstance(t_obj, TokenizedResult)
logger.debug(u"word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format(
t_obj.word_surface,
t_obj.word_stem,
' '.join(t_obj.tuple_pos),
t_obj.misc_info
))
assert isinstance(t_obj.word_surface, string_types)
assert isinstance(t_obj.word_stem, string_types)
assert isinstance(t_obj.tuple_pos, tuple)
assert isinstance(t_obj.misc_info, dict)
assert t_obj.tuple_pos[0] == u'名詞'
logger.debug('-'*30)
for stem_posTuple in filtered_result.convert_list_object():
assert isinstance(stem_posTuple, tuple)
word_stem = stem_posTuple[0]
word_posTuple = stem_posTuple[1]
assert isinstance(word_stem, string_types)
assert isinstance(word_posTuple, tuple)
logger.debug(u'word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))
def test_stopwords(self):
stopword = [u'AV', u'女優']
logger.debug (u'Stopwords Filtering Test. Stopwords is {}'.format(u','.join(stopword)))
test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
juman_wrapper = JumanWrapper()
token_objects = juman_wrapper.tokenize(sentence=test_sentence,
return_list=False,
is_feature=True
)
filtered_result = juman_wrapper.filter(
parsed_sentence=token_objects,
stopwords=stopword
)
check_flag = True
for stem_posTuple in filtered_result.convert_list_object():
assert isinstance(stem_posTuple, tuple)
word_stem = stem_posTuple[0]
word_posTuple = stem_posTuple[1]
assert isinstance(word_stem, string_types)
assert isinstance(word_posTuple, tuple)
logger.debug(u'word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))
if word_stem in stopword: check_flag = False
assert check_flag
def test_juman_server_mode(self):
### test with server mode ###
### Attention: this method causes Error if you don't start JUMAN SERVER mode ###
test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
juman_wrapper = JumanWrapper(server='localhost', port=32000)
token_objects = juman_wrapper.tokenize(sentence=test_sentence,
return_list=False,
is_feature=True)
self.assertTrue(isinstance(token_objects, TokenizedSenetence))
list_tokens = juman_wrapper.tokenize(sentence=test_sentence,
return_list=True,
is_feature=True)
self.assertTrue(isinstance(list_tokens, list))
if __name__ == '__main__':
unittest.main()
================================================
FILE: test/test_juman_wrapper_python3.py
================================================
#-*- encoding: utf-8 -*-
# this test file does not work under pycharm
# do your test with command line
from pyknp import Juman
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
from JapaneseTokenizer.juman_wrapper import JumanWrapper
import pyknp
import unittest
import os
import logging
import socket
logger = logging.getLogger(__file__)
logger.level = logging.INFO
class TestJumanWrapperPython3(unittest.TestCase):
def setUp(self):
# this is under MacOSX10
self.path_to_juman_command = '/usr/local/bin/juman'
if not os.path.exists(self.path_to_juman_command): self.path_to_juman_command = 'juman'
def test_juman_wrapper(self):
try:
juman = Juman(command=self.path_to_juman_command)
result = juman.analysis("これはペンです。")
logger.debug(','.join(mrph.midasi for mrph in result))
for mrph in result.mrph_list():
assert isinstance(mrph, pyknp.Morpheme)
logger.debug("見出し:%s, 読み:%s, 原形:%s, 品詞:%s, 品詞細分類:%s, 活用型:%s, 活用形:%s, 意味情報:%s, 代表表記:%s" \
% (mrph.midasi, mrph.yomi, mrph.genkei, mrph.hinsi, mrph.bunrui, mrph.katuyou1, mrph.katuyou2, mrph.imis, mrph.repname))
except ImportError:
print('skip test_juman_wrapper')
def test_tokenize(self):
"""This test case checks juman_wrapper.tokenize
"""
logger.debug('Tokenize Test')
test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
juman_wrapper = JumanWrapper(command=self.path_to_juman_command)
token_objects = juman_wrapper.tokenize(sentence=test_sentence,
return_list=False,
is_feature=True)
assert isinstance(token_objects, TokenizedSenetence)
for t_obj in token_objects.tokenized_objects:
assert isinstance(t_obj, TokenizedResult)
logger.debug("word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format(
t_obj.word_surface,
t_obj.word_stem,
' '.join(t_obj.tuple_pos),
t_obj.misc_info
))
assert isinstance(t_obj.word_surface, str)
assert isinstance(t_obj.word_stem, str)
assert isinstance(t_obj.tuple_pos, tuple)
assert isinstance(t_obj.misc_info, dict)
token_objects_list = token_objects.convert_list_object()
assert isinstance(token_objects_list, list)
logger.debug('-'*30)
for stem_posTuple in token_objects_list:
assert isinstance(stem_posTuple, tuple)
word_stem = stem_posTuple[0]
word_posTuple = stem_posTuple[1]
assert isinstance(word_stem, str)
assert isinstance(word_posTuple, tuple)
logger.debug('word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))
def test_filter_pos(self):
"""POS filteringのテスト
"""
logger.debug('Filtering Test. POS condition is only 名詞')
test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
juman_wrapper = JumanWrapper(command=self.path_to_juman_command)
token_objects = juman_wrapper.tokenize(sentence=test_sentence,
return_list=False,
is_feature=True)
pos_condition = [('名詞', )]
filtered_result = juman_wrapper.filter(
parsed_sentence=token_objects,
pos_condition=pos_condition
)
assert isinstance(filtered_result, FilteredObject)
for t_obj in filtered_result.tokenized_objects:
assert isinstance(t_obj, TokenizedResult)
logger.debug("word_surafce:{}, word_stem:{}, pos_tuple:{}, misc_info:{}".format(
t_obj.word_surface,
t_obj.word_stem,
' '.join(t_obj.tuple_pos),
t_obj.misc_info
))
assert isinstance(t_obj.word_surface, str)
assert isinstance(t_obj.word_stem, str)
assert isinstance(t_obj.tuple_pos, tuple)
assert isinstance(t_obj.misc_info, dict)
assert t_obj.tuple_pos[0] == '名詞'
logger.debug('-'*30)
for stem_posTuple in filtered_result.convert_list_object():
assert isinstance(stem_posTuple, tuple)
word_stem = stem_posTuple[0]
word_posTuple = stem_posTuple[1]
assert isinstance(word_stem, str)
assert isinstance(word_posTuple, tuple)
logger.debug('word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))
def test_stopwords(self):
"""stopword除去のテスト"""
stopword = ['AV', '女優']
logger.debug ('Stopwords Filtering Test. Stopwords is {}'.format(','.join(stopword)))
test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
juman_wrapper = JumanWrapper(command=self.path_to_juman_command)
token_objects = juman_wrapper.tokenize(sentence=test_sentence,
return_list=False,
is_feature=True
)
filtered_result = juman_wrapper.filter(
parsed_sentence=token_objects,
stopwords=stopword
)
check_flag = True
for stem_posTuple in filtered_result.convert_list_object():
assert isinstance(stem_posTuple, tuple)
word_stem = stem_posTuple[0]
word_posTuple = stem_posTuple[1]
assert isinstance(word_stem, str)
assert isinstance(word_posTuple, tuple)
logger.debug('word_stem:{} word_pos:{}'.format(word_stem, ' '.join(word_posTuple)))
if word_stem in stopword: check_flag = False
assert check_flag
def test_juman_severmode(self):
"""* What you can do
- juman server modeのテストを実施する
"""
logger.debug('Tokenize test with server mode')
test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
# check socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
HOST = 'localhost'
PORT = 32000
try:
s.connect((HOST, PORT))
s.close()
except:
logger.warning("SKip server mode test because server is not working.")
else:
juman_wrapper = JumanWrapper(command=self.path_to_juman_command, server=HOST, port=PORT)
token_objects = juman_wrapper.tokenize(sentence=test_sentence,
return_list=False,
is_feature=True)
assert isinstance(token_objects, TokenizedSenetence)
test_sentence = "ペルシア語(ペルシアご、ペルシア語: فارسی, پارسی; Fārsī, Pārsī)は、イランを中心とする中東地域で話される言語。"
juman_wrapper = JumanWrapper(command=self.path_to_juman_command, server=HOST, port=PORT)
list_token = juman_wrapper.tokenize(sentence=test_sentence,
return_list=True,
is_feature=True)
assert isinstance(list_token, list)
if __name__ == '__main__':
unittest.main()
================================================
FILE: test/test_jumanpp_wrapper_python2.py
================================================
#-*- encoding: utf-8 -*-
# this test file does not work under pycharm
# do your test with command line
from pyknp import Juman
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
from JapaneseTokenizer.jumanpp_wrapper.jumanpp_wrapper import JumanppWrapper, JumanppClient
from JapaneseTokenizer.common.sever_handler import JumanppHnadler
import pyknp
import socket
import unittest
import os
import logging
logger = logging.getLogger(__file__)
logger.level = logging.INFO
class TestJumanppWrapperPython2(unittest.TestCase):
def setUp(self):
# this is under MacOSX10
self.path_to_juman_command = '/usr/local/bin/jumanpp'
if not os.path.exists(self.path_to_juman_command): self.path_to_juman_command = 'jumanpp'
def test_JumanppClient(self):
test_sentence = u'外国人参政権を欲しい。'
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
HOST = 'localhost'
PORT = 12000
try:
s.connect((HOST, PORT))
s.close()
except:
logger.warning("SKip server mode test because server is not working.")
else:
client_obj = JumanppClient(hostname='localhost', port=12000)
res = client_obj.query(sentence=test_sentence, pattern=r'EOS')
del res
def test_jumanpp_servermode(self):
### test with list return object ###
test_sentence = u'外国人参政権を欲しい。'
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
HOST = 'localhost'
PORT = 12000
try:
s.connect((HOST, PORT))
s.close()
except:
logger.warning("SKip server mode test because server is not working.")
else:
jumanpp_tokenizer = JumanppWrapper(server='localhost', port=12000)
list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)
assert isinstance(list_tokens, list)
### test with TokenizedSenetence return object ###
tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False)
assert isinstance(tokenized_obj, TokenizedSenetence)
### test with TokenizedSenetence return object and filter by chain expression ###
pos_condtion = [('名詞', )]
filtered_res = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False).filter(pos_condition=pos_condtion)
assert isinstance(filtered_res, FilteredObject)
assert isinstance(filtered_res.convert_list_object(), list)
def test_jumanpp_servermode_stress(self):
### test with severmode with much stress ###
test_sentence = u'外国人参政権を欲しい。'
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
HOST = 'localhost'
PORT = 12000
try:
s.connect((HOST, PORT))
s.close()
except:
logger.warning("SKip server mode test because server is not working.")
else:
jumanpp_tokenizer = JumanppWrapper(server='localhost', port=12000)
for i in range(0, 1000):
list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)
assert isinstance(list_tokens, list)
assert u'外国' in test_sentence
del jumanpp_tokenizer
def test_jumanpp_localmode_pyexpect(self):
test_sentence = u'外国人参政権を欲しい。'
jumanpp_tokenizer = JumanppWrapper(command=self.path_to_juman_command, is_use_pyknp=False)
self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))
list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)
assert isinstance(list_tokens, list)
jumanpp_tokenizer = JumanppWrapper(command=self.path_to_juman_command, is_use_pyknp=False)
self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))
tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False)
assert isinstance(tokenized_obj, TokenizedSenetence)
def test_jumanpp_huge_amount_text(self):
"""pexpectを利用した大量テキスト処理 & テキスト処理中のプロセス再起動"""
logger.info('under testing of processing huge amount of text...')
seq_test_sentence = [u'外国人参政権を欲しい。'] * 500
jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)
self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))
for i, test_s in enumerate(seq_test_sentence):
tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_s)
self.assertTrue(isinstance(tokenized_obj, TokenizedSenetence))
if not i == 0 and i % 100 == 0:
"""強制的にプロセスを殺して再起動"""
logger.info('It forces stop unix process.')
jumanpp_tokenizer.jumanpp_obj.restart_process()
else:
pass
if __name__ == '__main__':
unittest.main()
================================================
FILE: test/test_jumanpp_wrapper_python3.py
================================================
#-*- encoding: utf-8 -*-
# this test file does not work under pycharm
# do your test with command line
from pyknp import Juman
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
from JapaneseTokenizer.jumanpp_wrapper.jumanpp_wrapper import JumanppWrapper, JumanppClient
from JapaneseTokenizer.common.sever_handler import JumanppHnadler
import pyknp
import unittest
import os
import logging
import socket
logger = logging.getLogger(__file__)
logger.level = logging.INFO
class TestJumanppWrapperPython3(unittest.TestCase):
def setUp(self):
# this is under MacOSX10
self.path_to_juman_command = '/usr/local/bin/jumanpp'
if not os.path.exists(self.path_to_juman_command): self.path_to_juman_command = 'jumanpp'
def test_JumanppClient(self):
test_sentence = '外国人参政権を欲しい。'
# check socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
HOST = 'localhost'
PORT = 12000
try:
s.connect((HOST, PORT))
s.close()
except:
logger.warning("SKip server mode test because server is not working.")
else:
jumanpp_tokenizer = JumanppWrapper(server=HOST, port=PORT)
client_obj = JumanppClient(hostname='localhost', port=12000)
res = client_obj.query(sentence=test_sentence, pattern=rb'EOS')
del res
def test_jumanpp_servermode(self):
### test with list return object ###
test_sentence = '外国人参政権を欲しい。'
# check socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
HOST = 'localhost'
PORT = 12000
try:
s.connect((HOST, PORT))
s.close()
except:
logger.warning(msg='SKip server mode test because server is not working.')
else:
jumanpp_tokenizer = JumanppWrapper(server=HOST, port=PORT)
list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)
assert isinstance(list_tokens, list)
### test with TokenizedSenetence return object ###
tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False)
assert isinstance(tokenized_obj, TokenizedSenetence)
### test with TokenizedSenetence return object and filter by chain expression ###
pos_condtion = [('名詞',)]
filtered_res = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False).filter(
pos_condition=pos_condtion)
assert isinstance(filtered_res, FilteredObject)
assert isinstance(filtered_res.convert_list_object(), list)
def test_jumanpp_servermode_stress(self):
### test with severmode with much stress ###
test_sentence = '外国人参政権を欲しい。'
# check socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
HOST = 'localhost'
PORT = 12000
try:
s.connect((HOST, PORT))
s.close()
except:
logger.warning(msg='SKip server mode test because server is not working.')
else:
jumanpp_tokenizer = JumanppWrapper(server='localhost', port=12000)
for i in range(0, 1000):
list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)
assert isinstance(list_tokens, list)
assert '外国' in test_sentence
del jumanpp_tokenizer
def test_jumanpp_localmode_pyexpect(self):
"""pexpectを使ったプロセス呼び出しのテスト"""
test_sentence = '外国人参政権を欲しい。'
jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)
self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))
list_tokens = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=True)
assert isinstance(list_tokens, list)
jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)
self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))
tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_sentence, return_list=False)
assert isinstance(tokenized_obj, TokenizedSenetence)
def test_jumanpp_huge_amount_text(self):
"""pexpectを利用した大量テキスト処理 & テキスト処理中のプロセス再起動"""
logger.info('under testing of processing huge amount of text...')
seq_test_sentence = ['外国人参政権を欲しい。'] * 500
jumanpp_tokenizer = JumanppWrapper(is_use_pyknp=False, command=self.path_to_juman_command)
self.assertTrue(isinstance(jumanpp_tokenizer.jumanpp_obj, JumanppHnadler))
for i, test_s in enumerate(seq_test_sentence):
tokenized_obj = jumanpp_tokenizer.tokenize(sentence=test_s)
self.assertTrue(isinstance(tokenized_obj, TokenizedSenetence))
if not i == 0 and i % 100 == 0:
"""強制的にプロセスを殺して再起動"""
logger.info('It forces stop unix process.')
jumanpp_tokenizer.jumanpp_obj.restart_process()
else:
pass
if __name__ == '__main__':
unittest.main()
================================================
FILE: test/test_kytea_wrapper_python2.py
================================================
# -*- coding: utf-8 -*-
from JapaneseTokenizer.kytea_wrapper import KyteaWrapper
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
import unittest
class TestKyteaWrapperPython2(unittest.TestCase):
def setUp(self):
pass
def test_tokenization(self):
input_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
kytea_wrapper = KyteaWrapper()
tokenized_result = kytea_wrapper.tokenize(
sentence=input_sentence,
normalize=True,
return_list=False,
is_feature=True
)
assert isinstance(tokenized_result, TokenizedSenetence)
for t_obj in tokenized_result.tokenized_objects:
assert isinstance(t_obj, TokenizedResult)
print('-'*30)
tokenized_result_list = tokenized_result.convert_list_object()
assert isinstance(tokenized_result_list, list)
for t_obj_tuple in tokenized_result_list:
assert isinstance(t_obj_tuple, tuple)
def test_filter_pos(self):
"""
"""
print (u'Filtering Test. POS condition is only 名詞')
test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
kytea_wrapper = KyteaWrapper()
tokenized_result = kytea_wrapper.tokenize(
sentence=test_sentence,
normalize=True,
return_list=False,
is_feature=True
)
pos_condition = [(u'名詞', )]
filtered_result = kytea_wrapper.filter(
parsed_sentence=tokenized_result,
pos_condition=pos_condition
)
assert isinstance(filtered_result, FilteredObject)
for t_obj in filtered_result.tokenized_objects:
assert isinstance(t_obj, TokenizedResult)
assert isinstance(t_obj.word_surface, unicode)
assert isinstance(t_obj.word_stem, unicode)
assert isinstance(t_obj.tuple_pos, tuple)
assert isinstance(t_obj.misc_info, dict)
assert t_obj.tuple_pos[0] == u'名詞'
print('-'*30)
for stem_posTuple in filtered_result.convert_list_object():
assert isinstance(stem_posTuple, tuple)
word_stem = stem_posTuple[0]
word_posTuple = stem_posTuple[1]
assert isinstance(word_stem, unicode)
assert isinstance(word_posTuple, tuple)
def test_stopwords(self):
stopword = [u'女優']
print (u'Stopwords Filtering Test. Stopwords is {}'.format(u','.join(stopword)))
test_sentence = u"紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
kytea_wrapper = KyteaWrapper()
token_objects = kytea_wrapper.tokenize(sentence=test_sentence,
return_list=False,
is_feature=True
)
filtered_result = kytea_wrapper.filter(
parsed_sentence=token_objects,
stopwords=stopword
)
check_flag = True
print('-'*30)
for stem_posTuple in filtered_result.convert_list_object():
assert isinstance(stem_posTuple, tuple)
word_stem = stem_posTuple[0]
word_posTuple = stem_posTuple[1]
assert isinstance(word_stem, unicode)
assert isinstance(word_posTuple, tuple)
if word_stem in stopword:
check_flag = False
assert check_flag
if __name__ == '__main__':
unittest.main()
================================================
FILE: test/test_kytea_wrapper_python3.py
================================================
# -*- coding: utf-8 -*-
from JapaneseTokenizer.kytea_wrapper import KyteaWrapper
from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence, FilteredObject
import unittest
class TestKyteaWrapperPython3(unittest.TestCase):
def setUp(self):
pass
def test_tokenization(self):
input_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
kytea_wrapper = KyteaWrapper()
tokenized_result = kytea_wrapper.tokenize(
sentence=input_sentence,
normalize=True,
return_list=False,
is_feature=True
)
assert isinstance(tokenized_result, TokenizedSenetence)
for t_obj in tokenized_result.tokenized_objects:
assert isinstance(t_obj, TokenizedResult)
#print('-'*30)
tokenized_result_list = tokenized_result.convert_list_object()
assert isinstance(tokenized_result_list, list)
for t_obj_tuple in tokenized_result_list:
assert isinstance(t_obj_tuple, tuple)
def test_filter_pos(self):
"""
"""
# 'Filtering Test. POS condition is only 名詞')
test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
kytea_wrapper = KyteaWrapper()
tokenized_result = kytea_wrapper.tokenize(
sentence=test_sentence,
normalize=True,
return_list=False,
is_feature=True
)
pos_condition = [('名詞', )]
filtered_result = kytea_wrapper.filter(
parsed_sentence=tokenized_result,
pos_condition=pos_condition
)
assert isinstance(filtered_result, FilteredObject)
for t_obj in filtered_result.tokenized_objects:
assert isinstance(t_obj, TokenizedResult)
assert isinstance(t_obj.word_surface, str)
assert isinstance(t_obj.word_stem, str)
assert isinstance(t_obj.tuple_pos, tuple)
assert isinstance(t_obj.misc_info, dict)
assert t_obj.tuple_pos[0] == '名詞'
for stem_posTuple in filtered_result.convert_list_object():
assert isinstance(stem_posTuple, tuple)
word_stem = stem_posTuple[0]
word_posTuple = stem_posTuple[1]
assert isinstance(word_stem, str)
assert isinstance(word_posTuple, tuple)
def test_stopwords(self):
stopword = ['女優']
# ('Stopwords Filtering Test. Stopwords is {}'.format(','.join(stopword)))
test_sentence = "紗倉 まな(さくら まな、1993年3月23日 - )は、日本のAV女優。"
kytea_wrapper = KyteaWrapper()
token_objects = kytea_wrapper.tokenize(sentence=test_sentence,
return_list=False,
is_feature=True
)
filtered_result = kytea_wrapper.filter(
parsed_sentence=token_objects,
stopwords=stopword
)
check_flag = True
for stem_posTuple in filtered_result.convert_list_object():
assert isinstance(stem_posTuple, tuple)
word_stem = stem_posTuple[0]
word_posTuple = stem_posTuple[1]
assert isinstance(word_stem, str)
assert isinstance(word_posTuple, tuple)
if word_stem in stopword: check_flag = False
assert check_flag
if __name__ == '__main__':
unittest.main()
================================================
FILE: test/test_mecab_wrapper_python2.py
================================================
#! -*- coding: utf-8 -*-
__author__ = 'kensuke-mi'
import sys
import unittest
from JapaneseTokenizer.mecab_wrapper.mecab_wrapper import MecabWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence
from six import string_types
import os
python_version = sys.version_info
class TestMecabWrapperPython2(unittest.TestCase):
def setUp(self):
self.test_senetence = u'紗倉 まな(さくらまな、1993年3月23日 - )は、日本のAV女優。'
self.test_sentence2 = u'午前零時。午前3時。3時。'
self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')
def test_neologd_parse(self):
"""* Test case
- neologd辞書で正しく分割できることを確認する
"""
mecab_obj = MecabWrapper(dictType='neologd')
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence)
self.assertTrue(parsed_obj, TokenizedSenetence)
self.assertTrue(isinstance(parsed_obj.convert_list_object(), list))
self.assertTrue(all(isinstance(mrph, string_types) for mrph in parsed_obj.convert_list_object()))
parsed_obj = mecab_obj.tokenize(sentence=self.test_sentence2)
self.assertTrue(parsed_obj, TokenizedSenetence)
self.assertTrue(isinstance(parsed_obj.convert_list_object(), list))
self.assertTrue(all(isinstance(mrph, string_types) for mrph in parsed_obj.convert_list_object()))
def test_default_parse(self):
"""* Test case
- デフォルトの状態で動作を確認する
"""
dictType = "ipadic"
mecab_obj = MecabWrapper(dictType=dictType)
assert isinstance(mecab_obj, MecabWrapper)
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)
assert isinstance(parsed_obj, list)
if python_version >= (3, 0, 0):
for morph in parsed_obj:
assert isinstance(morph, str)
else:
for morph in parsed_obj:
assert isinstance(morph, string_types)
def test_init_userdict(self):
# test when user dictionary is called
mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict)
assert isinstance(mecab_obj, MecabWrapper)
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)
is_ok = False
for morph in parsed_obj:
if u'さくらまな' == morph:
is_ok = True
else:
pass
assert is_ok
def test_parse_jumandic(self):
with self.assertRaises(Exception):
mecab_obj = MecabWrapper(dictType='jumandic')
assert isinstance(mecab_obj, MecabWrapper)
def test_init_alldict(self):
"""* Test case
- すべての辞書を利用した場合の動作を確認する
"""
with self.assertRaises(Exception):
mecab_obj = MecabWrapper(dictType='all', pathUserDictCsv=self.path_user_dict)
assert isinstance(mecab_obj, MecabWrapper)
if __name__ == '__main__':
unittest.main()
================================================
FILE: test/test_mecab_wrapper_python3.py
================================================
#! -*- coding: utf-8 -*-
__author__ = 'kensuke-mi'
import sys
import unittest
from JapaneseTokenizer.mecab_wrapper.mecab_wrapper import MecabWrapper
from JapaneseTokenizer.datamodels import TokenizedSenetence
import os
python_version = sys.version_info
class TestMecabWrapperPython3(unittest.TestCase):
def setUp(self):
self.test_senetence = '紗倉 まな(さくらまな、1993年3月23日 - )は、日本のAV女優。'
self.test_sentence2 = '午前零時。午前3時。3時。'
self.path_user_dict = os.path.join(os.path.dirname(__file__), 'resources/test/userdict.csv')
def test_neologd_parse(self):
# test using neologd dictionary
mecab_obj = MecabWrapper(dictType='neologd')
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence)
self.assertTrue(parsed_obj, TokenizedSenetence)
self.assertTrue(isinstance(parsed_obj.convert_list_object(), list))
self.assertTrue(all(isinstance(mrph, str) for mrph in parsed_obj.convert_list_object()))
parsed_obj = mecab_obj.tokenize(sentence=self.test_sentence2)
self.assertTrue(parsed_obj, TokenizedSenetence)
self.assertTrue(isinstance(parsed_obj.convert_list_object(), list))
self.assertTrue(all(isinstance(mrph, str) for mrph in parsed_obj.convert_list_object()))
def test_default_parse(self):
# test default status
dictType = "ipadic"
mecab_obj = MecabWrapper(dictType=dictType)
assert isinstance(mecab_obj, MecabWrapper)
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=True)
assert isinstance(parsed_obj, list)
for morph in parsed_obj:
assert isinstance(morph, str)
parsed_obj = mecab_obj.tokenize(sentence=self.test_sentence2, return_list=True)
assert isinstance(parsed_obj, list)
for morph in parsed_obj:
assert isinstance(morph, str)
def test_parse_jumandic(self):
mecab_obj = MecabWrapper(dictType='jumandic')
assert isinstance(mecab_obj, MecabWrapper)
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
assert isinstance(parsed_obj, TokenizedSenetence)
for tokenized_obj in parsed_obj.tokenized_objects:
if tokenized_obj.word_stem == '女優':
# ドメイン:文化・芸術 is special output only in Jumandic
assert 'ドメイン:文化・芸術' in tokenized_obj.analyzed_line
def test_parse_userdic(self):
pass
def test_parse_dictionary_path(self):
# put path to dictionary and parse sentence.
path_default_ipadic = '/usr/local/lib/mecab/dic/mecab-ipadic-neologd'
if os.path.exists(path_default_ipadic):
mecab_obj = MecabWrapper(dictType=None, path_dictionary=path_default_ipadic)
assert mecab_obj._path_dictionary == path_default_ipadic
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
assert isinstance(parsed_obj, TokenizedSenetence)
def test_init_userdict(self):
# this test should be error response.
mecab_obj = MecabWrapper(dictType='ipadic', pathUserDictCsv=self.path_user_dict)
assert isinstance(mecab_obj, MecabWrapper)
parsed_obj = mecab_obj.tokenize(sentence=self.test_senetence, return_list=False)
assert isinstance(parsed_obj, TokenizedSenetence)
is_ok = False
for tokenized_obj in parsed_obj.tokenized_objects:
if tokenized_obj.word_stem == 'さくらまな':
is_ok = True
assert is_ok
if __name__ == '__main__':
unittest.main()
================================================
FILE: travis-mecab-install.sh
================================================
#!/bin/bash
# Build and install MeCab 0.996 together with the IPA and JUMAN dictionaries
# from source, then remove the extracted source trees.
#
# from https://gist.github.com/dtan4/351d031bec0c3d45cd8f
# see also http://qiita.com/dtan4/items/c6a087666296fbd5fffb

# Stop at the first failing step (download, build or install) instead of
# carrying on with a half-installed toolchain; also reject unset variables.
set -eu

base_dir=$(pwd)

# --- MeCab itself ---
wget -O mecab-0.996.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7cENtOXlicTFaRUE'
tar zxfv mecab-0.996.tar.gz
cd mecab-0.996
./configure --enable-utf8-only
make
make check
sudo make install
sudo ldconfig

# --- IPA dictionary ---
cd "$base_dir"
wget -O mecab-ipadic-2.7.0-20070801.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7MWVlSDBCSXZMTXM'
tar zxfv mecab-ipadic-2.7.0-20070801.tar.gz
cd mecab-ipadic-2.7.0-20070801
./configure --with-charset=utf8
make
sudo make install
sudo ldconfig

# --- JUMAN dictionary ---
# Go back to the starting directory first; otherwise the archive is
# downloaded and unpacked inside the ipadic source tree.
cd "$base_dir"
wget -O jumandic.tar.gz 'https://drive.google.com/uc?export=download&id=0B4y35FiV1wh7X2pESGlLREpxdXM'
tar zxfv jumandic.tar.gz
cd mecab-jumandic-7.0-20130310
./configure --with-charset=utf8
make
sudo make install
sudo ldconfig

# --- Clean up the extracted source trees ---
cd "$base_dir"
rm -rf mecab-0.996 mecab-ipadic-2.7.0-20070801 mecab-jumandic-7.0-20130310
gitextract_jdhxzz3y/ ├── .gitignore ├── .travis.yml ├── JapaneseTokenizer/ │ ├── __init__.py │ ├── common/ │ │ ├── __init__.py │ │ ├── juman_utils.py │ │ ├── sever_handler.py │ │ ├── text_preprocess.py │ │ └── timeout_handler.py │ ├── datamodels.py │ ├── init_logger.py │ ├── juman_wrapper/ │ │ ├── __init__.py │ │ └── juman_wrapper.py │ ├── jumanpp_wrapper/ │ │ ├── __init__.py │ │ └── jumanpp_wrapper.py │ ├── kytea_wrapper/ │ │ ├── __init__.py │ │ └── kytea_wrapper.py │ ├── mecab_wrapper/ │ │ ├── __init__.py │ │ └── mecab_wrapper.py │ └── object_models.py ├── LICENSE.txt ├── MANIFEST.in ├── Makefile ├── README.md ├── examples/ │ ├── examples.py │ ├── userdict.csv │ └── userdict.dict ├── install_tokenizers.sh ├── setup.py ├── test/ │ ├── Dockerfile │ ├── Dockerfile-dev │ ├── __init__.py │ ├── common/ │ │ ├── __init__.py │ │ └── test_server_handler.py │ ├── docker-compose-dev.yml │ ├── docker-compose.yml │ ├── requirements_py2.txt │ ├── requirements_py3.txt │ ├── resources/ │ │ └── test/ │ │ ├── userdict.csv │ │ └── userdict.dict │ ├── test_all.py │ ├── test_filter_python2.py │ ├── test_filter_python3.py │ ├── test_juman_wrapper_python2.py │ ├── test_juman_wrapper_python3.py │ ├── test_jumanpp_wrapper_python2.py │ ├── test_jumanpp_wrapper_python3.py │ ├── test_kytea_wrapper_python2.py │ ├── test_kytea_wrapper_python3.py │ ├── test_mecab_wrapper_python2.py │ └── test_mecab_wrapper_python3.py └── travis-mecab-install.sh
SYMBOL INDEX (160 symbols across 24 files)
FILE: JapaneseTokenizer/common/juman_utils.py
function extract_morphological_information (line 9) | def extract_morphological_information(mrph_object, is_feature, is_surface):
function feature_parser (line 42) | def feature_parser(uni_feature, word_surface):
FILE: JapaneseTokenizer/common/sever_handler.py
class ProcessDownException (line 22) | class ProcessDownException(Exception):
class UnixProcessHandler (line 26) | class UnixProcessHandler(object):
method __init__ (line 27) | def __init__(self,
method __del__ (line 40) | def __del__(self):
method launch_process (line 44) | def launch_process(self, command):
method restart_process (line 69) | def restart_process(self):
method stop_process (line 80) | def stop_process(self):
method __query (line 92) | def __query(self, input_string):
method __notify_handler (line 114) | def __notify_handler(self, signum, frame):
method query (line 119) | def query(self, input_string):
class JumanppHnadler (line 124) | class JumanppHnadler(UnixProcessHandler):
method __init__ (line 126) | def __init__(self,
method launch_jumanpp_process (line 134) | def launch_jumanpp_process(self, command):
FILE: JapaneseTokenizer/common/text_preprocess.py
function u (line 17) | def u(str): return str.decode("utf-8")
function b (line 18) | def b(str): return str
function u (line 21) | def u(str): return str
function b (line 22) | def b(str): return str.encode("utf-8")
function denormalize_text (line 35) | def denormalize_text(input_text):
function normalize_text (line 50) | def normalize_text(input_text,
function normalize_text_normal_ipadic (line 77) | def normalize_text_normal_ipadic(input_text, kana=True, ascii=True, digi...
FILE: JapaneseTokenizer/common/timeout_handler.py
class TimeoutException (line 5) | class TimeoutException(Exception):
function handler_func (line 9) | def handler_func(msg):
function on_timeout (line 13) | def on_timeout(limit, handler=handler_func, hint=None):
FILE: JapaneseTokenizer/datamodels.py
function __is_sotpwords (line 16) | def __is_sotpwords(token, stopwords):
function __is_valid_pos (line 25) | def __is_valid_pos(pos_tuple, valid_pos):
function filter_words (line 46) | def filter_words(tokenized_obj, valid_pos, stopwords, check_field_name='...
class TokenizedResult (line 94) | class TokenizedResult(object):
method __init__ (line 95) | def __init__(self,
class TokenizedSenetence (line 127) | class TokenizedSenetence(object):
method __init__ (line 128) | def __init__(self, sentence, tokenized_objects, string_encoding='utf-8'):
method __extend_token_object (line 143) | def __extend_token_object(self, token_object,
method convert_list_object (line 176) | def convert_list_object(self,
method __convert_string_type (line 195) | def __convert_string_type(self, p_c_tuple):
method __check_pos_condition (line 217) | def __check_pos_condition(self, pos_condistion):
method filter (line 227) | def filter(self,
class FilteredObject (line 287) | class FilteredObject(TokenizedSenetence):
method __init__ (line 288) | def __init__(self, sentence, tokenized_objects, pos_condition, stopwor...
FILE: JapaneseTokenizer/init_logger.py
function init_logger (line 23) | def init_logger(logger):
FILE: JapaneseTokenizer/juman_wrapper/juman_wrapper.py
class MonkeyPatchSocket (line 31) | class MonkeyPatchSocket(object):
method __init__ (line 33) | def __init__(self, hostname, port, option=None):
method __del__ (line 46) | def __del__(self):
method query (line 50) | def query(self, sentence, pattern):
class JumanWrapper (line 66) | class JumanWrapper(WrapperBase):
method __init__ (line 67) | def __init__(self,
method __del__ (line 124) | def __del__(self):
method __monkey_patch_juman_lines (line 129) | def __monkey_patch_juman_lines(self, input_str):
method __extract_morphological_information (line 147) | def __extract_morphological_information(self, mrph_object, is_feature,...
method call_juman_interface (line 178) | def call_juman_interface(self, input_str):
method tokenize (line 194) | def tokenize(self,
method filter (line 232) | def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
FILE: JapaneseTokenizer/jumanpp_wrapper/jumanpp_wrapper.py
class JumanppClient (line 34) | class JumanppClient(object):
method __init__ (line 36) | def __init__(self, hostname, port, timeout=50, option=None):
method __del__ (line 50) | def __del__(self):
method query (line 53) | def query(self, sentence, pattern):
class JumanppClient (line 67) | class JumanppClient(object):
method __init__ (line 69) | def __init__(self, hostname, port, timeout=50, option=None):
method __del__ (line 85) | def __del__(self):
method query (line 89) | def query(self, sentence, pattern):
class JumanppWrapper (line 104) | class JumanppWrapper(WrapperBase):
method __init__ (line 107) | def __init__(self,
method __del__ (line 172) | def __del__(self):
method call_juman_interface (line 183) | def call_juman_interface(self, input_str):
method tokenize (line 220) | def tokenize(self, sentence,
method filter (line 256) | def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
FILE: JapaneseTokenizer/kytea_wrapper/kytea_wrapper.py
class KyteaWrapper (line 24) | class KyteaWrapper(WrapperBase):
method __init__ (line 25) | def __init__(self,
method __list_tags (line 32) | def __list_tags(self, t):
method __check_char_set (line 36) | def __check_char_set(self, input_char):
method __extract_morphological_information (line 45) | def __extract_morphological_information(self, kytea_tags_tuple, is_fea...
method call_kytea_tokenize_api (line 88) | def call_kytea_tokenize_api(self, sentence):
method tokenize (line 96) | def tokenize(self, sentence,
method filter (line 135) | def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
FILE: JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py
class MecabWrapper (line 32) | class MecabWrapper(WrapperBase):
method __init__ (line 33) | def __init__(self,
method __get_path_to_mecab_config (line 79) | def __get_path_to_mecab_config(self):
method __check_mecab_dict_path (line 92) | def __check_mecab_dict_path(self):
method __check_mecab_libexe (line 113) | def __check_mecab_libexe(self):
method __CallMecab (line 132) | def __CallMecab(self):
method __CompileUserdict (line 175) | def __CompileUserdict(self):
method __feature_parser (line 195) | def __feature_parser(self, uni_feature, word_surface):
method __postprocess_analyzed_result (line 219) | def __postprocess_analyzed_result(self, string_mecab_parsed_result, is...
method __result_parser (line 239) | def __result_parser(self, analyzed_line, is_feature, is_surface):
method tokenize (line 261) | def tokenize(self, sentence,
method filter (line 335) | def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
FILE: JapaneseTokenizer/object_models.py
class WrapperBase (line 5) | class WrapperBase(object):
method tokenize (line 6) | def tokenize(self,
method filter (line 17) | def filter(self, parsed_sentence, pos_condition=None, stopwords=None):
FILE: examples/examples.py
function basic_example (line 20) | def basic_example():
function filtering_example (line 53) | def filtering_example():
function advanced_example_mecab (line 91) | def advanced_example_mecab():
function advanced_example_juman (line 124) | def advanced_example_juman():
FILE: test/common/test_server_handler.py
class TestServerHandler (line 19) | class TestServerHandler(unittest.TestCase):
method setUpClass (line 21) | def setUpClass(cls):
method test_jumanpp_process_hanlder_normal (line 30) | def test_jumanpp_process_hanlder_normal(self):
method test_jumanpp_process_handler_timeout_exception (line 41) | def test_jumanpp_process_handler_timeout_exception(self):
method test_jumanpp_process_handler_init_exception (line 50) | def test_jumanpp_process_handler_init_exception(self):
method test_jumanpp_process_handler_huge_request (line 56) | def test_jumanpp_process_handler_huge_request(self):
FILE: test/test_all.py
function suite (line 9) | def suite():
function suite_with_jumanpp (line 33) | def suite_with_jumanpp():
FILE: test/test_filter_python2.py
class TestFilter (line 10) | class TestFilter(unittest.TestCase):
method setUp (line 11) | def setUp(self):
method test_filtering (line 18) | def test_filtering(self):
FILE: test/test_filter_python3.py
class TestFilter (line 10) | class TestFilter(unittest.TestCase):
method setUp (line 11) | def setUp(self):
method test_filtering (line 18) | def test_filtering(self):
FILE: test/test_juman_wrapper_python2.py
class TestJumanWrapperPython2 (line 20) | class TestJumanWrapperPython2(unittest.TestCase):
method setUp (line 21) | def setUp(self):
method test_juman_wrapper (line 24) | def test_juman_wrapper(self):
method test_tokenize (line 39) | def test_tokenize(self):
method test_filter_pos (line 76) | def test_filter_pos(self):
method test_stopwords (line 118) | def test_stopwords(self):
method test_juman_server_mode (line 144) | def test_juman_server_mode(self):
FILE: test/test_juman_wrapper_python3.py
class TestJumanWrapperPython3 (line 16) | class TestJumanWrapperPython3(unittest.TestCase):
method setUp (line 17) | def setUp(self):
method test_juman_wrapper (line 22) | def test_juman_wrapper(self):
method test_tokenize (line 35) | def test_tokenize(self):
method test_filter_pos (line 71) | def test_filter_pos(self):
method test_stopwords (line 112) | def test_stopwords(self):
method test_juman_severmode (line 139) | def test_juman_severmode(self):
FILE: test/test_jumanpp_wrapper_python2.py
class TestJumanppWrapperPython2 (line 17) | class TestJumanppWrapperPython2(unittest.TestCase):
method setUp (line 18) | def setUp(self):
method test_JumanppClient (line 23) | def test_JumanppClient(self):
method test_jumanpp_servermode (line 38) | def test_jumanpp_servermode(self):
method test_jumanpp_servermode_stress (line 64) | def test_jumanpp_servermode_stress(self):
method test_jumanpp_localmode_pyexpect (line 84) | def test_jumanpp_localmode_pyexpect(self):
method test_jumanpp_huge_amount_text (line 96) | def test_jumanpp_huge_amount_text(self):
FILE: test/test_jumanpp_wrapper_python3.py
class TestJumanppWrapperPython3 (line 17) | class TestJumanppWrapperPython3(unittest.TestCase):
method setUp (line 18) | def setUp(self):
method test_JumanppClient (line 23) | def test_JumanppClient(self):
method test_jumanpp_servermode (line 40) | def test_jumanpp_servermode(self):
method test_jumanpp_servermode_stress (line 69) | def test_jumanpp_servermode_stress(self):
method test_jumanpp_localmode_pyexpect (line 89) | def test_jumanpp_localmode_pyexpect(self):
method test_jumanpp_huge_amount_text (line 102) | def test_jumanpp_huge_amount_text(self):
FILE: test/test_kytea_wrapper_python2.py
class TestKyteaWrapperPython2 (line 6) | class TestKyteaWrapperPython2(unittest.TestCase):
method setUp (line 8) | def setUp(self):
method test_tokenization (line 11) | def test_tokenization(self):
method test_filter_pos (line 30) | def test_filter_pos(self):
method test_stopwords (line 67) | def test_stopwords(self):
FILE: test/test_kytea_wrapper_python3.py
class TestKyteaWrapperPython3 (line 6) | class TestKyteaWrapperPython3(unittest.TestCase):
method setUp (line 8) | def setUp(self):
method test_tokenization (line 11) | def test_tokenization(self):
method test_filter_pos (line 30) | def test_filter_pos(self):
method test_stopwords (line 65) | def test_stopwords(self):
FILE: test/test_mecab_wrapper_python2.py
class TestMecabWrapperPython2 (line 13) | class TestMecabWrapperPython2(unittest.TestCase):
method setUp (line 14) | def setUp(self):
method test_neologd_parse (line 19) | def test_neologd_parse(self):
method test_default_parse (line 34) | def test_default_parse(self):
method test_init_userdict (line 50) | def test_init_userdict(self):
method test_parse_jumandic (line 63) | def test_parse_jumandic(self):
method test_init_alldict (line 68) | def test_init_alldict(self):
FILE: test/test_mecab_wrapper_python3.py
class TestMecabWrapperPython3 (line 12) | class TestMecabWrapperPython3(unittest.TestCase):
method setUp (line 13) | def setUp(self):
method test_neologd_parse (line 18) | def test_neologd_parse(self):
method test_default_parse (line 31) | def test_default_parse(self):
method test_parse_jumandic (line 47) | def test_parse_jumandic(self):
method test_parse_userdic (line 58) | def test_parse_userdic(self):
method test_parse_dictionary_path (line 61) | def test_parse_dictionary_path(self):
method test_init_userdict (line 70) | def test_init_userdict(self):
Condensed preview — 51 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (156K chars).
[
{
"path": ".gitignore",
"chars": 166,
"preview": ".idea/\nJapaneseTokenizer.egg-info/\nbuild/\ndist/\n*eggs/\npyknp.egg-info/\n.python-version\n*pyc\nmorphogySplitters/\nMykytea-p"
},
{
"path": ".travis.yml",
"chars": 1209,
"preview": "language: python\npython:\n - 2.7\n - 3.5\naddons:\n apt:\n packages:\n - git\n - make\n - curl\n - xz-utils\n "
},
{
"path": "JapaneseTokenizer/__init__.py",
"chars": 347,
"preview": "from JapaneseTokenizer.mecab_wrapper import MecabWrapper\nfrom JapaneseTokenizer.juman_wrapper import JumanWrapper\nfrom J"
},
{
"path": "JapaneseTokenizer/common/__init__.py",
"chars": 26,
"preview": "__author__ = 'kensuke-mi'\n"
},
{
"path": "JapaneseTokenizer/common/juman_utils.py",
"chars": 1963,
"preview": "from JapaneseTokenizer.datamodels import TokenizedResult, TokenizedSenetence\nfrom typing import Tuple\nimport pyknp\nfrom "
},
{
"path": "JapaneseTokenizer/common/sever_handler.py",
"chars": 4619,
"preview": "#! -*- coding: utf-8 -*-\nimport subprocess\nfrom subprocess import Popen, PIPE, STDOUT\nimport multiprocessing\n# socket ob"
},
{
"path": "JapaneseTokenizer/common/text_preprocess.py",
"chars": 2724,
"preview": "# -*- coding: utf-8 -*-\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ imp"
},
{
"path": "JapaneseTokenizer/common/timeout_handler.py",
"chars": 840,
"preview": "#! -*- coding: utf-8 -*-\nfrom functools import wraps\n\n\nclass TimeoutException(Exception):\n pass\n\n\ndef handler_func(ms"
},
{
"path": "JapaneseTokenizer/datamodels.py",
"chars": 11609,
"preview": "#! -*- coding: utf-8 -*-\n# normalize module #\nfrom JapaneseTokenizer.common.text_preprocess import normalize_text, denor"
},
{
"path": "JapaneseTokenizer/init_logger.py",
"chars": 666,
"preview": "LOGGER_NAME = 'JapaneseTokenizer'\n\nimport logging\nimport sys\nfrom logging import getLogger, Formatter, Logger, StreamHan"
},
{
"path": "JapaneseTokenizer/juman_wrapper/__init__.py",
"chars": 66,
"preview": "__author__ = 'kensuke-mi'\nfrom .juman_wrapper import JumanWrapper\n"
},
{
"path": "JapaneseTokenizer/juman_wrapper/juman_wrapper.py",
"chars": 9338,
"preview": "# -*- coding: utf-8 -*-\n# package module\nfrom JapaneseTokenizer.object_models import WrapperBase\nfrom JapaneseTokenizer."
},
{
"path": "JapaneseTokenizer/jumanpp_wrapper/__init__.py",
"chars": 43,
"preview": "from .jumanpp_wrapper import JumanppWrapper"
},
{
"path": "JapaneseTokenizer/jumanpp_wrapper/jumanpp_wrapper.py",
"chars": 10558,
"preview": "#! -*- coding: utf-8 -*-\nfrom pyknp import Juman\nfrom pyknp import MList\n# modules\nfrom JapaneseTokenizer.object_models "
},
{
"path": "JapaneseTokenizer/kytea_wrapper/__init__.py",
"chars": 65,
"preview": "__author__ = 'kensuke-mi'\nfrom .kytea_wrapper import KyteaWrapper"
},
{
"path": "JapaneseTokenizer/kytea_wrapper/kytea_wrapper.py",
"chars": 4948,
"preview": "# -*- coding: utf-8 -*-\nfrom JapaneseTokenizer.object_models import WrapperBase\nfrom JapaneseTokenizer.common import tex"
},
{
"path": "JapaneseTokenizer/mecab_wrapper/__init__.py",
"chars": 66,
"preview": "__author__ = 'kensuke-mi'\nfrom .mecab_wrapper import MecabWrapper\n"
},
{
"path": "JapaneseTokenizer/mecab_wrapper/mecab_wrapper.py",
"chars": 15346,
"preview": "#! -*- coding: utf-8 -*-\n# core module\nfrom JapaneseTokenizer.object_models import WrapperBase\nfrom JapaneseTokenizer.co"
},
{
"path": "JapaneseTokenizer/object_models.py",
"chars": 571,
"preview": "#! -*- coding: utf-8 -*-\nfrom typing import Callable\nfrom six import text_type\n\nclass WrapperBase(object):\n def token"
},
{
"path": "LICENSE.txt",
"chars": 1056,
"preview": "Copyright 2017 Kensuke Mitsuzawa\n\nPermission is hereby granted, free of charge, to any person obtaining a copy of this s"
},
{
"path": "MANIFEST.in",
"chars": 136,
"preview": "include README.md\ninclude README_JP.md\ninclude examples\ninclude test\ninclude install_tokenizers.sh\ninclude LICENSE.txt\ni"
},
{
"path": "Makefile",
"chars": 389,
"preview": "install:\n\tbash install_tokenizers.sh\n\ninstall_neologd:\n\t## mecab-neologdのインストールを実行\n\twget --no-check-certificate https://"
},
{
"path": "README.md",
"chars": 6179,
"preview": "[](LICENSE)[:\n su"
},
{
"path": "test/test_filter_python2.py",
"chars": 1708,
"preview": "#! -*- coding: utf-8 -*-\nimport sys\nimport unittest\nfrom JapaneseTokenizer.mecab_wrapper import MecabWrapper\nfrom Japane"
},
{
"path": "test/test_filter_python3.py",
"chars": 1798,
"preview": "#! -*- coding: utf-8 -*-\nimport sys\nimport unittest\nfrom JapaneseTokenizer.mecab_wrapper import MecabWrapper\nfrom Japane"
},
{
"path": "test/test_juman_wrapper_python2.py",
"chars": 6857,
"preview": "#-*- encoding: utf-8 -*-\n# this test file does not work under pycharm\n# do your test with command line\nfrom __future__ i"
},
{
"path": "test/test_juman_wrapper_python3.py",
"chars": 7379,
"preview": "#-*- encoding: utf-8 -*-\n# this test file does not work under pycharm\n# do your test with command line\nfrom pyknp import"
},
{
"path": "test/test_jumanpp_wrapper_python2.py",
"chars": 4998,
"preview": "#-*- encoding: utf-8 -*-\n# this test file does not work under pycharm\n# do your test with command line\nfrom pyknp import"
},
{
"path": "test/test_jumanpp_wrapper_python3.py",
"chars": 5187,
"preview": "#-*- encoding: utf-8 -*-\n# this test file does not work under pycharm\n# do your test with command line\nfrom pyknp import"
},
{
"path": "test/test_kytea_wrapper_python2.py",
"chars": 3516,
"preview": "# -*- coding: utf-8 -*-\nfrom JapaneseTokenizer.kytea_wrapper import KyteaWrapper\nfrom JapaneseTokenizer.datamodels impor"
},
{
"path": "test/test_kytea_wrapper_python3.py",
"chars": 3423,
"preview": "# -*- coding: utf-8 -*-\nfrom JapaneseTokenizer.kytea_wrapper import KyteaWrapper\nfrom JapaneseTokenizer.datamodels impor"
},
{
"path": "test/test_mecab_wrapper_python2.py",
"chars": 2948,
"preview": "#! -*- coding: utf-8 -*-\n__author__ = 'kensuke-mi'\n\nimport sys\nimport unittest\nfrom JapaneseTokenizer.mecab_wrapper.meca"
},
{
"path": "test/test_mecab_wrapper_python3.py",
"chars": 3602,
"preview": "#! -*- coding: utf-8 -*-\n__author__ = 'kensuke-mi'\n\nimport sys\nimport unittest\nfrom JapaneseTokenizer.mecab_wrapper.meca"
},
{
"path": "travis-mecab-install.sh",
"chars": 971,
"preview": "#!/bin/bash\n# from https://gist.github.com/dtan4/351d031bec0c3d45cd8f\n# see also http://qiita.com/dtan4/items/c6a0876662"
}
]
// ... and 2 more files (download for full content)
About this extraction
This page contains the full source code of the Kensuke-Mitsuzawa/JapaneseTokenizers GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 51 files (140.3 KB), approximately 36.7k tokens, and a symbol index with 160 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.