Repository: Moonshile/ChineseWordSegmentation Branch: master Commit: 9203eb1f572f Files: 10 Total size: 19.6 KB Directory structure: gitextract_av6qteg7/ ├── .gitignore ├── LICENSE ├── README.md ├── setup.py └── wordseg/ ├── __init__.py ├── freqitem.py ├── hashtree.py ├── probability.py ├── sequence.py └── wordseg.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] # C extensions *.so # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *,cover # Translations *.mo *.pot # Django stuff: *.log # Sphinx documentation docs/_build/ # PyBuilder target/ ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) 2015 Kaiqiang Dawn Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # ChineseWordSegmentation Chinese word segmentation algorithm without corpus ## Usage ``` from wordseg import WordSegment doc = u'十四是十四四十是四十,十四不是四十,四十不是十四' ws = WordSegment(doc, max_word_len=2, min_aggregation=1, min_entropy=0.5) ws.segSentence(doc) ``` This will generate words `十四 是 十四 四十 是 四十 , 十四 不是 四十 , 四十 不是 十四` In fact, `doc` should be a long enough document string for better results. In that condition, the min_aggregation should be set far greater than 1, such as 50, and min_entropy should also be set greater than 0.5, such as 1.5. Besides, both input and output of this function should be decoded as unicode. `WordSegment.segSentence` has an optional argument `method`, with values `WordSegment.L`, `WordSegment.S` and `WordSegment.ALL`, means + `WordSegment.L`: if a long word that is combinations of several shorter words found, given only the long word. + `WordSegment.S`: given the several shorter words. + `WordSegment.ALL`: given both the long and the shorters. ## Reference Thanks Matrix67's [article](http://www.matrix67.com/blog/archives/5044) ================================================ FILE: setup.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import absolute_import from __future__ import print_function from __future__ import unicode_literals import pkg_resources from setuptools import setup, find_packages import os import codecs import re import sys def read(*parts): path = os.path.join(os.path.dirname(__file__), *parts) with codecs.open(path, encoding='utf-8') as fobj: return fobj.read() def find_version(*file_paths): version_file = read(*file_paths) version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M) if version_match: return version_match.group(1) raise RuntimeError("Unable to find version string.") setup( name='wordseg', version=find_version("wordseg", "__init__.py"), description='Chinese word segmentation algorithm without corpus', author='段凯强', author_email='', license='MIT', keywords='NLP,tokenizing,Chinese word segementation', url='https://github.com/bung87/ChineseWordSegmentation', packages = find_packages(), package_dir={'wordseg': 'wordseg'}, classifiers=[ 'Intended Audience :: Developers', 'License :: OSI Approved :: MIT License', 'Operating System :: OS Independent', 'Natural Language :: Chinese (Simplified)', 'Natural Language :: Chinese (Traditional)', 'Programming Language :: Python', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Topic :: Text Processing', 'Topic :: Text Processing :: Indexing', 'Topic :: Text Processing :: Linguistic', ] ) ================================================ FILE: wordseg/__init__.py ================================================ __version__ = '0.1.0' from . import probability from . import sequence from . wordseg import WordSegment ================================================ FILE: wordseg/freqitem.py ================================================ #coding=utf-8 """ A simple frequent itemset mining algorithm implementation Author: 段凯强 """ import itertools from wordseg.sequence import dedup from wordseg.hashtree import HashTree, sameNodes from functools import reduce class FreqItem(object): def __init__(self, transactions, sup_theta=.1): self.transactions = [sorted(t) for t in [x for x in transactions if x]] self.sup_theta = sup_theta*len(transactions) self.freqset = [] def filterCandidates(self, cand): """ Build a HashTree with candidates cand, then count support of these candidates to filter out all those that have support not lower than sup_theta """ hashtree = HashTree(cand) hashtree.count(self.transactions) return hashtree.getNodes(self.sup_theta) def freqOneSet(self): """ Generate frequent 1-item sets """ one_item_cand = set() for t in self.transactions: for w in t: one_item_cand.add(w) return sorted(self.filterCandidates([[i] for i in one_item_cand]), key=lambda i: i[0].name) def genNextCand(self, preItems): """ Generate next candidates by dynamic programming Find range [i, j) such that items in this range have same prefix e.g., [1,2,3,4] and [1,2,3,5] have same prefix, so they should be in one same range Then, generate 2-combinations of these ranges as result """ res = [] i, j = 0, 0 while i < len(preItems): if j < len(preItems) and sameNodes(preItems[j][:-1], preItems[i][:-1]): j += 1 else: res += [pair[0] + [pair[1][-1]] for pair in itertools.combinations(preItems[i:j], 2)] i = j return [[i.name for i in items] for items in res] def genFreqItemSets(self): """ @return Frequent item sets with their frequency """ if self.freqset: return self.freqset cur = self.freqOneSet() freqKSet = [] while cur: freqKSet.append(cur) cur = self.filterCandidates(self.genNextCand(cur)) self.freqset = reduce(lambda res, x: res + x, freqKSet, []) name_freq_pairs = [[(i.name, i.val) for i in items] for items in self.freqset[::-1]] res = [list(zip(*items)) for items in name_freq_pairs] return [(list(pair[0]), pair[1][-1]) for pair in res] if __name__ == '__main__': transactions = [[1,2,3],[1,2,4],[2,4,6,8],[1,3,5,7], [5,7,2]] freqItem = FreqItem(transactions, sup_theta=.3) print(freqItem.genFreqItemSets()) ================================================ FILE: wordseg/hashtree.py ================================================ #coding=utf-8 """ A simple implementation of Hash Tree Author: 段凯强 """ from functools import reduce class HashTreeNode(object): def __init__(self, name=''): self.val = 0 self.name = name self.level = 0 self.children = {} def addBag(self, bag): """ Note that bag must be sorted """ if bag: node = self.children.get(bag[0], HashTreeNode(name=bag[0])) node.addBag(bag[1:]) self.children[bag[0]] = node self.level = len(bag) def count(self, transaction): """ count the child who matches bag, suppose that current node matches """ if self.level == 0: self.val += 1 elif self.level == 1: for t in transaction: if t in self.children: self.children[t].val += 1 else: for i in range(0, len(transaction)): t = transaction[i] if t in self.children: self.children[t].count(transaction[i:]) def get(self, theta): return [[c.name for c in items] for items in self.getNodes(theta)] """ if self.level == 0: return [[self.name]] if self.val >= theta else None else: children_res = [self.children[i].get(theta) for i in sorted(self.children.keys())] total = reduce(lambda res, x: res + x, filter(lambda x: x, children_res), []) return map(lambda c: [self.name] + c, total) """ def getNodes(self, theta): if self.level == 0: return [[self]] if self.val >= theta else None else: children_res = [self.children[i].getNodes(theta) for i in sorted(self.children.keys())] total = reduce(lambda res, x: res + x, [x for x in children_res if x], []) return [[self] + c for c in total] def __str__(self): return '(%s : %s)'%(self.name, '; '.join([str(i) for i in list(self.children.values())])) def sameNode(node1, node2): return node1.name == node2.name def sameNodes(nodes1, nodes2): func = lambda n: n.name return list(map(func, nodes1)) == list(map(func, nodes2)) class HashTree(object): """ Note that all bags must be sorted """ def __init__(self, bags): self.root = HashTreeNode() self.root.val = 0 for b in bags: if b: self.root.addBag(b) def count(self, transactions): for t in transactions: self.root.count(t) def get(self, theta): res = [c[1:] for c in self.root.get(theta)] return [] if res == [[]] else res def getNodes(self, theta): res = [c[1:] for c in self.root.getNodes(theta)] return [] if res == [[]] else res def __str__(self): return str(self.root) if __name__ == '__main__': to_count = [[1,2], [2,4], [1,3], [1,5], [3,4], [2,7], [6,8]] tree = HashTree(to_count) transactions = [[1,2,3],[1,2,4],[2,4,6,8],[1,3,5,7]] tree.count(transactions) print('Frequency with transactions', transactions) print(tree.get(2)) print(tree.get(1)) ================================================ FILE: wordseg/probability.py ================================================ #coding=utf-8 """ Algorithms about probability Author: 段凯强 """ import math def entropyOfList(ls): """ Given a list of some items, compute entropy of the list The entropy is sum of -p[i]*log(p[i]) for every unique element i in the list, and p[i] is its frequency """ elements = {} for e in ls: elements[e] = elements.get(e, 0) + 1 length = float(len(ls)) # if length is 0, which means one side of a word is empty, which is determinated, so entropy should be 0 return length and sum([-v/length*math.log(v/length) for v in list(elements.values())]) ================================================ FILE: wordseg/sequence.py ================================================ #coding=utf-8 """ Algorithms for sequences Author: 段凯强 """ def dedup(ls): """ deduplicate the given SORTED list """ i, j = 0, 0 while j < len(ls): if ls[j] == ls[i]: j += 1 else: i += 1 ls[i] = ls[j] return ls[:i + 1] def genSubstr(string, n): """ Generate all substrings of max length n for string """ length = len(string) res = [] for i in range(0, length): for j in range(i + 1, min(i + n + 1, length + 1)): res.append(string[i: j]) return res def genSubparts(string): """ Partition a string into all possible two parts, e.g. given "abcd", generate [("a", "bcd"), ("ab", "cd"), ("abc", "d")] For string of length 1, return empty list """ length = len(string) res = [] for i in range(1, length): res.append((string[0:i], string[i:])) return res def longestSubsequenceLength(s1, s2): n = len(s2) + 1 cur = [0]*n next = [0]*n tmp = None for i in s1: for j in range(0, n): if j == 0: next[j] = 0 else: next[j] = cur[j - 1] + 1 if i == s2[j - 1] else max(next[j - 1], cur[j]) tmp = next next = cur cur = tmp return cur[n - 1] def longestSubsequence(s1, s2): n = len(s2) + 1 cur = [0]*n next = [0]*n tmp = None __NONE, __UP, __LEFT, __NEW = 0, 1, 2, 3 orientation = [[__NONE]*n] for i in s1: ori = [] for j in range(0, n): if j == 0: next[j] = 0 ori.append(__NONE) else: next[j] = cur[j - 1] + 1 if i == s2[j - 1] else max(next[j - 1], cur[j]) ori.append(__NEW if i == s2[j - 1] else (__LEFT if next[j - 1] > cur [j] else __UP)) orientation.append(ori) tmp = next next = cur cur = tmp i, j, res = len(s1), n - 1, '' ori = orientation[i][j] while ori != __NONE: if ori == __UP: i -= 1 elif ori == __LEFT: j -= 1 elif ori == __NEW: i -= 1 j -= 1 res += s2[j] ori = orientation[i][j] return res[::-1] ================================================ FILE: wordseg/wordseg.py ================================================ #coding=utf-8 """ Chinese word segmentation algorithm without corpus Author: 段凯强 Reference: http://www.matrix67.com/blog/archives/5044 """ import re from . probability import entropyOfList from . sequence import genSubparts, genSubstr def indexOfSortedSuffix(doc, max_word_len): """ Treat a suffix as an index where the suffix begins. Then sort these indexes by the suffixes. """ indexes = [] length = len(doc) for i in range(0, length): for j in range(i + 1, min(i + 1 + max_word_len, length + 1)): indexes.append((i, j)) return sorted(indexes, key=lambda i_j: doc[i_j[0]:i_j[1]]) class WordInfo(object): """ Store information of each word, including its freqency, left neighbors and right neighbors """ def __init__(self, text): super(WordInfo, self).__init__() self.text = text self.freq = 0.0 self.left = [] self.right = [] self.aggregation = 0 def update(self, left, right): """ Increase frequency of this word, then append left/right neighbors @param left a single character on the left side of this word @param right as left is, but on the right side """ self.freq += 1 if left: self.left.append(left) if right: self.right.append(right) def compute(self, length): """ Compute frequency and entropy of this word @param length length of the document for training to get words """ self.freq /= length self.left = entropyOfList(self.left) self.right = entropyOfList(self.right) def computeAggregation(self, words_dict): """ Compute aggregation of this word @param words_dict frequency dict of all candidate words """ parts = genSubparts(self.text) if len(parts) > 0: self.aggregation = min([self.freq/words_dict[p1_p2[0]].freq/words_dict[p1_p2[1]].freq for p1_p2 in parts]) class WordSegment(object): """ Main class for Chinese word segmentation 1. Generate words from a long enough document 2. Do the segmentation work with the document """ # if a word is combination of other shorter words, then treat it as a long word L = 0 # if a word is combination of other shorter words, then treat it as the set of shortest words S = 1 # if a word contains other shorter words, then return all possible results ALL = 2 def __init__(self, doc, max_word_len=5, min_freq=0.00005, min_entropy=2.0, min_aggregation=50): super(WordSegment, self).__init__() self.max_word_len = max_word_len self.min_freq = min_freq self.min_entropy = min_entropy self.min_aggregation = min_aggregation self.word_infos = self.genWords(doc) # Result infomations, i.e., average data of all words word_count = float(len(self.word_infos)) self.avg_len = sum([len(w.text) for w in self.word_infos])/word_count self.avg_freq = sum([w.freq for w in self.word_infos])/word_count self.avg_left_entropy = sum([w.left for w in self.word_infos])/word_count self.avg_right_entropy = sum([w.right for w in self.word_infos])/word_count self.avg_aggregation = sum([w.aggregation for w in self.word_infos])/word_count # Filter out the results satisfy all the requirements filter_func = lambda v: len(v.text) > 1 and v.aggregation > self.min_aggregation and\ v.freq > self.min_freq and v.left > self.min_entropy and v.right > self.min_entropy self.word_with_freq = [(w.text, w.freq) for w in list(filter(filter_func, self.word_infos))] self.words = [w[0] for w in self.word_with_freq] def genWords(self, doc): """ Generate all candidate words with their frequency/entropy/aggregation informations @param doc the document used for words generation """ pattern = re.compile('[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z,。《》、?:;“”‘’{}【】()…¥!—┄-]+') doc = re.sub(pattern, ' ', doc) suffix_indexes = indexOfSortedSuffix(doc, self.max_word_len) word_cands = {} # compute frequency and neighbors for suf in suffix_indexes: word = doc[suf[0]:suf[1]] if word not in word_cands: word_cands[word] = WordInfo(word) word_cands[word].update(doc[suf[0] - 1:suf[0]], doc[suf[1]:suf[1] + 1]) # compute probability and entropy length = len(doc) for k in word_cands: word_cands[k].compute(length) # compute aggregation of words whose length > 1 values = sorted(list(word_cands.values()), key=lambda x: len(x.text)) for v in values: if len(v.text) == 1: continue v.computeAggregation(word_cands) return sorted(values, key=lambda v: v.freq, reverse=True) def segSentence(self, sentence, method=ALL): """ Segment a sentence with the words generated from a document @param sentence the sentence to be handled @param method segmentation method """ i = 0 res = [] while i < len(sentence): if method == self.L or method == self.S: j_range = list(range(self.max_word_len, 0, -1)) if method == self.L else list(range(2, self.max_word_len + 1)) + [1] for j in j_range: if j == 1 or sentence[i:i + j] in self.words: res.append(sentence[i:i + j]) i += j break else: to_inc = 1 for j in range(2, self.max_word_len + 1): if i + j <= len(sentence) and sentence[i:i + j] in self.words: res.append(sentence[i:i + j]) if to_inc == 1: to_inc = j if to_inc == 1: res.append(sentence[i]) i += to_inc return res if __name__ == '__main__': doc = '十四是十四四十是四十,,十四不是四十,,,,四十不是十四' ws = WordSegment(doc, max_word_len=2, min_aggregation=1.2, min_entropy=0.4) print(' '.join(['%s:%f'%w for w in ws.word_with_freq])) print(' '.join(ws.words)) print(' '.join(ws.segSentence(doc))) print('average len: ', ws.avg_len) print('average frequency: ', ws.avg_freq) print('average left entropy: ', ws.avg_left_entropy) print('average right entropy: ', ws.avg_right_entropy) print('average aggregation: ', ws.avg_aggregation)