Repository: Moonshile/ChineseWordSegmentation
Branch: master
Commit: 9203eb1f572f
Files: 10
Total size: 19.6 KB

Directory structure:
gitextract_av6qteg7/

├── .gitignore
├── LICENSE
├── README.md
├── setup.py
└── wordseg/
    ├── __init__.py
    ├── freqitem.py
    ├── hashtree.py
    ├── probability.py
    ├── sequence.py
    └── wordseg.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/


================================================
FILE: LICENSE
================================================
The MIT License (MIT)

Copyright (c) 2015 Kaiqiang Dawn

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# ChineseWordSegmentation
Chinese word segmentation algorithm without corpus

## Usage
```
from wordseg import WordSegment
doc = u'十四是十四四十是四十，十四不是四十，四十不是十四'
ws = WordSegment(doc, max_word_len=2, min_aggregation=1, min_entropy=0.5)
ws.segSentence(doc)
```

This will generate words

`十四 是 十四 四十 是 四十 ， 十四 不是 四十 ， 四十 不是 十四`

In practice, `doc` should be a sufficiently long document string for good results. In that case, `min_aggregation` should be set far greater than 1 (e.g. 50), and `min_entropy` should also be set greater than 0.5 (e.g. 1.5).

Besides, both the input and the output of this function are expected to be unicode strings (i.e. already decoded).

`WordSegment.segSentence` has an optional argument `method`, which takes one of the values `WordSegment.L`, `WordSegment.S` and `WordSegment.ALL`, meaning:

+ `WordSegment.L`: if a long word that is a combination of several shorter words is found, give only the long word.
+ `WordSegment.S`: give the several shorter words.
+ `WordSegment.ALL`: give both the long word and the shorter ones.

## Reference

Thanks to Matrix67's [article](http://www.matrix67.com/blog/archives/5044)

================================================
FILE: setup.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import pkg_resources
from setuptools import setup, find_packages
import os
import codecs
import re
import sys

def read(*parts):
    """
    Return the UTF-8 decoded content of a file.
    The path is built by joining parts onto the directory that holds this script.
    """
    base_dir = os.path.dirname(__file__)
    target = os.path.join(base_dir, *parts)
    with codecs.open(target, encoding='utf-8') as handle:
        content = handle.read()
    return content

def find_version(*file_paths):
    """
    Extract the version string from the file located by file_paths.
    The file is loaded with read() and must contain a line that starts with
    an assignment of a quoted string to __version__.
    @return the text between the quotes
    @raise RuntimeError if no such line exists
    """
    contents = read(*file_paths)
    found = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", contents, re.M)
    if found is None:
        raise RuntimeError("Unable to find version string.")
    return found.group(1)

# Package metadata. The version is not duplicated here: it is parsed out of
# wordseg/__init__.py so that there is a single place to bump it.
setup(
    name='wordseg',
    version=find_version("wordseg", "__init__.py"),
    description='Chinese word segmentation algorithm without corpus',
    author='段凯强',
    author_email='',
    license='MIT',
    # fixed the misspelling "segementation" so that keyword searches can match
    keywords='NLP,tokenizing,Chinese word segmentation',
    url='https://github.com/bung87/ChineseWordSegmentation',
    packages = find_packages(),
    package_dir={'wordseg': 'wordseg'},
    classifiers=[
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Natural Language :: Chinese (Simplified)',
        'Natural Language :: Chinese (Traditional)',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.2',
        'Programming Language :: Python :: 3.3',
        'Programming Language :: Python :: 3.4',
        'Topic :: Text Processing',
        'Topic :: Text Processing :: Indexing',
        'Topic :: Text Processing :: Linguistic',
      ]
)


================================================
FILE: wordseg/__init__.py
================================================
# Package version string. setup.py reads it from this file with a regex,
# so keep the assignment on a single line with a quoted literal.
__version__ = '0.1.0'

from . import probability
from . import sequence
from . wordseg import WordSegment

================================================
FILE: wordseg/freqitem.py
================================================
#coding=utf-8

"""
A simple frequent itemset mining algorithm implementation
Author: 段凯强
"""

import itertools

from wordseg.sequence import dedup
from wordseg.hashtree import HashTree, sameNodes
from functools import reduce

class FreqItem(object):
    """
    Frequent itemset miner.
    Candidates are generated level by level (k-item sets from (k-1)-item sets) and
    their support is counted with a HashTree.
    """
    def __init__(self, transactions, sup_theta=.1):
        """
        @param transactions sized collection of transactions, each a collection of sortable items
        @param sup_theta minimum support, as a fraction of len(transactions)
        """
        # empty transactions are dropped; the others are sorted since HashTree needs sorted input
        self.transactions = [sorted(t) for t in transactions if t]
        # absolute support threshold; relative to ALL given transactions, empty ones included
        self.sup_theta = sup_theta*len(transactions)
        # raw mining result (lists of HashTreeNode), filled in by genFreqItemSets
        self.freqset = []

    def filterCandidates(self, cand):
        """
        Build a HashTree with candidates cand, then count support of these candidates to filter out
        all those that have support not lower than sup_theta
        @param cand list of candidate item lists, each of them sorted
        @return list of HashTreeNode lists, one per surviving candidate
        """
        # Without candidates there is nothing to count. Returning here also avoids the
        # empty-tree case in which HashTree.getNodes may fail on a root below the threshold.
        if not cand:
            return []
        hashtree = HashTree(cand)
        hashtree.count(self.transactions)
        return hashtree.getNodes(self.sup_theta)

    def freqOneSet(self):
        """
        Generate frequent 1-item sets, ordered by item
        """
        one_item_cand = set()
        for t in self.transactions:
            one_item_cand.update(t)
        return sorted(self.filterCandidates([[i] for i in one_item_cand]), key=lambda i: i[0].name)

    def genNextCand(self, preItems):
        """
        Generate the next level of candidates from the frequent sets of the current level
        Find range [start, end) such that items in this range have same prefix
        e.g., [1,2,3,4] and [1,2,3,5] have same prefix, so they should be in one same range
        Then, generate 2-combinations of these ranges as result
        @param preItems frequent sets of the current level, as lists of HashTreeNode
        @return candidate item lists (names, not nodes) that are one item longer
        """
        res = []
        start = 0
        while start < len(preItems):
            prefix = preItems[start][:-1]
            end = start
            # preItems[start] always matches its own prefix, so end advances at least once
            while end < len(preItems) and sameNodes(preItems[end][:-1], prefix):
                end += 1
            res += [pair[0] + [pair[1][-1]] for pair in itertools.combinations(preItems[start:end], 2)]
            start = end
        return [[node.name for node in items] for items in res]

    def genFreqItemSets(self):
        """
        @return Frequent item sets with their frequency, as a list of (items, frequency)
        pairs, longest item sets first
        """
        # Mine only once; self.freqset keeps the raw nodes. The result is formatted on
        # every call so that repeated calls return the same shape as the first one.
        if not self.freqset:
            cur = self.freqOneSet()
            freqKSet = []
            while cur:
                freqKSet.append(cur)
                cur = self.filterCandidates(self.genNextCand(cur))
            self.freqset = [items for level in freqKSet for items in level]
        # the counter of the last node on a path is the support of the whole item set
        return [([node.name for node in items], items[-1].val) for items in self.freqset[::-1]]

if __name__ == '__main__':
    # Demo: mine the item sets supported by at least 30% of five sample transactions
    sample_transactions = [[1,2,3],[1,2,4],[2,4,6,8],[1,3,5,7], [5,7,2]]
    miner = FreqItem(sample_transactions, sup_theta=.3)
    print(miner.genFreqItemSets())


================================================
FILE: wordseg/hashtree.py
================================================
#coding=utf-8

"""
A simple implementation of Hash Tree
Author: 段凯强
"""
from functools import reduce

class HashTreeNode(object):
    """
    One node of the hash tree
    A path from the root down to a leaf spells out one bag, item by item
    """
    def __init__(self, name=''):
        # counter, increased by count() on leaves (and on a root without children)
        self.val = 0
        # the item this node stands for; the root keeps the default ''
        self.name = name
        # length of the bag suffix stored below this node; 0 marks a leaf
        self.level = 0
        # item -> child HashTreeNode
        self.children = {}

    def addBag(self, bag):
        """
        Insert bag below this node, creating the missing nodes
        Note that bag must be sorted
        """
        if bag:
            head = bag[0]
            node = self.children.get(head)
            if node is None:
                # create the child only when it is really missing
                node = HashTreeNode(name=head)
                self.children[head] = node
            node.addBag(bag[1:])
            # all bags are expected to have the same length, so the last one added wins
            self.level = len(bag)

    def count(self, transaction):
        """
        count the child who matches bag, suppose that current node matches
        @param transaction sorted list of items
        """
        if self.level == 0:
            self.val += 1
        elif self.level == 1:
            # children are leaves: bump every leaf whose item occurs in the transaction
            for t in transaction:
                if t in self.children: self.children[t].val += 1
        else:
            for i, t in enumerate(transaction):
                if t in self.children:
                    self.children[t].count(transaction[i:])

    def get(self, theta):
        """
        Same as getNodes, but every path is given as a list of names instead of nodes
        @return list of name lists; empty when nothing reaches theta
        """
        nodes = self.getNodes(theta)
        # getNodes gives None for a leaf below theta; treat it as "no path"
        if not nodes:
            return []
        return [[c.name for c in items] for items in nodes]

    def getNodes(self, theta):
        """
        Collect all paths from this node to leaves whose counter is not lower than theta
        @return list of node lists, each starting with this node; None for a leaf below theta
        """
        if self.level == 0:
            return [[self]] if self.val >= theta else None
        total = []
        # visit children in sorted order so that the result is sorted too
        for key in sorted(self.children.keys()):
            found = self.children[key].getNodes(theta)
            if found:
                total += found
        return [[self] + c for c in total]

    def __str__(self):
        return '(%s : %s)'%(self.name, '; '.join([str(i) for i in list(self.children.values())]))

def sameNode(node1, node2):
    """
    Tell whether the two nodes carry the same name
    """
    first_name, second_name = node1.name, node2.name
    return first_name == second_name

def sameNodes(nodes1, nodes2):
    """
    Tell whether the two node sequences carry the same names in the same order
    """
    names1 = [node.name for node in nodes1]
    names2 = [node.name for node in nodes2]
    return names1 == names2


class HashTree(object):
    """
    Tree of bags used to count, for every bag, the transactions it matches
    Note that all bags must be sorted
    """
    def __init__(self, bags):
        """
        @param bags the bags to store; empty bags are skipped
        """
        self.root = HashTreeNode()
        for b in bags:
            if b: self.root.addBag(b)

    def count(self, transactions):
        """
        Update the counters with every transaction (each must be sorted)
        """
        for t in transactions: self.root.count(t)

    def get(self, theta):
        """
        @return the bags whose counter is not lower than theta, as lists of items
        """
        return [[node.name for node in items] for items in self.getNodes(theta)]

    def getNodes(self, theta):
        """
        @return the bags whose counter is not lower than theta, as lists of nodes
        """
        # The root gives None when the tree holds no bag and its own counter is below
        # theta; iterate over an empty list in that case instead of failing.
        paths = self.root.getNodes(theta) or []
        # drop the root from every path, it does not stand for an item
        res = [c[1:] for c in paths]
        # an empty tree whose root passes theta yields one empty path: report no bag
        return [] if res == [[]] else res

    def __str__(self):
        return str(self.root)

if __name__ == '__main__':
    # Demo: count seven 2-item bags against four transactions, then print those
    # reaching a count of 2 and those reaching a count of 1
    bags = [[1,2], [2,4], [1,3], [1,5], [3,4], [2,7], [6,8]]
    demo_tree = HashTree(bags)
    demo_transactions = [[1,2,3],[1,2,4],[2,4,6,8],[1,3,5,7]]
    demo_tree.count(demo_transactions)
    print('Frequency with transactions', demo_transactions)
    for threshold in (2, 1):
        print(demo_tree.get(threshold))


================================================
FILE: wordseg/probability.py
================================================
#coding=utf-8

"""
Algorithms about probability
Author: 段凯强
"""

import math

def entropyOfList(ls):
    """
    Compute the entropy (natural logarithm) of the items in ls
    With p the share of one distinct item in the list, the result is the sum of
    -p*log(p) over all distinct items
    """
    total = float(len(ls))
    # an empty list stands for an empty side of a word, which is fully determined: entropy 0
    if not total:
        return total
    counts = {}
    for item in ls:
        if item in counts:
            counts[item] += 1
        else:
            counts[item] = 1
    return sum(-c/total*math.log(c/total) for c in counts.values())


================================================
FILE: wordseg/sequence.py
================================================
#coding=utf-8

"""
Algorithms for sequences
Author: 段凯强
"""

def dedup(ls):
    """
    deduplicate the given SORTED list
    Only runs of equal neighbors are collapsed, which is why the input must be sorted
    The input is left untouched
    @param ls sorted sequence of items
    @return a new list with one item kept per run of equal items
    """
    res = []
    for item in ls:
        # equal to the last kept item: still inside the same run
        if res and item == res[-1]:
            continue
        res.append(item)
    return res

def genSubstr(string, n):
    """
    List every substring of string that is at most n long
    Substrings are ordered by start position, then by length
    """
    size = len(string)
    return [string[start:stop]
            for start in range(size)
            for stop in range(start + 1, min(start + n, size) + 1)]

def genSubparts(string):
    """
    List every way to cut string into a non-empty head and a non-empty tail, e.g.
    "abcd" gives [("a", "bcd"), ("ab", "cd"), ("abc", "d")]
    A string shorter than 2 cannot be cut, so the list is empty
    """
    return [(string[:cut], string[cut:]) for cut in range(1, len(string))]

def longestSubsequenceLength(s1, s2):
    """
    Length of the longest common subsequence of s1 and s2
    Only the previous row of the table is kept while s1 is scanned
    """
    width = len(s2) + 1
    prev_row = [0]*width
    for ch in s1:
        # column 0 stands for the empty prefix of s2 and stays 0
        row = [0]*width
        for col in range(1, width):
            if ch == s2[col - 1]:
                row[col] = prev_row[col - 1] + 1
            else:
                row[col] = max(row[col - 1], prev_row[col])
        prev_row = row
    return prev_row[width - 1]

def longestSubsequence(s1, s2):
    """
    Longest common subsequence of s1 and s2, returned as a string
    The table of lengths is filled row by row while the move taken for every cell
    is recorded; the subsequence is then read back from the last cell
    """
    STOP, UP, LEFT, DIAG = 0, 1, 2, 3
    width = len(s2) + 1
    prev_row = [0]*width
    # row 0 and column 0 stand for empty prefixes: the walk back stops there
    moves = [[STOP]*width]
    for ch in s1:
        row = [0]*width
        row_moves = [STOP]
        for col in range(1, width):
            if ch == s2[col - 1]:
                row[col] = prev_row[col - 1] + 1
                row_moves.append(DIAG)
            elif row[col - 1] > prev_row[col]:
                row[col] = row[col - 1]
                row_moves.append(LEFT)
            else:
                row[col] = prev_row[col]
                row_moves.append(UP)
        moves.append(row_moves)
        prev_row = row
    r, c = len(s1), width - 1
    picked = []
    while moves[r][c] != STOP:
        move = moves[r][c]
        if move == UP:
            r -= 1
        elif move == LEFT:
            c -= 1
        elif move == DIAG:
            r -= 1
            c -= 1
            picked.append(s2[c])
    # items were picked from the end to the start
    return ''.join(picked)[::-1]


================================================
FILE: wordseg/wordseg.py
================================================
#coding=utf-8

"""
Chinese word segmentation algorithm without corpus
Author: 段凯强
Reference: http://www.matrix67.com/blog/archives/5044
"""

import re

from . probability import entropyOfList
from . sequence import genSubparts, genSubstr


def indexOfSortedSuffix(doc, max_word_len):
    """
    List the (begin, end) index pairs of all substrings of doc that are at most
    max_word_len long, sorted by the substring doc[begin:end] they point to
    Pairs that point to equal substrings keep their order of appearance in doc
    """
    size = len(doc)
    spans = [(begin, end)
             for begin in range(size)
             for end in range(begin + 1, min(begin + max_word_len, size) + 1)]
    spans.sort(key=lambda span: doc[span[0]:span[1]])
    return spans


class WordInfo(object):
    """
    Statistics of one candidate word: frequency, left and right neighbors, aggregation
    """
    def __init__(self, text):
        super(WordInfo, self).__init__()
        self.text = text
        self.aggregation = 0
        # number of occurrences until compute() turns it into a relative frequency
        self.freq = 0.0
        # neighbor characters until compute() replaces both lists by their entropy
        self.left = []
        self.right = []

    def update(self, left, right):
        """
        Record one more occurrence of this word together with its neighbors
        @param left the character just before the occurrence, skipped if empty
        @param right the character just after the occurrence, skipped if empty
        """
        self.freq += 1
        for neighbor, bucket in ((left, self.left), (right, self.right)):
            if neighbor:
                bucket.append(neighbor)

    def compute(self, length):
        """
        Turn the collected counts into the final frequency and entropies
        @param length length of the document for training to get words
        """
        self.freq = self.freq/length
        self.left = entropyOfList(self.left)
        self.right = entropyOfList(self.right)

    def computeAggregation(self, words_dict):
        """
        Compute aggregation of this word: over every cut of the word into two parts,
        the smallest ratio of the word frequency to the product of the part frequencies
        @param words_dict dict from text to WordInfo of all candidate words
        """
        splits = genSubparts(self.text)
        if splits:
            ratios = []
            for head, tail in splits:
                ratios.append(self.freq/words_dict[head].freq/words_dict[tail].freq)
            self.aggregation = min(ratios)


class WordSegment(object):

    """
    Main class for Chinese word segmentation
    1. Generate words from a long enough document
    2. Do the segmentation work with the document
    """

    # if a word is combination of other shorter words, then treat it as a long word
    L = 0
    # if a word is combination of other shorter words, then treat it as the set of shortest words
    S = 1
    # if a word contains other shorter words, then return all possible results
    ALL = 2

    def __init__(self, doc, max_word_len=5, min_freq=0.00005, min_entropy=2.0, min_aggregation=50):
        """
        Learn the words of doc
        @param doc the document used for words generation
        @param max_word_len longest candidate word, in characters
        @param min_freq a word must have a frequency above this value
        @param min_entropy a word must have left and right entropy above this value
        @param min_aggregation a word must have an aggregation above this value
        """
        super(WordSegment, self).__init__()
        self.max_word_len = max_word_len
        self.min_freq = min_freq
        self.min_entropy = min_entropy
        self.min_aggregation = min_aggregation
        self.word_infos = self.genWords(doc)
        # Result information, i.e., average data of all words
        # A document without any candidate (e.g. an empty one) would divide by zero,
        # so the averages fall back to 0 in that case
        word_count = float(len(self.word_infos)) or 1.0
        self.avg_len = sum(len(w.text) for w in self.word_infos)/word_count
        self.avg_freq = sum(w.freq for w in self.word_infos)/word_count
        self.avg_left_entropy = sum(w.left for w in self.word_infos)/word_count
        self.avg_right_entropy = sum(w.right for w in self.word_infos)/word_count
        self.avg_aggregation = sum(w.aggregation for w in self.word_infos)/word_count
        # Keep the candidates that satisfy all the requirements
        self.word_with_freq = [
            (w.text, w.freq) for w in self.word_infos
            if len(w.text) > 1 and w.aggregation > self.min_aggregation
            and w.freq > self.min_freq
            and w.left > self.min_entropy and w.right > self.min_entropy
        ]
        self.words = [w[0] for w in self.word_with_freq]

    def genWords(self, doc):
        """
        Generate all candidate words with their frequency/entropy/aggregation information
        @param doc the document used for words generation
        @return list of WordInfo, most frequent first
        """
        # every run of whitespace, digits, latin letters and punctuation becomes one blank
        pattern = re.compile('[\\s\\d,.<>/?:;\'\"[\\]{}()\\|~!@#$%^&*\\-_=+a-zA-Z，。《》、？：；“”‘’｛｝【】（）…￥！—┄－]+')
        doc = re.sub(pattern, ' ', doc)
        suffix_indexes = indexOfSortedSuffix(doc, self.max_word_len)
        word_cands = {}
        # compute frequency and neighbors
        for begin, end in suffix_indexes:
            word = doc[begin:end]
            if word not in word_cands:
                word_cands[word] = WordInfo(word)
            # slices give '' instead of failing at both ends of the document
            word_cands[word].update(doc[begin - 1:begin], doc[end:end + 1])
        # compute probability and entropy
        length = len(doc)
        for info in word_cands.values():
            info.compute(length)
        # compute aggregation of words whose length > 1
        values = sorted(word_cands.values(), key=lambda x: len(x.text))
        for v in values:
            if len(v.text) == 1: continue
            v.computeAggregation(word_cands)
        return sorted(values, key=lambda v: v.freq, reverse=True)

    def segSentence(self, sentence, method=ALL):
        """
        Segment a sentence with the words generated from a document
        @param sentence the sentence to be handled
        @param method segmentation method, one of L, S and ALL
        @return list of segments
        """
        # a set makes every membership probe O(1); built per call so that changes
        # made to self.words between calls are honored
        vocab = set(self.words)
        # lengths to probe at every position; both orders end with 1 so that a single
        # character is always consumed and the scan cannot stall
        if method == self.L:
            lengths = list(range(self.max_word_len, 1, -1)) + [1]
        elif method == self.S:
            lengths = list(range(2, self.max_word_len + 1)) + [1]
        else:
            lengths = None
        i = 0
        res = []
        while i < len(sentence):
            if lengths is not None:
                for j in lengths:
                    if j == 1 or sentence[i:i + j] in vocab:
                        res.append(sentence[i:i + j])
                        i += j
                        break
            else:
                # ALL: report every word that starts here, then move past the shortest one
                to_inc = 1
                for j in range(2, self.max_word_len + 1):
                    if i + j <= len(sentence) and sentence[i:i + j] in vocab:
                        res.append(sentence[i:i + j])
                        if to_inc == 1: to_inc = j
                if to_inc == 1: res.append(sentence[i])
                i += to_inc
        return res


if __name__ == '__main__':
    # Demo: learn words from a short tongue twister, segment that same text, then
    # print the averages gathered over all candidates
    sample = '十四是十四四十是四十，，十四不是四十，，，，四十不是十四'
    segmenter = WordSegment(sample, max_word_len=2, min_aggregation=1.2, min_entropy=0.4)
    print(' '.join(['%s:%f'%pair for pair in segmenter.word_with_freq]))
    print(' '.join(segmenter.words))
    print(' '.join(segmenter.segSentence(sample)))
    print('average len: ', segmenter.avg_len)
    print('average frequency: ', segmenter.avg_freq)
    print('average left entropy: ', segmenter.avg_left_entropy)
    print('average right entropy: ', segmenter.avg_right_entropy)
    print('average aggregation: ', segmenter.avg_aggregation)