[
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nenv/\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*,cover\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n"
  },
  {
    "path": "LICENSE",
    "content": "The MIT License (MIT)\n\nCopyright (c) 2015 Kaiqiang Dawn\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n\n"
  },
  {
    "path": "README.md",
    "content": "# ChineseWordSegmentation\nChinese word segmentation algorithm without corpus\n\n## Usage\n```\nfrom wordseg import WordSegment\ndoc = u'十四是十四四十是四十，十四不是四十，四十不是十四'\nws = WordSegment(doc, max_word_len=2, min_aggregation=1, min_entropy=0.5)\nws.segSentence(doc)\n```\n\nThis will generate words\n\n`十四 是 十四 四十 是 四十 ， 十四 不是 四十 ， 四十 不是 十四`\n\nIn fact, `doc` should be a long enough document string for better results. In that condition, the min_aggregation should be set far greater than 1, such as 50, and min_entropy should also be set greater than 0.5, such as 1.5.\n\nBesides, both input and output of this function should be decoded as unicode.\n\n`WordSegment.segSentence` has an optional argument `method`, with values `WordSegment.L`, `WordSegment.S` and `WordSegment.ALL`, means\n\n+ `WordSegment.L`: if a long word that is combinations of several shorter words found, given only the long word.\n+ `WordSegment.S`: given the several shorter words.\n+ `WordSegment.ALL`: given both the long and the shorters.\n\n## Reference\n\nThanks Matrix67's [article](http://www.matrix67.com/blog/archives/5044)"
  },
  {
    "path": "setup.py",
    "content": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\nfrom __future__ import absolute_import\nfrom __future__ import print_function\nfrom __future__ import unicode_literals\nimport pkg_resources\nfrom setuptools import setup, find_packages\nimport os\nimport codecs\nimport re\nimport sys\n\ndef read(*parts):\n    path = os.path.join(os.path.dirname(__file__), *parts)\n    with codecs.open(path, encoding='utf-8') as fobj:\n        return fobj.read()\n\ndef find_version(*file_paths):\n    version_file = read(*file_paths)\n    version_match = re.search(r\"^__version__ = ['\\\"]([^'\\\"]*)['\\\"]\",\n                              version_file, re.M)\n    if version_match:\n        return version_match.group(1)\n    raise RuntimeError(\"Unable to find version string.\")\n\nsetup(\n    name='wordseg',\n    version=find_version(\"wordseg\", \"__init__.py\"),\n    description='Chinese word segmentation algorithm without corpus',\n    author='段凯强',\n    author_email='',\n    license='MIT',\n    keywords='NLP,tokenizing,Chinese word segementation',\n    url='https://github.com/bung87/ChineseWordSegmentation',\n    packages = find_packages(),\n    package_dir={'wordseg': 'wordseg'},\n    classifiers=[\n        'Intended Audience :: Developers',\n        'License :: OSI Approved :: MIT License',\n        'Operating System :: OS Independent',\n        'Natural Language :: Chinese (Simplified)',\n        'Natural Language :: Chinese (Traditional)',\n        'Programming Language :: Python',\n        'Programming Language :: Python :: 2',\n        'Programming Language :: Python :: 2.6',\n        'Programming Language :: Python :: 2.7',\n        'Programming Language :: Python :: 3',\n        'Programming Language :: Python :: 3.2',\n        'Programming Language :: Python :: 3.3',\n        'Programming Language :: Python :: 3.4',\n        'Topic :: Text Processing',\n        'Topic :: Text Processing :: Indexing',\n        'Topic :: Text Processing :: Linguistic',\n      ]\n)\n"
  },
  {
    "path": "wordseg/__init__.py",
    "content": "__version__ = '0.1.0'\n\nfrom . import probability\nfrom . import sequence\nfrom . wordseg import WordSegment"
  },
  {
    "path": "wordseg/freqitem.py",
    "content": "#coding=utf-8\n\n\"\"\"\nA simple frequent itemset mining algorithm implementation\nAuthor: 段凯强\n\"\"\"\n\nimport itertools\n\nfrom wordseg.sequence import dedup\nfrom wordseg.hashtree import HashTree, sameNodes\nfrom functools import reduce\n\nclass FreqItem(object):\n    def __init__(self, transactions, sup_theta=.1):\n        self.transactions = [sorted(t) for t in [x for x in transactions if x]]\n        self.sup_theta = sup_theta*len(transactions)\n        self.freqset = []\n\n    def filterCandidates(self, cand):\n        \"\"\"\n        Build a HashTree with candidates cand, then count support of these candidates to filter out\n        all those that have support not lower than sup_theta\n        \"\"\"\n        hashtree = HashTree(cand)\n        hashtree.count(self.transactions)\n        return hashtree.getNodes(self.sup_theta)\n\n    def freqOneSet(self):\n        \"\"\"\n        Generate frequent 1-item sets\n        \"\"\"\n        one_item_cand = set()\n        for t in self.transactions:\n            for w in t:\n                one_item_cand.add(w)\n        return sorted(self.filterCandidates([[i] for i in one_item_cand]), key=lambda i: i[0].name)\n\n    def genNextCand(self, preItems):\n        \"\"\"\n        Generate next candidates by dynamic programming\n        Find range [i, j) such that items in this range have same prefix\n        e.g., [1,2,3,4] and [1,2,3,5] have same prefix, so they should be in one same range\n        Then, generate 2-combinations of these ranges as result\n        \"\"\"\n        res = []\n        i, j = 0, 0\n        while i < len(preItems):\n            if j < len(preItems) and sameNodes(preItems[j][:-1], preItems[i][:-1]):\n                j += 1\n            else:\n                res += [pair[0] + [pair[1][-1]] for pair in itertools.combinations(preItems[i:j], 2)]\n                i = j\n        return [[i.name for i in items] for items in res]\n\n    def genFreqItemSets(self):\n        \"\"\"\n        @return Frequent item sets with their frequency\n        \"\"\"\n        if self.freqset: return self.freqset\n        cur = self.freqOneSet()\n        freqKSet = []\n        while cur:\n            freqKSet.append(cur)\n            cur = self.filterCandidates(self.genNextCand(cur))\n        self.freqset = reduce(lambda res, x: res + x, freqKSet, [])\n        name_freq_pairs = [[(i.name, i.val) for i in items] for items in self.freqset[::-1]]\n        res = [list(zip(*items)) for items in name_freq_pairs]\n        return [(list(pair[0]), pair[1][-1]) for pair in res]\n\nif __name__ == '__main__':\n    transactions = [[1,2,3],[1,2,4],[2,4,6,8],[1,3,5,7], [5,7,2]]\n    freqItem = FreqItem(transactions, sup_theta=.3)\n    print(freqItem.genFreqItemSets())\n"
  },
  {
    "path": "wordseg/hashtree.py",
    "content": "#coding=utf-8\n\n\"\"\"\nA simple implementation of Hash Tree\nAuthor: 段凯强\n\"\"\"\nfrom functools import reduce\n\nclass HashTreeNode(object):\n    def __init__(self, name=''):\n        self.val = 0\n        self.name = name\n        self.level = 0\n        self.children = {}\n\n    def addBag(self, bag):\n        \"\"\"\n        Note that bag must be sorted\n        \"\"\"\n        if bag:\n            node = self.children.get(bag[0], HashTreeNode(name=bag[0]))\n            node.addBag(bag[1:])\n            self.children[bag[0]] = node\n            self.level = len(bag)\n\n    def count(self, transaction):\n        \"\"\"\n        count the child who matches bag, suppose that current node matches\n        \"\"\"\n        if self.level == 0:\n            self.val += 1\n        elif self.level == 1:\n            for t in transaction:\n                if t in self.children: self.children[t].val += 1\n        else:\n            for i in range(0, len(transaction)):\n                t = transaction[i]\n                if t in self.children:\n                    self.children[t].count(transaction[i:])\n\n    def get(self, theta):\n        return [[c.name for c in items] for items in self.getNodes(theta)]\n        \"\"\"\n        if self.level == 0:\n            return [[self.name]] if self.val >= theta else None\n        else:\n            children_res = [self.children[i].get(theta) for i in sorted(self.children.keys())]\n            total = reduce(lambda res, x: res + x, filter(lambda x: x, children_res), [])\n            return map(lambda c: [self.name] + c, total)\n        \"\"\"\n\n    def getNodes(self, theta):\n        if self.level == 0:\n            return [[self]] if self.val >= theta else None\n        else:\n            children_res = [self.children[i].getNodes(theta) for i in sorted(self.children.keys())]\n            total = reduce(lambda res, x: res + x, [x for x in children_res if x], [])\n            return [[self] + c for c in total]\n\n    def __str__(self):\n        return '(%s : %s)'%(self.name, '; '.join([str(i) for i in list(self.children.values())]))\n\ndef sameNode(node1, node2):\n    return node1.name == node2.name\n\ndef sameNodes(nodes1, nodes2):\n    func = lambda n: n.name\n    return list(map(func, nodes1)) == list(map(func, nodes2))\n\n\n\nclass HashTree(object):\n    \"\"\"\n    Note that all bags must be sorted\n    \"\"\"\n    def __init__(self, bags):\n        self.root = HashTreeNode()\n        self.root.val = 0\n        for b in bags:\n            if b: self.root.addBag(b)\n\n    def count(self, transactions):\n        for t in transactions: self.root.count(t)\n\n    def get(self, theta):\n        res = [c[1:] for c in self.root.get(theta)]\n        return [] if res == [[]] else res\n\n    def getNodes(self, theta):\n        res = [c[1:] for c in self.root.getNodes(theta)]\n        return [] if res == [[]] else res\n\n    def __str__(self):\n        return str(self.root)\n\nif __name__ == '__main__':\n    to_count = [[1,2], [2,4], [1,3], [1,5], [3,4], [2,7], [6,8]]\n    tree = HashTree(to_count)\n    transactions = [[1,2,3],[1,2,4],[2,4,6,8],[1,3,5,7]]\n    tree.count(transactions)\n    print('Frequency with transactions', transactions)\n    print(tree.get(2))\n    print(tree.get(1))\n\n\n"
  },
  {
    "path": "wordseg/probability.py",
    "content": "#coding=utf-8\n\n\"\"\"\nAlgorithms about probability\nAuthor: 段凯强\n\"\"\"\n\nimport math\n\ndef entropyOfList(ls):\n    \"\"\"\n    Given a list of some items, compute entropy of the list\n    The entropy is sum of -p[i]*log(p[i]) for every unique element i in the list, and p[i] is its frequency\n    \"\"\"\n    elements = {}\n    for e in ls:\n        elements[e] = elements.get(e, 0) + 1\n    length = float(len(ls))\n    # if length is 0, which means one side of a word is empty, which is determinated, so entropy should be 0\n    return length and sum([-v/length*math.log(v/length) for v in list(elements.values())])\n\n\n\n"
  },
  {
    "path": "wordseg/sequence.py",
    "content": "#coding=utf-8\n\n\"\"\"\nAlgorithms for sequences\nAuthor: 段凯强\n\"\"\"\n\ndef dedup(ls):\n    \"\"\"\n    deduplicate the given SORTED list\n    \"\"\"\n    i, j = 0, 0\n    while j < len(ls):\n        if ls[j] == ls[i]:\n            j += 1\n        else:\n            i += 1\n            ls[i] = ls[j]\n    return ls[:i + 1]\n\ndef genSubstr(string, n):\n    \"\"\"\n    Generate all substrings of max length n for string\n    \"\"\"\n    length = len(string)\n    res = []\n    for i in range(0, length):\n        for j in range(i + 1, min(i + n + 1, length + 1)):\n            res.append(string[i: j])\n    return res\n\ndef genSubparts(string):\n    \"\"\"\n    Partition a string into all possible two parts, e.g.\n    given \"abcd\", generate [(\"a\", \"bcd\"), (\"ab\", \"cd\"), (\"abc\", \"d\")]\n    For string of length 1, return empty list\n    \"\"\"\n    length = len(string)\n    res = []\n    for i in range(1, length):\n        res.append((string[0:i], string[i:]))\n    return res\n\ndef longestSubsequenceLength(s1, s2):\n    n = len(s2) + 1\n    cur = [0]*n\n    next = [0]*n\n    tmp = None\n    for i in s1:\n        for j in range(0, n):\n            if j == 0: next[j] = 0\n            else: next[j] = cur[j - 1] + 1 if i == s2[j - 1] else max(next[j - 1], cur[j])\n        tmp = next\n        next = cur\n        cur = tmp\n    return cur[n - 1]\n\ndef longestSubsequence(s1, s2):\n    n = len(s2) + 1\n    cur = [0]*n\n    next = [0]*n\n    tmp = None\n    __NONE, __UP, __LEFT, __NEW = 0, 1, 2, 3\n    orientation = [[__NONE]*n]\n    for i in s1:\n        ori = []\n        for j in range(0, n):\n            if j == 0:\n                next[j] = 0\n                ori.append(__NONE)\n            else:\n                next[j] = cur[j - 1] + 1 if i == s2[j - 1] else max(next[j - 1], cur[j])\n                ori.append(__NEW if i == s2[j - 1] else (__LEFT if next[j - 1] > cur [j] else __UP))\n        orientation.append(ori)\n        tmp = next\n        next = cur\n        cur = tmp\n    i, j, res = len(s1), n - 1, ''\n    ori = orientation[i][j]\n    while ori != __NONE:\n        if ori == __UP: i -= 1\n        elif ori == __LEFT: j -= 1\n        elif ori == __NEW:\n            i -= 1\n            j -= 1\n            res += s2[j]\n        ori = orientation[i][j]\n    return res[::-1]\n\n"
  },
  {
    "path": "wordseg/wordseg.py",
    "content": "#coding=utf-8\n\n\"\"\"\nChinese word segmentation algorithm without corpus\nAuthor: 段凯强\nReference: http://www.matrix67.com/blog/archives/5044\n\"\"\"\n\nimport re\n\nfrom . probability import entropyOfList\nfrom . sequence import genSubparts, genSubstr\n\n\n\n\ndef indexOfSortedSuffix(doc, max_word_len):\n    \"\"\"\n    Treat a suffix as an index where the suffix begins.\n    Then sort these indexes by the suffixes.\n    \"\"\"\n    indexes = []\n    length = len(doc)\n    for i in range(0, length):\n        for j in range(i + 1, min(i + 1 + max_word_len, length + 1)):\n            indexes.append((i, j))\n    return sorted(indexes, key=lambda i_j: doc[i_j[0]:i_j[1]])\n\n\nclass WordInfo(object):\n    \"\"\"\n    Store information of each word, including its freqency, left neighbors and right neighbors\n    \"\"\"\n    def __init__(self, text):\n        super(WordInfo, self).__init__()\n        self.text = text\n        self.freq = 0.0\n        self.left = []\n        self.right = []\n        self.aggregation = 0\n\n    def update(self, left, right):\n        \"\"\"\n        Increase frequency of this word, then append left/right neighbors\n        @param left a single character on the left side of this word\n        @param right as left is, but on the right side\n        \"\"\"\n        self.freq += 1\n        if left: self.left.append(left)\n        if right: self.right.append(right)\n\n    def compute(self, length):\n        \"\"\"\n        Compute frequency and entropy of this word\n        @param length length of the document for training to get words\n        \"\"\"\n        self.freq /= length\n        self.left = entropyOfList(self.left)\n        self.right = entropyOfList(self.right)\n\n    def computeAggregation(self, words_dict):\n        \"\"\"\n        Compute aggregation of this word\n        @param words_dict frequency dict of all candidate words\n        \"\"\"\n        parts = genSubparts(self.text)\n        if len(parts) > 0:\n            self.aggregation = min([self.freq/words_dict[p1_p2[0]].freq/words_dict[p1_p2[1]].freq for p1_p2 in parts])\n\n\n\nclass WordSegment(object):\n\n    \"\"\"\n    Main class for Chinese word segmentation\n    1. Generate words from a long enough document\n    2. Do the segmentation work with the document\n    \"\"\"\n\n    # if a word is combination of other shorter words, then treat it as a long word\n    L = 0\n    # if a word is combination of other shorter words, then treat it as the set of shortest words\n    S = 1\n    # if a word contains other shorter words, then return all possible results\n    ALL = 2\n\n    def __init__(self, doc, max_word_len=5, min_freq=0.00005, min_entropy=2.0, min_aggregation=50):\n        super(WordSegment, self).__init__()\n        self.max_word_len = max_word_len\n        self.min_freq = min_freq\n        self.min_entropy = min_entropy\n        self.min_aggregation = min_aggregation\n        self.word_infos = self.genWords(doc)\n        # Result infomations, i.e., average data of all words\n        word_count = float(len(self.word_infos))\n        self.avg_len = sum([len(w.text) for w in self.word_infos])/word_count\n        self.avg_freq = sum([w.freq for w in self.word_infos])/word_count\n        self.avg_left_entropy = sum([w.left for w in self.word_infos])/word_count\n        self.avg_right_entropy = sum([w.right for w in self.word_infos])/word_count\n        self.avg_aggregation = sum([w.aggregation for w in self.word_infos])/word_count\n        # Filter out the results satisfy all the requirements\n        filter_func = lambda v: len(v.text) > 1 and v.aggregation > self.min_aggregation and\\\n                    v.freq > self.min_freq and v.left > self.min_entropy and v.right > self.min_entropy\n        self.word_with_freq = [(w.text, w.freq) for w in list(filter(filter_func, self.word_infos))]\n        self.words = [w[0] for w in self.word_with_freq]\n\n    def genWords(self, doc):\n        \"\"\"\n        Generate all candidate words with their frequency/entropy/aggregation informations\n        @param doc the document used for words generation\n        \"\"\"\n        pattern = re.compile('[\\\\s\\\\d,.<>/?:;\\'\\\"[\\\\]{}()\\\\|~!@#$%^&*\\\\-_=+a-zA-Z，。《》、？：；“”‘’｛｝【】（）…￥！—┄－]+')\n        doc = re.sub(pattern, ' ', doc)\n        suffix_indexes = indexOfSortedSuffix(doc, self.max_word_len)\n        word_cands = {}\n        # compute frequency and neighbors\n        for suf in suffix_indexes:\n            word = doc[suf[0]:suf[1]]\n            if word not in word_cands:\n                word_cands[word] = WordInfo(word)\n            word_cands[word].update(doc[suf[0] - 1:suf[0]], doc[suf[1]:suf[1] + 1])\n        # compute probability and entropy\n        length = len(doc)\n        for k in word_cands:\n            word_cands[k].compute(length)\n        # compute aggregation of words whose length > 1\n        values = sorted(list(word_cands.values()), key=lambda x: len(x.text))\n        for v in values:\n            if len(v.text) == 1: continue\n            v.computeAggregation(word_cands)\n        return sorted(values, key=lambda v: v.freq, reverse=True)\n\n    def segSentence(self, sentence, method=ALL):\n        \"\"\"\n        Segment a sentence with the words generated from a document\n        @param sentence the sentence to be handled\n        @param method segmentation method\n        \"\"\"\n        i = 0\n        res = []\n        while i < len(sentence):\n            if method == self.L or method == self.S:\n                j_range = list(range(self.max_word_len, 0, -1)) if method == self.L else list(range(2, self.max_word_len + 1)) + [1]\n                for j in j_range:\n                    if j == 1 or sentence[i:i + j] in self.words:\n                        res.append(sentence[i:i + j])\n                        i += j\n                        break\n            else:\n                to_inc = 1\n                for j in range(2, self.max_word_len + 1):\n                    if i + j <= len(sentence) and sentence[i:i + j] in self.words:\n                        res.append(sentence[i:i + j])\n                        if to_inc == 1: to_inc = j\n                if to_inc == 1: res.append(sentence[i])\n                i += to_inc\n        return res\n\n\nif __name__ == '__main__':\n    doc = '十四是十四四十是四十，，十四不是四十，，，，四十不是十四'\n    ws = WordSegment(doc, max_word_len=2, min_aggregation=1.2, min_entropy=0.4)\n    print(' '.join(['%s:%f'%w for w in ws.word_with_freq]))\n    print(' '.join(ws.words))\n    print(' '.join(ws.segSentence(doc)))\n    print('average len: ', ws.avg_len)\n    print('average frequency: ', ws.avg_freq)\n    print('average left entropy: ', ws.avg_left_entropy)\n    print('average right entropy: ', ws.avg_right_entropy)\n    print('average aggregation: ', ws.avg_aggregation)\n\n\n"
  }
]