[
  {
    "path": ".gitignore",
    "content": "corpus/*\n__pycache__/*\n*.pyc"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2017 dzkang\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "README.md",
    "content": "# 基于TF-IDF的中文关键词提取\n\n## requirements\n\n默认环境python3，需要结巴分词器的支持\n\n```bash\n$ pip install jieba\n```\n\n## IDF(逆文档频率)生成\n\n用法：\n\n```bash\n$ python gen_idf.py -i <inputdir> -o <outputfile>\n```\n\n- `-i <inputdir>`   ： 语料库目录，程序会递归扫描该目录（含子目录）下的所有文件\n- `-o <outputfile>` ： 保存idf到指定文件\n\n## TF-IDF关键词提取\n\n用法：\n\n```bash\n$ python tfidf.py -i <idffile> -d <document> -t <topK>\n```\n\n- `-i <idffile>`  ： idf文件路径\n- `-d <document>` ： 所需处理文档路径\n- `-t <topK>`     ： 返回得分最高的topK个关键词（可选，省略时返回全部结果）\n\n### 示例\n\n```bash\n$ python tfidf.py -i idf.txt -d test.txt -t 20\n```\n\n返回结果：\n\n```\n核\n处理器\n服务器\n系统核心\n封装\n系列\n插槽\n核心\n主频\n产品\n伊斯坦布尔\n英特尔\n功耗\n多处理器\n低仅\n折合\n浮点运算\n性能\n构建\n吹起\n```\n\n> 注：该repo中提供的idf.txt由清华NLP组的新闻数据集训练获得。\n"
  },
  {
    "path": "gen_idf.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport os\nimport math\nimport re\nimport datetime\nimport sys, getopt\n\nfrom segmenter import segment\n\nclass MyDocuments(object):    # memory efficient data streaming\n    def __init__(self, dirname):\n        self.dirname = dirname\n        if not os.path.isdir(dirname):\n            print(dirname, '- not a directory!')\n            sys.exit()\n\n    def __iter__(self):\n        for dirfile in os.walk(self.dirname):\n            for fname in dirfile[2]:\n                text = open(os.path.join(dirfile[0], fname),\n                            'r', encoding='utf-8', errors='ignore').read()\n                yield segment(text)   # time consuming\n\n\ndef main(argv):   # idf generator\n    inputdir = ''\n    outputfile = ''\n\n    usage = 'usage: python gen_idf.py -i <inputdir> -o <outputfile>'\n    if len(argv) < 4:\n        print(usage)\n        sys.exit()\n    try:\n        opts, args = getopt.getopt(argv,\"hi:o:\",[\"idir=\",\"ofile=\"])\n    except getopt.GetoptError:\n        print(usage)\n        sys.exit(2)\n\n    for opt, arg in opts:   # parsing arguments\n        if opt == '-h':\n            print(usage)\n            sys.exit()\n        elif opt in (\"-i\", \"--idir\"):\n            inputdir = arg\n        elif opt in (\"-o\", \"--ofile\"):\n            outputfile = arg\n\n    documents = MyDocuments(inputdir)\n\n    ignored = {'', ' ', '', '。', '：', '，', '）', '（', '！', '?', '”', '“'}\n    id_freq = {}\n    i = 0\n    for doc in documents:\n        doc = set(x for x in doc if x not in ignored)\n        for x in doc:\n            id_freq[x] = id_freq.get(x, 0) + 1\n        if i % 1000 == 0:\n            print('Documents processed: ', i, ', time: ',\n                datetime.datetime.now())\n        i += 1\n\n    with open(outputfile, 'w', encoding='utf-8') as f:\n        for key, value in id_freq.items():\n            f.write(key + ' ' + str(math.log(i / value, 2)) + '\\n')\n\n\nif __name__ 
== \"__main__\":\n   main(sys.argv[1:])\n"
  },
  {
    "path": "segmenter.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nimport jieba\nimport re\n\ndef segment(sentence, cut_all=False):\n    sentence = sentence.replace('\\n', '').replace('\\u3000', '').replace('\\u00A0', '')\n    sentence = ' '.join(jieba.cut(sentence, cut_all=cut_all))\n    return re.sub('[a-zA-Z0-9.。:：,，)）(（！!??”“\\\"]', '', sentence).split()\n"
  },
  {
    "path": "test.txt",
    "content": "AMD力推812核服务器处理器反攻英特尔\n　　AMD今日正式推出最新的8核心及12核心系列处理器产品，从而正式在服务器领域向英特尔吹起了进攻的号角。\n　　AMD的8核和12核服务器处理器都采用了新的45纳米设计，而且也都是由两块处理器die封装在一起构建，其中12核心处理器正是基于此前曝光的Magny-Cours核心，也就是两个6核伊斯坦布尔核心封装在一起，而8核处理器则是由两颗4核处理器die封装在一起构建。\n　　新推出的8核和12核处理器将支持全新的G34插槽，可提供更新的I/O技术，另外由于可以支持四条DDR3内存通道因此每颗处理器可以支持多达12条内存插槽。\n　　此次新推的8核和12核处理器产品将会隶属于Opteron 6100系列，最低起始主频为1.8GHz，其中8核最低版本型号为Opteron 6124 HE，而该系列最高版本则为主频2.3GHz的12核Opteron 6176 SE。在Opteron 6100系列里，1.8GHz的8核Opteron 6124 HE功耗较低仅为65W，具体的售价则为455美元，折合人民币3100元出头。主频2.3GHz的12核Opteron 6176 SE功耗为105W，售价为1386美元，折合人民币约为9466元。其他产品的规格和价格多介于这两款产品之间。\n　　性能方面，AMD Opteron 6100系列比此前的6核伊斯坦布尔处理器要强悍很多，按照AMD方面的说法整数运算性能提升达88%，同时浮点运算性能更是提升了119%之多。Opteron 6000系列服务器平台主要将配备四个或者两个插槽，也就是说入门级系统核心数量为16个，而高阶版系统核心数量可达48个。\n　　与AMD相对的是英特尔也正计划针对多处理器服务器市场推出一款8核心的芯片产品，这款产品也被称为“Nehalem-EX”，这款产品应该也已经离正式上市不远。\n"
  },
  {
    "path": "tfidf.py",
    "content": "#!/usr/bin/python\n# -*- coding: utf-8 -*-\n\nfrom segmenter import segment\nimport sys, getopt\n\n\nclass IDFLoader(object):\n    def __init__(self, idf_path):\n        self.idf_path = idf_path\n        self.idf_freq = {}     # idf\n        self.mean_idf = 0.0    # 均值\n        self.load_idf()\n\n    def load_idf(self):       # 从文件中载入idf\n        cnt = 0\n        with open(self.idf_path, 'r', encoding='utf-8') as f:\n            for line in f:\n                try:\n                    word, freq = line.strip().split(' ')\n                    cnt += 1\n                except Exception as e:\n                    pass\n                self.idf_freq[word] = float(freq)\n\n        print('Vocabularies loaded: %d' % cnt)\n        self.mean_idf = sum(self.idf_freq.values()) / cnt\n\n\nclass TFIDF(object):\n    def __init__(self, idf_path):\n        self.idf_loader = IDFLoader(idf_path)\n        self.idf_freq = self.idf_loader.idf_freq\n        self.mean_idf = self.idf_loader.mean_idf\n\n    def extract_keywords(self, sentence, topK=20):    # 提取关键词\n        # 过滤\n        seg_list = segment(sentence)\n\n        freq = {}\n        for w in seg_list:\n            freq[w] = freq.get(w, 0.0) + 1.0\n        total = sum(freq.values())\n\n        for k in freq:   # 计算 TF-IDF\n            freq[k] *= self.idf_freq.get(k, self.mean_idf) / total\n\n        tags = sorted(freq, key=freq.__getitem__, reverse=True)  # 排序\n\n        if topK:\n            return tags[:topK]\n        else:\n            return tags\n\ndef main(argv):\n    idffile = ''\n    document = ''\n    topK = None\n\n    usage = 'usage: python tfidf.py -i <idffile> -d <document> -t <topK>'\n    if len(argv) < 4:\n        print(usage)\n        sys.exit()\n    try:\n        opts, args = getopt.getopt(argv,\"hi:d:t:\",\n            [\"idffile=\",\"document=\", \"topK=\"])\n    except getopt.GetoptError:\n        print(usage)\n        sys.exit(2)\n\n    for opt, arg in opts:   # parsing arguments\n        if 
opt == '-h':\n            print(usage)\n            sys.exit()\n        elif opt in (\"-i\", \"--idffile\"):\n            idffile = arg\n        elif opt in (\"-d\", \"--document\"):\n            document = arg\n        elif opt in (\"-t\", \"--topK\"):\n            topK = int(arg)\n\n    tdidf = TFIDF(idffile)\n    sentence = open(document, 'r', encoding='utf-8', errors='ignore').read()\n    tags = tdidf.extract_keywords(sentence, topK)\n\n    for tag in tags:\n        print(tag)\n\n\nif __name__ == \"__main__\":\n    main(sys.argv[1:])\n"
  }
]