Repository: gaussic/tf-idf-keyword Branch: master Commit: 9e666d1ad73b Files: 8 Total size: 33.6 MB Directory structure: gitextract_61wj4h_k/ ├── .gitignore ├── LICENSE ├── README.md ├── gen_idf.py ├── idf.txt ├── segmenter.py ├── test.txt └── tfidf.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ corpus/* __pycache__/* *.pyc ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2017 dzkang Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ # 基于TF-IDF的中文关键词提取 ## requirements 默认环境python3，需要结巴分词器的支持 ```bash $ pip install jieba ``` ## IDF(逆文档频率)生成用法： ```bash $ python gen_idf.py -i -o ``` - `-i ` ：语料库目录，程序会扫描目录下的所有文件 - `-o ` ：保存idf到指定文件 ## TF-IDF关键词提取用法： ```bash $ python tfidf.py -i -d -t ``` - `-i ` ： idf文件路径 - `-d ` ：所需处理文档路径 - `-t ` ：返回topK结果 ### 示例 ```bash $ python tfidf.py -i idf.txt -d test.txt -t 20 ``` 返回结果： ``` 核处理器服务器系统核心封装系列插槽核心主频产品伊斯坦布尔英特尔功耗多处理器低仅折合浮点运算性能构建吹起 ``` > 注：该repo中提供的idf.txt由清华NLP组的新闻数据集训练获得。 ================================================ FILE: gen_idf.py ================================================ #!/usr/bin/python # -*- coding: utf-8 -*- import os import math import re import datetime import sys, getopt from segmenter import segment class MyDocuments(object): # memory efficient data streaming def __init__(self, dirname): self.dirname = dirname if not os.path.isdir(dirname): print(dirname, '- not a directory!') sys.exit() def __iter__(self): for dirfile in os.walk(self.dirname): for fname in dirfile[2]: text = open(os.path.join(dirfile[0], fname), 'r', encoding='utf-8', errors='ignore').read() yield segment(text) # time consuming def main(argv): # idf generator inputdir = '' outputfile = '' usage = 'usage: python gen_idf.py -i -o ' if len(argv) < 4: print(usage) sys.exit() try: opts, args = getopt.getopt(argv,"hi:o:",["idir=","ofile="]) except getopt.GetoptError: print(usage) sys.exit(2) for opt, arg in opts: # parsing arguments if opt == '-h': print(usage) sys.exit() elif opt in ("-i", "--idir"): inputdir = arg elif opt in ("-o", "--ofile"): outputfile = arg documents = MyDocuments(inputdir) ignored = {'', ' ', '', '。', '：', '，', '）', '（', '！', '?', '”', '“'} id_freq = {} i = 0 for doc in documents: doc = set(x for x in doc if x not in ignored) for x in doc: id_freq[x] = id_freq.get(x, 0) + 1 if i % 1000 == 0: print('Documents processed: ', i, ', time: ', datetime.datetime.now()) i += 1 with open(outputfile, 'w', encoding='utf-8') as f: for key, value in id_freq.items(): f.write(key + ' ' + str(math.log(i / value, 2)) + '\n') if __name__ == "__main__": main(sys.argv[1:]) ================================================ FILE: idf.txt ================================================ [File too large to display: 33.6 MB] ================================================ FILE: segmenter.py ================================================ #!/usr/bin/python # -*- coding: utf-8 -*- import jieba import re def segment(sentence, cut_all=False): sentence = sentence.replace('\n', '').replace('\u3000', '').replace('\u00A0', '') sentence = ' '.join(jieba.cut(sentence, cut_all=cut_all)) return re.sub('[a-zA-Z0-9.。:：,，)）(（！!??”“\"]', '', sentence).split() ================================================ FILE: test.txt ================================================ AMD力推812核服务器处理器反攻英特尔　　AMD今日正式推出最新的8核心及12核心系列处理器产品，从而正式在服务器领域向英特尔吹起了进攻的号角。　　AMD的8核和12核服务器处理器都采用了新的45纳米设计，而且也都是由两块处理器die封装在一起构建，其中12核心处理器正是基于此前曝光的Magny-Cours核心，也就是两个6核伊斯坦布尔核心封装在一起，而8核处理器则是由两颗4核处理器die封装在一起构建。　　新推出的8核和12核处理器将支持全新的G34插槽，可提供更新的I/O技术，另外由于可以支持四条DDR3内存通道因此每颗处理器可以支持多达12条内存插槽。　　此次新推的8核和12核处理器产品将会隶属于Opteron 6100系列，最低起始主频为1.8GHz，其中8核最低版本型号为Opteron 6124 HE，而该系列最高版本则为主频2.3GHz的12核Opteron 6176 SE。在Opteron 6100系列里，1.8GHz的8核Opteron 6124 HE功耗较低仅为65W，具体的售价则为455美元，折合人民币3100元出头。主频2.3GHz的12核Opteron 6176 SE功耗为105W，售价为1386美元，折合人民币约为9466元。其他产品的规格和价格多介于这两款产品之间。　　性能方面，AMD Opteron 6100系列比此前的6核伊斯坦布尔处理器要强悍很多，按照AMD方面的说法整数运算性能提升达88%，同时浮点运算性能更是提升了119%之多。Opteron 6000系列服务器平台主要将配备四个或者两个插槽，也就是说入门级系统核心数量为16个，而高阶版系统核心数量可达48个。　　与AMD相对的是英特尔也正计划针对多处理器服务器市场推出一款8核心的芯片产品，这款产品也被称为“Nehalem-EX”，这款产品应该也已经离正式上市不远。 ================================================ FILE: tfidf.py ================================================ #!/usr/bin/python # -*- coding: utf-8 -*- from segmenter import segment import sys, getopt class IDFLoader(object): def __init__(self, idf_path): self.idf_path = idf_path self.idf_freq = {} # idf self.mean_idf = 0.0 # 均值 self.load_idf() def load_idf(self): # 从文件中载入idf cnt = 0 with open(self.idf_path, 'r', encoding='utf-8') as f: for line in f: try: word, freq = line.strip().split(' ') cnt += 1 except Exception as e: pass self.idf_freq[word] = float(freq) print('Vocabularies loaded: %d' % cnt) self.mean_idf = sum(self.idf_freq.values()) / cnt class TFIDF(object): def __init__(self, idf_path): self.idf_loader = IDFLoader(idf_path) self.idf_freq = self.idf_loader.idf_freq self.mean_idf = self.idf_loader.mean_idf def extract_keywords(self, sentence, topK=20): # 提取关键词 # 过滤 seg_list = segment(sentence) freq = {} for w in seg_list: freq[w] = freq.get(w, 0.0) + 1.0 total = sum(freq.values()) for k in freq: # 计算 TF-IDF freq[k] *= self.idf_freq.get(k, self.mean_idf) / total tags = sorted(freq, key=freq.__getitem__, reverse=True) # 排序 if topK: return tags[:topK] else: return tags def main(argv): idffile = '' document = '' topK = None usage = 'usage: python tfidf.py -i -d -t ' if len(argv) < 4: print(usage) sys.exit() try: opts, args = getopt.getopt(argv,"hi:d:t:", ["idffile=","document=", "topK="]) except getopt.GetoptError: print(usage) sys.exit(2) for opt, arg in opts: # parsing arguments if opt == '-h': print(usage) sys.exit() elif opt in ("-i", "--idffile"): idffile = arg elif opt in ("-d", "--document"): document = arg elif opt in ("-t", "--topK"): topK = int(arg) tdidf = TFIDF(idffile) sentence = open(document, 'r', encoding='utf-8', errors='ignore').read() tags = tdidf.extract_keywords(sentence, topK) for tag in tags: print(tag) if __name__ == "__main__": main(sys.argv[1:])