Repository: hrwhisper/SpamMessage
Branch: master
Commit: 754d3a74c626
Files: 23
Total size: 80.6 MB

Directory structure:
gitextract_y8xnqj4p/
├── .gitignore
├── classifier/
│   ├── LogisticRegression.py
│   ├── NaiveBayesian.py
│   ├── Perceptron.py
│   └── __init__.py
├── data/
│   ├── 不带标签短信.txt
│   └── 带标签短信.txt
├── judgeSpamMessage.py
├── model/
│   ├── Bayes_sklearn.pkl
│   ├── LogisticRegression.pkl
│   ├── Logistic_sklearn.pkl
│   ├── NaiveBayesian.pkl
│   ├── Perceptron.pkl
│   ├── SVM_sklearn.pkl
│   ├── __init__.py
│   └── vsm.pkl
├── model_manage.py
├── readme.md
├── test.py
├── test_jieba.py
├── test_judge.py
├── test_judge2.py
└── token_and_save_to_file.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
.idea
*.pyc
*~
*.swp

================================================
FILE: classifier/LogisticRegression.py
================================================
# -*- coding: utf-8 -*-
# @Date    : 2016/10/16
# @Author  : hrwhisper
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils.extmath import safe_sparse_dot


class LogisticRegression(BaseEstimator):
    def __init__(self, alpha=0.1, max_iter=100):  # best found so far: alpha=0.02, max_iter=200 -> 0.9927425 accuracy
        self.alpha = alpha
        self.max_iter = max_iter
        self.weights = None

    def _sigmoid(self, x):
        # numerically stable equivalent of 1.0 / (1 + np.exp(-x))
        return .5 * (1 + np.tanh(.5 * x))

    def fit(self, X, y):
        """
        :param X: sparse matrix (n_samples, n_features), the training feature vectors
        :param y: array-like (n_samples,), target vector relative to X
        :return: self
        """
        m, n = X.shape
        target = np.array(y).reshape((m, 1))
        self.weights = np.ones((n, 1))
        # batch gradient ascent on the log-likelihood
        for _ in range(self.max_iter):
            h = self._sigmoid(safe_sparse_dot(X, self.weights))  # X * weights
            error = target - h
            self.weights += self.alpha * safe_sparse_dot(X.T, error)  # alpha * X.T * error
        return self

    def predict(self, X):
        return (self._sigmoid(safe_sparse_dot(X, self.weights)) > 0.5).ravel().astype('int')


if __name__ == '__main__':
    # scratch check of the numpy broadcasting used in fit():
    # a column vector minus a row vector broadcasts to a matrix
    a = np.array([1, 2, 3])
    b = np.array([4, 5, 6])
    print(a, b)
    a = a.reshape((-1, 1))
    print(a)
    print(a - b)

================================================
FILE: classifier/NaiveBayesian.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2016/11/29 22:04
# @Author  : wqs
# @File    : NaiveBayesian.py
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import LabelBinarizer


class NaiveBayesian(BaseEstimator):
    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Laplace/Lidstone smoothing parameter
        self.classes = None
        self.class_log_prior = None
        self.feature_log_prob = None

    def fit(self, X, y):
        _, n = X.shape
        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes = labelbin.classes_
        Y = np.concatenate((1 - Y, Y), axis=1).astype(np.float64)  # one indicator column per class

        class_count = np.zeros(2, dtype=np.float64)
        feature_count = np.zeros((2, n), dtype=np.float64)
        feature_count += safe_sparse_dot(Y.T, X)  # per-class feature frequencies via Y.T * X
        class_count += Y.sum(axis=0)

        smoothed_fc = feature_count + self.alpha
        smoothed_cc = smoothed_fc.sum(axis=1)
        self.feature_log_prob = (np.log(smoothed_fc) -
                                 np.log(smoothed_cc.reshape(-1, 1)))
        # self.class_log_prior = np.zeros(2) - np.log(2)  # alternative: uniform class prior
        self.class_log_prior = np.log(class_count / sum(class_count))
        return self

    def predict(self, X):
        # joint log-likelihood per class; pick the larger one
        jll = safe_sparse_dot(X, self.feature_log_prob.T) + self.class_log_prior
        return self.classes[np.argmax(jll, axis=1)]
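A minimal smoke test for the two hand-rolled classifiers above. This is not part of the repository; the toy matrix and labels are invented for illustration, and only the fit/predict API comes from the code itself. Because both classes subclass sklearn's BaseEstimator, they also plug into helpers like cross_val_score, as test.py does.

```python
import numpy as np
from scipy.sparse import csr_matrix

from classifier.LogisticRegression import LogisticRegression
from classifier.NaiveBayesian import NaiveBayesian

# 4 "documents" x 3 "words"; rows 0-1 and rows 2-3 share feature patterns
X = csr_matrix(np.array([[2., 0., 1.],
                         [3., 0., 0.],
                         [0., 2., 1.],
                         [0., 3., 2.]]))
y = [0, 0, 1, 1]

for cls in (LogisticRegression(alpha=0.1, max_iter=100), NaiveBayesian()):
    # fit on the toy data, then predict the training rows back
    print(cls.__class__.__name__, cls.fit(X, y).predict(X))
```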
================================================
FILE: classifier/Perceptron.py
================================================
# -*- coding: utf-8 -*-
# @Date    : 2016/11/1
# @Author  : hrwhisper
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.utils.extmath import safe_sparse_dot


class Perceptron(BaseEstimator):
    def __init__(self, alpha=0.1, max_iter=100):
        self.threshold = 0.5
        self.alpha = alpha
        self.max_iter = max_iter
        self.weights = None

    def fit(self, X, y):
        """
        :param X: sparse matrix (n_samples, n_features), the training feature vectors
        :param y: array-like (n_samples,), target vector relative to X
        :return: self
        """
        m, n = X.shape
        target = np.array(y).reshape((m, 1))
        self.weights = np.ones((n, 1))
        for _ in range(self.max_iter):
            # batch update: step the weights by the summed prediction error
            h = safe_sparse_dot(X, self.weights) > self.threshold
            error = target - h
            self.weights += self.alpha * safe_sparse_dot(X.T, error)
        return self

    def predict(self, X):
        return (safe_sparse_dot(X, self.weights) > self.threshold).ravel().astype('int')

================================================
FILE: classifier/__init__.py
================================================
# -*- coding: utf-8 -*-
# @Date    : 2016/10/16
# @Author  : hrwhisper
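Note that the Perceptron above uses a batch update (all samples contribute to each weight step) against a fixed 0.5 activation threshold, rather than the classic per-sample perceptron rule. A quick toy check of that behavior (invented data; not repository code):

```python
import numpy as np
from scipy.sparse import csr_matrix

from classifier.Perceptron import Perceptron

# rows 0-1 load on feature 0, rows 2-3 on feature 1
X = csr_matrix(np.array([[2., 0.],
                         [3., 1.],
                         [0., 2.],
                         [1., 3.]]))
y = [0, 0, 1, 1]
p = Perceptron(alpha=0.1, max_iter=100).fit(X, y)
print(p.weights.ravel(), p.predict(X))  # learned weights and training predictions
```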
================================================
FILE: data/不带标签短信.txt
================================================
[File too large to display: 13.5 MB]

================================================
FILE: data/带标签短信.txt
================================================
[File too large to display: 55.5 MB]

================================================
FILE: judgeSpamMessage.py
================================================
# -*- coding: utf-8 -*-
# @Date    : 2016/12/3
# @Author  : hrwhisper
import codecs
import sys
import time
from collections import Counter
from optparse import OptionParser
from multiprocessing import Pool

import jieba
from sklearn.externals import joblib

from model_manage import BowTransform


def token(x):
    return Counter(jieba.lcut(x))


if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option('-c', '--classifier', dest="cls_name", type='string', default='p',
                      help="define the classifier you want to use: "
                           "p => Perceptron, "
                           "lr => LogisticRegression, "
                           "nb => NaiveBayesian, "
                           "svm => SVM (sklearn), "
                           "lrs => LogisticRegression (sklearn), "
                           "nbs => NaiveBayesian (sklearn)")
    parser.add_option('-i', '--input', dest="input_filename", type='string',
                      default='./data/不带标签短信.txt', help="input file name")
    parser.add_option('-o', '--output', dest="output_filename", type='string',
                      default='./data/result.txt', help="output file name")
    options, args = parser.parse_args()

    classifiers = {
        'p': './model/Perceptron.pkl',  # alpha=0.1, max_iter=2000
        'lr': './model/LogisticRegression.pkl',  # alpha=0.2, max_iter=2000
        'nb': './model/NaiveBayesian.pkl',  # 0.00241
        'svm': './model/SVM_sklearn.pkl',
        'lrs': './model/Logistic_sklearn.pkl',
        'nbs': './model/Bayes_sklearn.pkl'
    }

    cls_name = options.cls_name
    file_path = options.input_filename
    out_path = options.output_filename
    if cls_name not in classifiers.keys():
        print('check your classifier name, you can use -h for help')
        sys.exit()

    start = time.time()
    jieba.initialize()
    try:
        with codecs.open(file_path, 'r', 'utf-8') as f:
            data = [line.strip() for line in f.read().split('\n')]
            if data[-1] == '':
                data.pop()
    except FileNotFoundError:
        print('Please check your input filename')
        sys.exit()

    # data = [Counter(d) for d in map(jieba.cut, data)]
    data = Pool().map(token, data)  # tokenize in parallel
    print('end token in {}\n'.format(time.time() - start))

    cv = BowTransform.load_vsm()
    data = cv.transform(data)
    print('end bow in {}\n'.format(time.time() - start))

    cls = joblib.load(classifiers[cls_name])
    predicted = cls.predict(data)
    # print(predicted)
    with open(out_path, 'w+') as f:
        for x in predicted:
            f.write(str(x) + '\n')
    print('task complete. total time: {}\n using {}'.format(time.time() - start, cls))

================================================
FILE: model/Bayes_sklearn.pkl
================================================
[File too large to display: 11.6 MB]

================================================
FILE: model/__init__.py
================================================
# -*- coding: utf-8 -*-
# @Date    : 2016/11/30
# @Author  : hrwhisper

================================================
FILE: model_manage.py
================================================
# -*- coding: utf-8 -*-
# @Date    : 2016/12/1
# @Author  : hrwhisper
# sklearn.externals.joblib matches the 2016-era scikit-learn this repo targets
from sklearn.externals import joblib


class BowTransform(object):
    """Persist / reload the fitted vectorizer used for the BOW transform."""
    default_path = './model/vsm.pkl'

    @staticmethod
    def save_vsm(model, filename=None):
        joblib.dump(model, filename if filename else BowTransform.default_path)

    @staticmethod
    def load_vsm(filename=None):
        return joblib.load(filename if filename else BowTransform.default_path)


class TrainData(object):
    """Persist / reload the vectorized training matrix together with its labels."""
    default_path = './model/train_data.pkl'

    @staticmethod
    def save(model, filename=None):
        joblib.dump(model, filename if filename else TrainData.default_path)

    @staticmethod
    def load(filename=None):
        with open('./data/tags_token_results_tag') as f:
            return (joblib.load(filename if filename else TrainData.default_path),
                    list(map(int, f.read().split('\n')[:-1])))

================================================
FILE: readme.md
================================================
## Environment

- jieba
  - pip install jieba
- sklearn
  - pip install -U scikit-learn
- python3.5

## Usage

- `-c classifier_name` selects the classifier to use:

| **Name** | **Classifier**               |
| -------- | ---------------------------- |
| **p**    | Perceptron                   |
| **lr**   | LogisticRegression           |
| **nb**   | NaiveBayesian                |
| **svm**  | SVM (sklearn)                |
| **lrs**  | LogisticRegression (sklearn) |
| **nbs**  | NaiveBayesian (sklearn)      |

- `-i filename` specifies the input SMS file (one message per line)
- `-o filename` specifies the output result file (results are 0 or 1, where 1 means spam; each line corresponds to the message on the same line of the input file)

For example:

```
python judgeSpamMessage.py -c svm -i ./data/不带标签短信.txt -o ./data/result.txt
```

The command above selects the SVM classifier, judges whether each message in ./data/不带标签短信.txt is spam, and writes the results to ./data/result.txt.

## Training

- First run token_and_save_to_file.py to tokenize the corpus and save the results
- test.py contains cross-validation and related evaluation methods

## Files

The directories are:

| **Directory**  | **Purpose**                    |
| -------------- | ------------------------------ |
| **classifier** | source code of the classifiers |
| **data**       | data files                     |
| **model**      | saved models                   |

The files are:

| **File**                         | **Purpose**                                                                     |
| -------------------------------- | ------------------------------------------------------------------------------- |
| classifier/LogisticRegression.py | source of our own logistic regression classifier                                 |
| classifier/NaiveBayesian.py      | source of our own naive Bayes classifier                                         |
| classifier/Perceptron.py         | source of our own perceptron classifier                                          |
| data/tags_token_results          | tokenization of the labeled SMS corpus, generated by token_and_save_to_file.py   |
| data/tags_token_results_tag      | class labels of the labeled SMS corpus                                           |
| data/不带标签短信.txt             | unlabeled SMS dataset                                                            |
| data/带标签短信.txt               | labeled SMS dataset                                                              |
| model/Bayes_sklearn.pkl          | trained sklearn naive Bayes classifier                                           |
| model/Logistic_sklearn.pkl       | trained sklearn logistic regression classifier                                   |
| model/LogisticRegression.pkl     | trained model of our own logistic regression classifier                          |
| model/NaiveBayesian.pkl          | trained model of our own naive Bayes classifier                                  |
| model/Perceptron.pkl             | trained model of our own perceptron                                              |
| model/SVM_sklearn.pkl            | trained sklearn SVM classifier                                                   |
| model/train_data.pkl             | BOW representation of the labeled SMS corpus                                     |
| model/vsm.pkl                    | the fitted transformer that turns new documents into BOW vectors                 |
| judgeSpamMessage.py              | judges whether input messages are spam                                           |
| model_manage.py                  | loads and saves models                                                           |
| readme.md                        | this file                                                                        |
| test.py                          | test code                                                                        |
| token_and_save_to_file.py        | tokenizes the labeled corpus and saves the result for convenient training        |
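judgeSpamMessage.py above boils down to a three-step pipeline: tokenize with jieba, map each message onto the saved vocabulary via the persisted vectorizer, and predict with a pickled classifier. A condensed sketch of the same steps as a reusable function (not in the repository; the function name and default model path are illustrative, but every call it makes exists in the code above):

```python
from collections import Counter

import jieba
from sklearn.externals import joblib

from model_manage import BowTransform


def classify_messages(messages, model_path='./model/SVM_sklearn.pkl'):
    """Return one 0/1 label per message (1 = spam, per the readme)."""
    bows = [Counter(jieba.lcut(m)) for m in messages]  # term-frequency dicts
    X = BowTransform.load_vsm().transform(bows)        # sparse BOW matrix
    return joblib.load(model_path).predict(X)
```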
================================================
FILE: test.py
================================================
# -*- coding: utf-8 -*-
# @Date    : 2016/10/16
# @Author  : hrwhisper
import codecs
import datetime
from collections import Counter

import numpy as np
from sklearn.externals import joblib
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction import DictVectorizer
from sklearn import metrics, naive_bayes, svm, linear_model

from classifier.LogisticRegression import LogisticRegression
from classifier.NaiveBayesian import NaiveBayesian
from classifier.Perceptron import Perceptron
from model_manage import BowTransform, TrainData


def read_train_data():
    file_path = './data/tags_token_results'
    with codecs.open(file_path, 'r', 'utf-8') as f:
        data = [line.strip().split() for line in f.read().split('\n')]
    with open(file_path + '_tag') as f:
        return data[:-1], list(map(int, f.read().split('\n')[:-1]))


def _test(classifier, test_data, test_target):
    predicted = classifier.predict(test_data)
    print(predicted.shape)  # 160 1
    # print(sum(predicted == test_target), len(test_target), np.mean(predicted == test_target))
    print("Classification report for classifier %s:\n%s\n" % (
        classifier, metrics.classification_report(test_target, predicted, digits=4)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(test_target, predicted))
    print(precision_recall_fscore_support(test_target, predicted))


def test_one(cls, use_save_data=True, train_cls=False, save_cls_path=None):
    if use_save_data:
        data, target = TrainData.load()
    else:
        data, target = read_train_data()
        # data_len = int(len(data) * 0.001)
        # data, target = data[:data_len], target[:data_len]
        data = [Counter(d) for d in data]  # each row is one SMS; values are term frequencies
        # print(data[0])
        v = DictVectorizer()
        print('fit transform')
        data = v.fit_transform(data)  # sparse matrix; each word gets a column index
        TrainData.save(data)
        # print(data[0])

    data_len = data.shape[0]
    print('data', data.shape[1])
    # plain 80/20 split in corpus order (no shuffling)
    end = int(0.8 * data_len)
    train_data, train_target = data[:end], target[:end]
    test_data, test_target = data[end:], target[end:]
    if train_cls:
        print('train classifier....')
        cls = cls.fit(train_data, train_target)
        print('train classifier complete')

    _test(cls, test_data, test_target)
    if save_cls_path:
        joblib.dump(cls, save_cls_path)


def cross_validation():
    data, target = TrainData.load()
    classifiers = {
        'Logistic by yhr': LogisticRegression(alpha=0.2, max_iter=2000),
        'Perceptron by yhr': Perceptron(alpha=0.1, max_iter=2000),
        'Bayesian by wqs': NaiveBayesian(),
        'Bernoulli Bayes from sklearn': naive_bayes.BernoulliNB(),
        'svm from sklearn': svm.LinearSVC(),
        'Logistic from sklearn': linear_model.LogisticRegression(),
        # 'decision tree': tree.DecisionTreeClassifier(),
    }
    for name, classifier in classifiers.items():
        this_scores = cross_val_score(classifier, data, target, cv=5, scoring='accuracy')
        print(name)
        print(this_scores)
        print(np.mean(this_scores))
        print(' ------------------------ \n\n')


def test_parameter():
    data, target = TrainData.load()
    max_score = 0
    max_alpha = max_iter = 0
    print('Perceptron')
    start = datetime.datetime.now()
    for alpha in [0.01, 0.1, 0.2]:  # alpha=0.3, max_iter=100 gave 0.99133
        for n_iter in [100, 2000]:
            cls = Perceptron(alpha=alpha, max_iter=n_iter)
            this_scores = cross_val_score(cls, data, target, cv=5, scoring='accuracy')
            print(this_scores)
            cur = np.mean(this_scores)
            print(alpha, n_iter, cur)
            print(' ------------------------ \n\n')
            if cur > max_score:
                max_score = cur
                max_alpha, max_iter = alpha, n_iter
            print('current_max: ', max_score, max_alpha, max_iter)
    print(datetime.datetime.now() - start)


if __name__ == '__main__':
    # start = datetime.datetime.now()
    # test_one(LogisticRegression(alpha=0.1, max_iter=2000),
    #          train_cls=True, save_cls_path='./model/LogisticRegression.pkl')
    # print(datetime.datetime.now() - start)

    # classifiers = {
    #     'Logistic by yhr': LogisticRegression(alpha=0.01, max_iter=200),
    #     'Perceptron by yhr': Perceptron(),
    #     'Bayesian by wqs': NaiveBayesian(),
    #     'Bayes_sklearn': naive_bayes.BernoulliNB(),
    #     'SVM_sklearn': svm.LinearSVC(),
    #     'Logistic_sklearn': linear_model.LogisticRegression(),
    #     # 'decision tree': tree.DecisionTreeClassifier(),
    # }
    # for name, cls in classifiers.items():
    #     test_one(cls, train_cls=True, save_cls_path='./model/' + name + '.pkl')

    # test_one(LogisticRegression(alpha=0.2, max_iter=2000), train_cls=True,
    #          save_cls_path='./model/LogisticRegression.pkl')
    # test_one(Perceptron(alpha=0.1, max_iter=2000), train_cls=True, save_cls_path='./model/Perceptron.pkl')

    classifiers = {
        'p': './model/Perceptron.pkl',  # alpha=0.1, max_iter=2000
        'lr': './model/LogisticRegression.pkl',  # alpha=0.2, max_iter=2000
        'nb': './model/NaiveBayesian.pkl',  # 0.00241
        'svm': './model/SVM_sklearn.pkl',
        'lrs': './model/Logistic_sklearn.pkl',
        'nbs': './model/Bayes_sklearn.pkl'
    }
    for _path in classifiers.values():
        cls = joblib.load(_path)
        test_one(cls)

    # cross_validation()
    # test_one(LogisticRegression(max_iter=100), train_cls=True)
    # test_parameter()
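test_one() above trains on the first 80% of the corpus and tests on the last 20%, in file order. If the labeled file is not already shuffled, the class balance of the two halves can drift. A shuffled, stratified split is a drop-in alternative; a sketch using sklearn's train_test_split (not what the repository does; random_state is arbitrary):

```python
from sklearn.model_selection import train_test_split

from model_manage import TrainData

data, target = TrainData.load()
train_data, test_data, train_target, test_target = train_test_split(
    data, target, test_size=0.2, stratify=target, random_state=42)
```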
================================================
FILE: test_jieba.py
================================================
# -*- coding: utf-8 -*-
# @Date    : 2016/12/5
# @Author  : hrwhisper
import codecs
import datetime
from collections import Counter
from multiprocessing import Pool

import jieba


def test_not_tag_data():
    with codecs.open('./data/不带标签短信.txt', 'r', 'utf-8') as f:
        data = [line.strip() for line in f.read().split('\n')]
        if data[-1] == '':
            data.pop()
    return data


def f(x):
    return Counter(jieba.cut(x))


if __name__ == "__main__":
    # jieba.enable_parallel(2)
    start = datetime.datetime.now()
    data = test_not_tag_data()
    print('read data', datetime.datetime.now() - start)
    # data = [Counter(d) for d in map(jieba.cut, data)]
    res = Pool(4).map(f, data)  # tokenize with 4 worker processes
    # print(res)
    print('jieba ', datetime.datetime.now() - start)

    # cv = BowTransform.load_vsm()
    # data = cv.transform(data)
    # print('transform', datetime.datetime.now() - start)
    # cls = joblib.load(classifiers[cls_name])
    # print('load', datetime.datetime.now() - start)
    # predicted = cls.predict(data)
    # # print(predicted)
    # with open('./data/result.txt', 'w+') as f:
    #     for x in predicted:
    #         f.write(str(x) + '\n')
    # print(datetime.datetime.now() - start, ' %s' % cls)

================================================
FILE: test_judge.py
================================================
# -*- coding: utf-8 -*-
# @Date    : 2016/12/3
# @Author  : hrwhisper
import codecs
import time
from collections import Counter
from multiprocessing import Pool

import jieba
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.metrics import precision_recall_fscore_support

from model_manage import BowTransform
from test_jieba import test_not_tag_data


def test_data():
    def read_train_data():
        file_path = './data/带标签短信.txt'
        target = []
        data = []
        with codecs.open(file_path, 'r', 'utf-8') as f:
            for line in f.read().split('\n')[:-1]:
                line = line.strip()
                target.append(line[0])  # first character is the 0/1 tag
                data.append(line[1:].lstrip())
        return data, target

    data, target = read_train_data()
    n = len(data) - int(0.8 * len(data))  # the last 20% is the held-out test set
    return data[-n:], list(map(int, target[-n:]))


def f(x):
    return Counter(jieba.cut(x))


if __name__ == "__main__":
    classifiers = {
        'p': './model/Perceptron.pkl',  # alpha=0.1, max_iter=2000
        'lr': './model/LogisticRegression.pkl',  # alpha=0.2, max_iter=2000
        'nb': './model/NaiveBayesian.pkl',  # 0.00241
        'svm': './model/SVM_sklearn.pkl',
        'lrs': './model/Logistic_sklearn.pkl',
        'nbs': './model/Bayes_sklearn.pkl'
    }

    # for cls_name in classifiers.keys():
    #     jieba.initialize()
    #     start = time.time()
    #     data = test_not_tag_data()
    #     cls = joblib.load(classifiers[cls_name])
    #     data = Pool().map(f, data)
    #     # data = [Counter(d) for d in map(jieba.cut, data)]
    #     print('end jieba', time.time() - start)
    #     cv = BowTransform.load_vsm()
    #     data = cv.transform(data)
    #     predicted = cls.predict(data)
    #     with open('./data/result.txt', 'w+') as f:
    #         for x in predicted:
    #             f.write(str(x) + '\n')
    #     print('end %s with time:' % cls, time.time() - start)

    start = time.time()
    data, target = test_data()
    # data = [Counter(d) for d in map(jieba.cut, data)]
    data = Pool(4).map(f, data)
    cv = BowTransform.load_vsm()
    data = cv.transform(data)
    for cls_name in classifiers.keys():
        cls = joblib.load(classifiers[cls_name])
        predicted = cls.predict(data)
        print(predicted.shape)  # 160 1
        # print(sum(predicted == test_target), len(test_target), np.mean(predicted == test_target))
        print("Classification report for classifier %s:\n%s\n" % (
            cls, metrics.classification_report(target, predicted, digits=4)))
        print("Confusion matrix:\n%s" % metrics.confusion_matrix(target, predicted))
        print(precision_recall_fscore_support(target, predicted))
        print('end', time.time() - start)
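The labeled-corpus format that read_train_data() above assumes: every line starts with a one-character 0/1 tag, and the remainder (after stripping leading whitespace) is the message text. A hypothetical line parsed the same way:

```python
line = '1\t恭喜您中奖了,请点击链接领取'.strip()  # invented example line
tag, text = line[0], line[1:].lstrip()
assert tag == '1' and text == '恭喜您中奖了,请点击链接领取'
```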
================================================
FILE: test_judge2.py
================================================
# -*- coding: utf-8 -*-
# @Date    : 2016/12/3
# @Author  : hrwhisper
import codecs
import os
import time
from collections import Counter

import jieba
from sklearn.externals import joblib

from model_manage import BowTransform


def test_not_tag_data():
    # return the whole file as one string; tokenization happens in a single jieba call
    with codecs.open('./data/不带标签短信.txt', 'r', 'utf-8') as f:
        data = f.read()
        return data


def f(x):
    return Counter(jieba.cut(x))


if __name__ == "__main__":
    classifiers = {
        'p': './model/Perceptron.pkl',  # alpha=0.1, max_iter=2000
        'lr': './model/LogisticRegression.pkl',  # alpha=0.2, max_iter=2000
        'nb': './model/NaiveBayesian.pkl',  # 0.00241
        'svm': './model/SVM_sklearn.pkl',
        'lrs': './model/Logistic_sklearn.pkl',
        'nbs': './model/Bayes_sklearn.pkl'
    }

    if os.name != 'nt':
        print('on linux, enable jieba parallel tokenization')
        jieba.enable_parallel(4)

    for cls_name in classifiers.keys():
        start = time.time()
        data = test_not_tag_data()
        cls = joblib.load(classifiers[cls_name])
        # data = Pool(4).map(f, data)
        # tokenize the whole file at once; jieba keeps '\n' as a token, so
        # splitting on it recovers one token string per message, and .split()
        # then counts tokens (Counter(x) alone would count characters)
        data = [Counter(x.split()) for x in ' '.join(jieba.cut(data)).split('\n')]
        print('end jieba', time.time() - start)
        cv = BowTransform.load_vsm()
        data = cv.transform(data)
        predicted = cls.predict(data)
        with open('./data/result.txt', 'w+') as f:
            for x in predicted:
                f.write(str(x) + '\n')
        print('end %s with time:' % cls, time.time() - start)
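Why the whole-file trick in test_judge2.py works: jieba emits the newline as a token of its own, so joining all tokens with spaces and then splitting on '\n' recovers one token string per original line, at the cost of only a single jieba call (which enable_parallel can spread across processes on Linux). A self-contained sketch with toy text:

```python
from collections import Counter

import jieba

blob = '今天天气不错\n恭喜您中奖了'  # two "messages" in one string
per_line = ' '.join(jieba.cut(blob)).split('\n')
bows = [Counter(s.split()) for s in per_line]  # token counts per message
print(bows)
```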
================================================
FILE: token_and_save_to_file.py
================================================
# -*- coding: utf-8 -*-
# @Date    : 2016/10/7
# @Author  : hrwhisper
from collections import Counter
from multiprocessing.pool import Pool

import jieba
import codecs
from sklearn.feature_extraction import DictVectorizer

from model_manage import TrainData, BowTransform


def read_train_data():
    file_path = './data/带标签短信.txt'
    target = []
    data = []
    with codecs.open(file_path, 'r', 'utf-8') as f:
        for line in f.read().split('\n')[:-1]:
            line = line.strip()
            target.append(line[0])  # first character is the 0/1 tag
            data.append(line[1:].lstrip())
    return data, target


def save_tokenization_result(data, target, file_path='./data/tags_token_results'):
    with codecs.open(file_path, 'w', 'utf-8') as f:
        for x in data:
            f.write(' '.join(x) + '\n')
    with open(file_path + '_tag', 'w') as f:
        for x in target:
            f.write(x + '\n')


if __name__ == '__main__':
    # seg_list = jieba.cut("我来到北京清华大学", cut_all=False)  # accurate mode
    # print("Default Mode: " + "/ ".join(seg_list))
    data, target = read_train_data()
    data = Pool().map(jieba.lcut, data)  # tokenize in parallel
    # data = jieba.lcut(data)
    save_tokenization_result(data, target)

    with codecs.open('./data/tags_token_results', 'r', 'utf-8') as f:
        data = [line.strip().split() for line in f.read().split('\n')]
        if not data[-1]:
            data.pop()
    t = [Counter(d) for d in data]  # each row is one SMS; values are term frequencies
    v = DictVectorizer()
    t = v.fit_transform(t)  # sparse matrix; each word gets a column index
    # persist the fitted vectorizer so judgeSpamMessage.py can reload it as vsm.pkl
    BowTransform.save_vsm(v)
    TrainData.save(t)
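What the DictVectorizer step at the end of this script produces, shown on toy input (a sketch, not repository code; get_feature_names matches the sklearn versions of this era): every distinct token becomes a column, and the cell values are the term frequencies from the Counters.

```python
from collections import Counter

from sklearn.feature_extraction import DictVectorizer

bows = [Counter(['天气', '天气', '不错']), Counter(['中奖', '链接'])]
v = DictVectorizer()
X = v.fit_transform(bows)     # scipy.sparse matrix of shape (2, 4)
print(v.get_feature_names())  # the learned vocabulary, in column order
print(X.toarray())            # one row per document, counts per token
```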