Repository: hrwhisper/SpamMessage
Branch: master
Commit: 754d3a74c626
Files: 23
Total size: 80.6 MB
Directory structure:
gitextract_y8xnqj4p/
├── .gitignore
├── classifier/
│ ├── LogisticRegression.py
│ ├── NaiveBayesian.py
│ ├── Perceptron.py
│ └── __init__.py
├── data/
│ ├── 不带标签短信.txt
│ └── 带标签短信.txt
├── judgeSpamMessage.py
├── model/
│ ├── Bayes_sklearn.pkl
│ ├── LogisticRegression.pkl
│ ├── Logistic_sklearn.pkl
│ ├── NaiveBayesian.pkl
│ ├── Perceptron.pkl
│ ├── SVM_sklearn.pkl
│ ├── __init__.py
│ └── vsm.pkl
├── model_manage.py
├── readme.md
├── test.py
├── test_jieba.py
├── test_judge.py
├── test_judge2.py
└── token_and_save_to_file.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
.idea
*.pyc
*~
*.swp
================================================
FILE: classifier/LogisticRegression.py
================================================
# -*- coding: utf-8 -*-
# @Date : 2016/10/16
# @Author : hrwhisper
import sklearn
import numpy as np
from sklearn.utils.extmath import safe_sparse_dot


class LogisticRegression(sklearn.base.BaseEstimator):
    def __init__(self, alpha=0.1, max_iter=100):  # 0.02 200 0.9927425
        self.alpha = alpha
        self.max_iter = max_iter
        self.weights = None

    def _sigmoid(self, x):
        # tanh form of the logistic function; equivalent to 1.0 / (1 + np.exp(-x))
        # but avoids overflow for large negative x
        return .5 * (1 + np.tanh(.5 * x))

    def fit(self, X, y):
        """
        :param X: sparse matrix (n_samples, n_features), the training feature vectors
        :param y: array-like (n_samples,), target vector relative to X
        :return: self
        """
        m, n = X.shape
        target = np.array(y).reshape((m, 1))
        self.weights = np.ones((n, 1))
        for _ in range(self.max_iter):
            h = self._sigmoid(safe_sparse_dot(X, self.weights))  # X * weights
            error = target - h
            self.weights += self.alpha * safe_sparse_dot(X.T, error)  # alpha * X.T * error
        return self

    def predict(self, X):
        return (self._sigmoid(safe_sparse_dot(X, self.weights)) > 0.5).ravel().astype('int')
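
# Minimal usage sketch (hypothetical data, not from this repo); the estimator
# follows sklearn's fit/predict convention with a scipy sparse X and 0/1 labels:
#   from scipy.sparse import csr_matrix
#   X = csr_matrix(np.array([[1., 0.], [0., 1.], [1., 1.]]))
#   clf = LogisticRegression(alpha=0.1, max_iter=100).fit(X, [0, 1, 1])
#   print(clf.predict(X))  # array of 0/1 predictions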
if __name__ == '__main__':
    # scratch test of numpy broadcasting, unrelated to the classifier
    a = [1, 2, 3]
    b = [4, 5, 6]
    a = np.array(a)
    b = np.array(b)
    print(a, b)
    a = a.reshape((-1, 1))
    print(a)
    print(a - b)
================================================
FILE: classifier/NaiveBayesian.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2016/11/29 22:04
# @Author : wqs
# @File : NaiveBayesian.py
import sklearn
import numpy as np
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.preprocessing import LabelBinarizer


class NaiveBayesian(sklearn.base.BaseEstimator):
    def __init__(self, alpha=1.0):
        self.alpha = alpha  # Laplace/Lidstone smoothing parameter
        self.classes = None
        self.class_log_prior = None
        self.feature_log_prob = None

    def fit(self, X, y):
        _, n = X.shape
        labelbin = LabelBinarizer()
        Y = labelbin.fit_transform(y)
        self.classes = labelbin.classes_
        # one column per class: membership indicators for y=0 and y=1
        Y = np.concatenate((1 - Y, Y), axis=1).astype(np.float64)
        class_count = np.zeros(2, dtype=np.float64)
        feature_count = np.zeros((2, n), dtype=np.float64)
        feature_count += safe_sparse_dot(Y.T, X)  # count frequency by Y.T * X
        class_count += Y.sum(axis=0)
        # smoothed log P(feature | class)
        smoothed_fc = feature_count + self.alpha
        smoothed_cc = smoothed_fc.sum(axis=1)
        self.feature_log_prob = np.log(smoothed_fc) - np.log(smoothed_cc.reshape(-1, 1))
        # self.class_log_prior = np.zeros(2) - np.log(2)
        self.class_log_prior = np.log(class_count / sum(class_count))
        return self

    def predict(self, X):
        # joint log-likelihood: log P(y) + sum_i x_i * log P(feature_i | y)
        jll = safe_sparse_dot(X, self.feature_log_prob.T) + self.class_log_prior
        return self.classes[np.argmax(jll, axis=1)]
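
# Sanity-check sketch (hypothetical data, not from this repo): with term-frequency
# features this mirrors sklearn's MultinomialNB, so the two should largely agree.
#   from sklearn.naive_bayes import MultinomialNB
#   rng = np.random.RandomState(0)
#   X = rng.randint(0, 5, size=(100, 20))
#   y = rng.randint(0, 2, size=100)
#   print((NaiveBayesian().fit(X, y).predict(X) ==
#          MultinomialNB().fit(X, y).predict(X)).mean())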
================================================
FILE: classifier/Perceptron.py
================================================
# -*- coding: utf-8 -*-
# @Date : 2016/11/1
# @Author : hrwhisper
import numpy as np
import sklearn
from sklearn.utils.extmath import safe_sparse_dot


class Perceptron(sklearn.base.BaseEstimator):
    def __init__(self, alpha=0.1, max_iter=100):
        self.threshold = 0.5
        self.alpha = alpha
        self.max_iter = max_iter
        self.weights = None

    def fit(self, X, y):
        """
        :param X: sparse matrix (n_samples, n_features), the training feature vectors
        :param y: array-like (n_samples,), target vector relative to X
        :return: self
        """
        m, n = X.shape
        target = np.array(y).reshape((m, 1))
        self.weights = np.ones((n, 1))
        for _ in range(self.max_iter):
            h = safe_sparse_dot(X, self.weights) > self.threshold
            error = target - h
            self.weights += self.alpha * safe_sparse_dot(X.T, error)
        return self

    def predict(self, X):
        return (safe_sparse_dot(X, self.weights) > self.threshold).ravel().astype('int')
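
# Note: this is a batch variant of the perceptron: each epoch updates the weights
# with the error summed over all samples rather than one sample at a time.
# Minimal usage sketch (hypothetical data, not from this repo):
#   from scipy.sparse import csr_matrix
#   X = csr_matrix(np.array([[1., 0.], [0., 1.], [1., 1.]]))
#   clf = Perceptron(alpha=0.1, max_iter=100).fit(X, [0, 1, 1])
#   print(clf.predict(X))  # array of 0/1 predictions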
================================================
FILE: classifier/__init__.py
================================================
# -*- coding: utf-8 -*-
# @Date : 2016/10/16
# @Author : hrwhisper
================================================
FILE: data/不带标签短信.txt
================================================
[File too large to display: 13.5 MB]
================================================
FILE: data/带标签短信.txt
================================================
[File too large to display: 55.5 MB]
================================================
FILE: judgeSpamMessage.py
================================================
# -*- coding: utf-8 -*-
# @Date : 2016/12/3
# @Author : hrwhisper
import codecs
import sys
import time
from collections import Counter
from optparse import OptionParser
from multiprocessing import Pool

import jieba
from sklearn.externals import joblib

from model_manage import BowTransform


def token(x):
    return Counter(jieba.lcut(x))


if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option('-c', '--classifier', dest="cls_name", type='string', default='p',
                      help="define the classifier you want to use: "
                           "p => Perceptron, "
                           "lr => LogisticRegression, "
                           "nb => NaiveBayesian, "
                           "svm => SVM(sklearn), "
                           "lrs => LogisticRegression(sklearn), "
                           "nbs => NaiveBayesian(sklearn)")
    parser.add_option('-i', '--input', dest="input_filename", type='string', default='./data/不带标签短信.txt',
                      help="input file name")
    parser.add_option('-o', '--output', dest="output_filename", type='string', default='./data/result.txt',
                      help="output file name")
    options, args = parser.parse_args()

    classifiers = {
        'p': './model/Perceptron.pkl',  # 0.1 2000
        'lr': './model/LogisticRegression.pkl',  # 0.2 2000
        'nb': './model/NaiveBayesian.pkl',  # 0.00241
        'svm': './model/SVM_sklearn.pkl',
        'lrs': './model/Logistic_sklearn.pkl',
        'nbs': './model/Bayes_sklearn.pkl'
    }

    cls_name = options.cls_name
    file_path = options.input_filename
    out_path = options.output_filename
    if cls_name not in classifiers:
        print('check your classifier name, you can use -h for help')
        sys.exit()

    start = time.time()
    jieba.initialize()
    try:
        with codecs.open(file_path, 'r', 'utf-8') as f:
            data = [line.strip() for line in f.read().split('\n')]
            if data[-1] == '':
                data.pop()
    except FileNotFoundError:
        print('Please check your input filename')
        sys.exit()

    # data = [Counter(d) for d in map(jieba.cut, data)]
    data = Pool().map(token, data)
    print('end token in {}\n'.format(time.time() - start))
    cv = BowTransform.load_vsm()
    data = cv.transform(data)
    print('end bow in {}\n'.format(time.time() - start))
    cls = joblib.load(classifiers[cls_name])
    predicted = cls.predict(data)
    # print(predicted)
    with open(out_path, 'w+') as f:
        for x in predicted:
            f.write(str(x) + '\n')
    print('task complete. total time: {}\n using {}'.format(time.time() - start, cls))
================================================
FILE: model/Bayes_sklearn.pkl
================================================
[File too large to display: 11.6 MB]
================================================
FILE: model/__init__.py
================================================
# -*- coding: utf-8 -*-
# @Date : 2016/11/30
# @Author : hrwhisper
================================================
FILE: model_manage.py
================================================
# -*- coding: utf-8 -*-
# @Date : 2016/12/1
# @Author : hrwhisper
from sklearn.externals import joblib


class BowTransform(object):
    """Persists the fitted bag-of-words vectorizer (a DictVectorizer)."""
    default_path = './model/vsm.pkl'

    @staticmethod
    def save_vsm(model, filename=None):
        joblib.dump(model, filename if filename else BowTransform.default_path)

    @staticmethod
    def load_vsm(filename=None):
        return joblib.load(filename if filename else BowTransform.default_path)


class TrainData(object):
    """Persists the vectorized training matrix; the labels are read from the
    tag file written by token_and_save_to_file.py."""
    default_path = './model/train_data.pkl'

    @staticmethod
    def save(model, filename=None):
        joblib.dump(model, filename if filename else TrainData.default_path)

    @staticmethod
    def load(filename=None):
        with open('./data/tags_token_results' + '_tag') as f:
            return (joblib.load(filename if filename else TrainData.default_path),
                    list(map(int, f.read().split('\n')[:-1])))
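
# Round-trip sketch (assumes the pickle files above already exist on disk):
#   cv = BowTransform.load_vsm()          # fitted DictVectorizer
#   X, y = TrainData.load()               # sparse matrix plus int labels
#   X_new = cv.transform([{'你好': 2}])   # vectorize new token counts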
================================================
FILE: readme.md
================================================
## Requirements ##

- jieba
  - pip install jieba
- sklearn
  - pip install -U scikit-learn
- python 3.5
## Usage

- `-c classifier_name` specifies which classifier to use:

| **Name** | **Classifier**               |
| -------- | ---------------------------- |
| **p**    | Perceptron                   |
| **lr**   | LogisticRegression           |
| **nb**   | NaiveBayesian                |
| **svm**  | SVM (sklearn)                |
| **lrs**  | LogisticRegression (sklearn) |
| **nbs**  | NaiveBayesian (sklearn)      |

- `-i filename` specifies the input SMS file (one message per line)
- `-o filename` specifies the output result file (results are 0 or 1, where 1 means spam; each line corresponds to one message in the input file)

For example:

```
python judgeSpamMessage.py -c svm -i ./data/不带标签短信.txt -o ./data/result.txt
```

The command above uses the SVM classifier to judge whether each message in ./data/不带标签短信.txt is spam, and writes the results to ./data/result.txt.
## Training

- First run token_and_save_to_file.py to tokenize the labeled messages and save the results.
- test.py contains cross-validation and other evaluation helpers; see the sketch below.
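
A minimal training sketch (not repo code: it assumes token_and_save_to_file.py has already been run, and takes its parameters from the commented examples in test.py):

```
from classifier.LogisticRegression import LogisticRegression
from test import test_one

# trains on the first 80% of the labeled data, reports metrics on the rest,
# and saves the fitted model where judgeSpamMessage.py expects it
test_one(LogisticRegression(alpha=0.2, max_iter=2000),
         train_cls=True, save_cls_path='./model/LogisticRegression.pkl')
```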
## Files

Directories:

| **Directory**  | **Purpose**                    |
| -------------- | ------------------------------ |
| **classifier** | source code of the classifiers |
| **data**       | data files                     |
| **model**      | saved models                   |

Files:

| **File**                         | **Purpose**                                                                           |
| -------------------------------- | ------------------------------------------------------------------------------------- |
| classifier/LogisticRegression.py | our logistic regression classifier                                                    |
| classifier/NaiveBayesian.py      | our naive Bayes classifier                                                            |
| classifier/Perceptron.py         | our perceptron classifier                                                             |
| data/tags_token_results          | tokenization results of the labeled messages, generated by token_and_save_to_file.py  |
| data/tags_token_results_tag      | labels of the labeled messages                                                        |
| data/不带标签短信.txt            | unlabeled SMS dataset                                                                 |
| data/带标签短信.txt              | labeled SMS dataset                                                                   |
| model/Bayes_sklearn.pkl          | saved sklearn naive Bayes model                                                       |
| model/Logistic_sklearn.pkl       | saved sklearn logistic regression model                                               |
| model/LogisticRegression.pkl     | saved model of our logistic regression classifier                                     |
| model/NaiveBayesian.pkl          | saved model of our naive Bayes classifier                                             |
| model/Perceptron.pkl             | saved model of our perceptron classifier                                              |
| model/SVM_sklearn.pkl            | saved sklearn SVM model                                                               |
| model/train_data.pkl             | bag-of-words representation of the labeled messages                                   |
| model/vsm.pkl                    | fitted vectorizer used to represent new documents as bag-of-words vectors             |
| judgeSpamMessage.py              | judges whether input messages are spam                                                |
| model_manage.py                  | loads and saves models                                                                |
| readme.md                        | this document                                                                         |
| test.py                          | test / evaluation code                                                                |
| token_and_save_to_file.py        | tokenizes the labeled messages and saves the results for training                     |
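
For programmatic use, a quick sketch that mirrors what judgeSpamMessage.py does (the message strings are hypothetical; assumes model/vsm.pkl and the chosen classifier pickle exist):

```
from collections import Counter
import jieba
from sklearn.externals import joblib
from model_manage import BowTransform

messages = ['恭喜您中奖了,请点击链接领取', '今晚一起吃饭吗']  # hypothetical inputs
bow = [Counter(jieba.lcut(m)) for m in messages]  # term frequencies per message
X = BowTransform.load_vsm().transform(bow)        # map into the trained BOW space
clf = joblib.load('./model/SVM_sklearn.pkl')
print(clf.predict(X))                             # 1 = spam, 0 = ham
```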
================================================
FILE: test.py
================================================
# -*- coding: utf-8 -*-
# @Date : 2016/10/16
# @Author : hrwhisper
import codecs
import datetime
from collections import Counter

import numpy as np
from sklearn.externals import joblib
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction import DictVectorizer
from sklearn import metrics, naive_bayes, svm, linear_model

from classifier.LogisticRegression import LogisticRegression
from classifier.NaiveBayesian import NaiveBayesian
from classifier.Perceptron import Perceptron
from model_manage import TrainData


def read_train_data():
    file_path = './data/tags_token_results'
    with codecs.open(file_path, 'r', 'utf-8') as f:
        data = [line.strip().split() for line in f.read().split('\n')]
    with open(file_path + '_tag') as f:
        return data[:-1], list(map(int, f.read().split('\n')[:-1]))


def _test(classifier, test_data, test_target):
    predicted = classifier.predict(test_data)
    print(predicted.shape)
    # print(sum(predicted == test_target), len(test_target), np.mean(predicted == test_target))
    print("Classification report for classifier %s:\n%s\n" % (
        classifier, metrics.classification_report(test_target, predicted, digits=4)))
    print("Confusion matrix:\n%s" % metrics.confusion_matrix(test_target, predicted))
    print(precision_recall_fscore_support(test_target, predicted))


def test_one(cls, use_save_data=True, train_cls=False, save_cls_path=None):
    if use_save_data:
        data, target = TrainData.load()
    else:
        # vectorize the raw token lists and cache the result
        data, target = read_train_data()
        # data_len = int(len(data) * 0.001)
        # data, target = data[:data_len], target[:data_len]
        data = [Counter(d) for d in data]  # one message per row; values are term frequencies
        # print(data[0])
        v = DictVectorizer()
        print('fit transform')
        data = v.fit_transform(data)  # sparse matrix; each term gets an index
        TrainData.save(data)
        # print(data[0])
    data_len = data.shape[0]
    print('data', data.shape[1])
    end = int(0.8 * data_len)
    train_data, train_target = data[:end], target[:end]
    test_data, test_target = data[end:], target[end:]
    if train_cls:
        print('train classifier....')
        cls = cls.fit(train_data, train_target)
        print('train classifier complete')
    _test(cls, test_data, test_target)
    if save_cls_path:
        joblib.dump(cls, save_cls_path)


def cross_validation():
    data, target = TrainData.load()
    classifiers = {
        'Logistic by yhr': LogisticRegression(alpha=0.2, max_iter=2000),
        'Perceptron by yhr': Perceptron(alpha=0.1, max_iter=2000),
        'Bayesian by wqs': NaiveBayesian(),
        'Bernoulli Bayes from sklearn': naive_bayes.BernoulliNB(),
        'svm from sklearn': svm.LinearSVC(),
        'Logistic from sklearn': linear_model.LogisticRegression(),
        # 'decision tree': tree.DecisionTreeClassifier(),
    }
    for name, classifier in classifiers.items():
        this_scores = cross_val_score(classifier, data, target, cv=5, scoring='accuracy')
        print(name)
        print(this_scores)
        print(np.mean(this_scores))
        print(' ------------------------ \n\n')


def test_parameter():
    data, target = TrainData.load()
    max_score = 0
    max_alpha = max_iter = 0
    print('Perceptron')
    start = datetime.datetime.now()
    for alpha in [0.01, 0.1, 0.2]:  # 0.3 100 0.99133
        for iter_ in [100, 2000]:
            cls = Perceptron(alpha=alpha, max_iter=iter_)
            this_scores = cross_val_score(cls, data, target, cv=5, scoring='accuracy')
            print(this_scores)
            cur = np.mean(this_scores)
            print(alpha, iter_, cur)
            print(' ------------------------ \n\n')
            if cur > max_score:
                max_score = cur
                max_alpha, max_iter = alpha, iter_
            print('current_max: ', max_score, max_alpha, max_iter)
    print(datetime.datetime.now() - start)


if __name__ == '__main__':
    # start = datetime.datetime.now()
    # test_one(LogisticRegression(alpha=0.1, max_iter=2000),
    #          train_cls=True, save_cls_path='./model/LogisticRegression.pkl')
    # print(datetime.datetime.now() - start)
    #
    # classifiers = {
    #     # 'Logistic by yhr': LogisticRegression(alpha=0.01, max_iter=200),
    #     # 'Perceptron by yhr': Perceptron(),
    #     # 'Bayesian by wqs': NaiveBayesian(),
    #     # 'Bayes_sklearn': naive_bayes.BernoulliNB(),
    #     # 'SVM_sklearn': svm.LinearSVC(),
    #     # 'Logistic_sklearn': linear_model.LogisticRegression(),
    #     # 'decision tree': tree.DecisionTreeClassifier(),
    # }
    # for name, cls in classifiers.items():
    #     test_one(cls, train_cls=True, save_cls_path='./model/' + name + '.pkl')
    #
    # test_one(LogisticRegression(alpha=0.2, max_iter=2000), train_cls=True,
    #          save_cls_path='./model/LogisticRegression.pkl')
    # test_one(Perceptron(alpha=0.1, max_iter=2000), train_cls=True, save_cls_path='./model/Perceptron.pkl')

    classifiers = {
        'p': './model/Perceptron.pkl',  # 0.1 2000
        'lr': './model/LogisticRegression.pkl',  # 0.2 2000
        'nb': './model/NaiveBayesian.pkl',  # 0.00241
        'svm': './model/SVM_sklearn.pkl',
        'lrs': './model/Logistic_sklearn.pkl',
        'nbs': './model/Bayes_sklearn.pkl'
    }
    for _path in classifiers.values():
        cls = joblib.load(_path)
        test_one(cls)
    # cross_validation()
    # test_one(LogisticRegression(max_iter=100), train_cls=True)
    # test_parameter()
================================================
FILE: test_jieba.py
================================================
# -*- coding: utf-8 -*-
# @Date : 2016/12/5
# @Author : hrwhisper
import codecs
import datetime
from collections import Counter
from multiprocessing import Pool

import jieba


def test_not_tag_data():
    with codecs.open('./data/不带标签短信.txt', 'r', 'utf-8') as f:
        data = [line.strip() for line in f.read().split('\n')]
    if data[-1] == '':
        data.pop()
    return data


def f(x):
    return Counter(jieba.cut(x))


if __name__ == "__main__":
    # jieba.enable_parallel(2)
    start = datetime.datetime.now()
    data = test_not_tag_data()
    print('read data', datetime.datetime.now() - start)
    # data = [Counter(d) for d in map(jieba.cut, data)]
    res = Pool(4).map(f, data)
    # print(res)
    print('jieba ', datetime.datetime.now() - start)
    # cv = BowTransform.load_vsm()
    # data = cv.transform(data)
    # print('transform', datetime.datetime.now() - start)
    # cls = joblib.load(classifiers[cls_name])
    # print('load', datetime.datetime.now() - start)
    # predicted = cls.predict(data)
    #
    # print(predicted)
    # with open('./data/result.txt', 'w+') as f:
    #     for x in predicted:
    #         f.write(str(x) + '\n')
    # print(datetime.datetime.now() - start, ' %s' % cls)
================================================
FILE: test_judge.py
================================================
# -*- coding: utf-8 -*-
# @Date : 2016/12/3
# @Author : hrwhisper
import codecs
import time
from collections import Counter
from multiprocessing import Pool

import jieba
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.metrics import precision_recall_fscore_support

from model_manage import BowTransform
from test_jieba import test_not_tag_data


def test_data():
    def read_train_data():
        file_path = './data/带标签短信.txt'
        target = []
        data = []
        with codecs.open(file_path, 'r', 'utf-8') as f:
            for line in f.read().split('\n')[:-1]:
                line = line.strip()
                target.append(line[0])
                data.append(line[1:].lstrip())
        return data, target

    data, target = read_train_data()
    n = len(data) - int(0.8 * len(data))  # the last 20% is the held-out split
    return data[-n:], list(map(int, target[-n:]))


def f(x):
    return Counter(jieba.cut(x))


if __name__ == "__main__":
    classifiers = {
        'p': './model/Perceptron.pkl',  # 0.1 2000
        'lr': './model/LogisticRegression.pkl',  # 0.2 2000
        'nb': './model/NaiveBayesian.pkl',  # 0.00241
        'svm': './model/SVM_sklearn.pkl',
        'lrs': './model/Logistic_sklearn.pkl',
        'nbs': './model/Bayes_sklearn.pkl'
    }
    # for cls_name in classifiers.keys():
    #     jieba.initialize()
    #     start = time.time()
    #     data = test_not_tag_data()
    #     cls = joblib.load(classifiers[cls_name])
    #     data = Pool().map(f, data)
    #     # data = [Counter(d) for d in map(jieba.cut, data)]
    #     print('end jieba', time.time() - start)
    #     cv = BowTransform.load_vsm()
    #     data = cv.transform(data)
    #     predicted = cls.predict(data)
    #     with open('./data/result.txt', 'w+') as f:
    #         for x in predicted:
    #             f.write(str(x) + '\n')
    #     print('end %s with time:' % cls, time.time() - start)

    start = time.time()
    data, target = test_data()
    # data = [Counter(d) for d in map(jieba.cut, data)]
    data = Pool(4).map(f, data)
    cv = BowTransform.load_vsm()
    data = cv.transform(data)
    for cls_name in classifiers.keys():
        cls = joblib.load(classifiers[cls_name])
        predicted = cls.predict(data)
        print(predicted.shape)
        # print(sum(predicted == test_target), len(test_target), np.mean(predicted == test_target))
        print("Classification report for classifier %s:\n%s\n" % (
            cls, metrics.classification_report(target, predicted, digits=4)))
        print("Confusion matrix:\n%s" % metrics.confusion_matrix(target, predicted))
        print(precision_recall_fscore_support(target, predicted))
    print('end', time.time() - start)
================================================
FILE: test_judge2.py
================================================
# -*- coding: utf-8 -*-
# @Date : 2016/12/3
# @Author : hrwhisper
import codecs
import os
import time
from collections import Counter
from multiprocessing import Pool

import jieba
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.metrics import precision_recall_fscore_support

from model_manage import BowTransform


def test_not_tag_data():
    with codecs.open('./data/不带标签短信.txt', 'r', 'utf-8') as f:
        data = f.read()
    return data
    # if data[-1] == '':
    #     data.pop()
    # return data


def f(x):
    return Counter(jieba.cut(x))


if __name__ == "__main__":
    classifiers = {
        'p': './model/Perceptron.pkl',  # 0.1 2000
        'lr': './model/LogisticRegression.pkl',  # 0.2 2000
        'nb': './model/NaiveBayesian.pkl',  # 0.00241
        'svm': './model/SVM_sklearn.pkl',
        'lrs': './model/Logistic_sklearn.pkl',
        'nbs': './model/Bayes_sklearn.pkl'
    }
    if os.name != 'nt':
        print('not on Windows, enabling jieba parallel tokenization')
        jieba.enable_parallel(4)
    for cls_name in classifiers.keys():
        start = time.time()
        data = test_not_tag_data()
        cls = joblib.load(classifiers[cls_name])
        # data = Pool(4).map(f, data)
        # tokenize the whole file at once, then split back into per-line token counts
        data = [Counter(x.split()) for x in ' '.join(jieba.cut(data)).split('\n')]
        print('end jieba', time.time() - start)
        cv = BowTransform.load_vsm()
        data = cv.transform(data)
        predicted = cls.predict(data)
        with open('./data/result.txt', 'w+') as f:
            for x in predicted:
                f.write(str(x) + '\n')
        print('end %s with time:' % cls, time.time() - start)
    # data, target = test_data()
    # # data = [Counter(d) for d in map(jieba.cut, data)]
    # data = Pool(4).map(f, data)
    # cv = BowTransform.load_vsm()
    # data = cv.transform(data)
    # for cls_name in classifiers.keys():
    #     cls = joblib.load(classifiers[cls_name])
    #     predicted = cls.predict(data)
    #     print(predicted.shape)
    #     # print(sum(predicted == test_target), len(test_target), np.mean(predicted == test_target))
    #     print("Classification report for classifier %s:\n%s\n" % (
    #         cls, metrics.classification_report(target, predicted, digits=4)))
    #     print("Confusion matrix:\n%s" % metrics.confusion_matrix(target, predicted))
    #     print(precision_recall_fscore_support(target, predicted))
    #     print('end', time.time() - start)
================================================
FILE: token_and_save_to_file.py
================================================
# -*- coding: utf-8 -*-
# @Date : 2016/10/7
# @Author : hrwhisper
import codecs
from collections import Counter
from multiprocessing.pool import Pool

import jieba
from sklearn.feature_extraction import DictVectorizer

from model_manage import TrainData


def read_train_data():
    file_path = './data/带标签短信.txt'
    target = []
    data = []
    with codecs.open(file_path, 'r', 'utf-8') as f:
        for line in f.read().split('\n')[:-1]:
            line = line.strip()
            target.append(line[0])  # the first character of each line is the 0/1 label
            data.append(line[1:].lstrip())
    return data, target


def save_tokenlization_result(data, target, file_path='./data/tags_token_results'):
    with codecs.open(file_path, 'w', 'utf-8') as f:
        for x in data:
            f.write(' '.join(x) + '\n')
    with open(file_path + '_tag', 'w') as f:
        for x in target:
            f.write(x + '\n')


if __name__ == '__main__':
    # seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
    # print("Default Mode: " + "/ ".join(seg_list))  # accurate mode
    data, target = read_train_data()
    data = Pool().map(jieba.lcut, data)
    # data = jieba.lcut(data)
    save_tokenlization_result(data, target)
    with codecs.open('./data/tags_token_results', 'r', 'utf-8') as f:
        data = [line.strip().split() for line in f.read().split('\n')]
        if not data[-1]:
            data.pop()
    t = [Counter(d) for d in data]  # one message per row; values are term frequencies
    v = DictVectorizer()
    t = v.fit_transform(t)  # sparse matrix; each term gets an index
    TrainData.save(t)
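    # Note (assumption, not in the original file): judgeSpamMessage.py loads the
    # fitted vectorizer from ./model/vsm.pkl, so something like
    # BowTransform.save_vsm(v) is presumably how that shipped pickle was produced.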
SYMBOL INDEX (33 symbols across 10 files)
FILE: classifier/LogisticRegression.py
class LogisticRegression (line 10) | class LogisticRegression(sklearn.base.BaseEstimator):
method __init__ (line 11) | def __init__(self, alpha=0.1, max_iter=100): # 0.02 200 0.9927425
method _sigmoid (line 16) | def _sigmoid(self, x):
method fit (line 19) | def fit(self, X, y):
method predict (line 34) | def predict(self, X):
FILE: classifier/NaiveBayesian.py
class NaiveBayesian (line 14) | class NaiveBayesian(sklearn.base.BaseEstimator):
method __init__ (line 15) | def __init__(self, alpha=1.0):
method fit (line 21) | def fit(self, X, y):
method predict (line 41) | def predict(self, X):
FILE: classifier/Perceptron.py
class Perceptron (line 10) | class Perceptron(sklearn.base.BaseEstimator):
method __init__ (line 11) | def __init__(self, alpha=0.1, max_iter=100):
method fit (line 17) | def fit(self, X, y):
method predict (line 32) | def predict(self, X):
FILE: judgeSpamMessage.py
function token (line 15) | def token(x):
FILE: model_manage.py
class BowTransform (line 7) | class BowTransform(object):
method save_vsm (line 11) | def save_vsm(model, filename=None):
method load_vsm (line 15) | def load_vsm(filename=None):
class TrainData (line 19) | class TrainData(object):
method save (line 23) | def save(model, filename=None):
method load (line 27) | def load(filename=None):
FILE: test.py
function read_train_data (line 20) | def read_train_data():
function _test (line 29) | def _test(classifier, test_data, test_target):
function test_one (line 40) | def test_one(cls, use_save_data=True, train_cls=False, save_cls_path=None):
function cross_validation (line 73) | def cross_validation():
function test_parameter (line 94) | def test_parameter():
FILE: test_jieba.py
function test_not_tag_data (line 11) | def test_not_tag_data():
function f (line 20) | def f(x):
FILE: test_judge.py
function test_data (line 18) | def test_data():
function f (line 35) | def f(x):
FILE: test_judge2.py
function test_not_tag_data (line 17) | def test_not_tag_data():
function f (line 26) | def f(x):
FILE: token_and_save_to_file.py
function read_train_data (line 14) | def read_train_data():
function save_tokenlization_result (line 26) | def save_tokenlization_result(data, target, file_path='./data/tags_token...