Repository: goodboyv/Sklearn_Mochine_leanring Branch: master Commit: c25a5ad49001 Files: 12 Total size: 18.3 KB Directory structure: gitextract_kub6qg91/ ├── KNN.py ├── README.md ├── SVM.py ├── sklearn数据集获取.py ├── 决策树.py ├── 划分数据集.py ├── 岭回归.py ├── 朴素贝叶斯.py ├── 特征工程.py ├── 线性回归.py ├── 逻辑回归.py └── 随机森林.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: KNN.py ================================================ """ KNN算法也叫做K近邻算法,它的主要思想是: 计算测试样本与训练集中各个样本之间的距离,选择与测试样本距离最近的K个,然后统计这K个样本中出现标记最多的那个, 将这个标记作为测试样本的标记 """ from sklearn.datasets import load_iris import pandas as pd from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from sklearn.neighbors import KNeighborsClassifier def knn(): # 加载数据集 iris = load_iris() feature = iris.data target = iris.target print("特征名称:", iris.feature_names) print("目标标记名:", iris.target_names) print("特征:", feature.shape) print("标记:", target.shape) # 特征预处理 # 判断有没有缺失值 print(pd.isnull(feature).any()) # 标准化 std = StandardScaler() feature = std.fit_transform(feature) # 划分数据集 x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25) x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25) print("训练集:", x_train.shape, y_train.shape) print("验证集:", x_val.shape, y_val.shape) print("测试集:", x_test.shape, y_test.shape) # 建立KNN模型 kn = KNeighborsClassifier(n_neighbors=5) # 训练 kn.fit(x_train, y_train) # 验证 score_val = kn.score(x_val, y_val) print("在验证集上的得分:", score_val) # 测试 score_test = kn.score(x_test, y_test) print("在测试集上的得分:", score_test) predict = kn.predict(x_test) print("在测试集上的预测结果:", predict) if __name__ == "__main__": knn() ================================================ FILE: README.md ================================================ # sklearn_mochine_learning 使用sklearn实现机器学习的算法,包括了线性回归、岭回归、逻辑回归、朴素贝叶斯、决策树、随机森林 ================================================ FILE: SVM.py ================================================ """ 支持向量机:通过寻找划分超平面来进行分类的算法,这个划分超平面只由支持向量有关,与其他样本无关 """ import pandas as pd from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split from sklearn.svm import SVC from sklearn.metrics import classification_report def svm(): # 加载数据集 iris = load_iris() # 取出特征值和目标值 feature = iris.data target = iris.target print("特征:", iris.feature_names) print("目标:", iris.target_names) # 划分数据集 x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25) x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25) print("训练集:", x_train.shape, y_train.shape) print("验证集:", x_val.shape, y_val.shape) print("测试集:", x_test.shape, y_test.shape) # 建立模型 clf = SVC(kernel="linear", C=0.4) # 训练 clf.fit(x_train, y_train) # 验证 score_val = clf.score(x_val, y_val) print("在验证集上的得分:", score_val) # 测试 score_test = clf.score(x_test, y_test) print("在测试集上的得分:", score_test) # 预测 predict = clf.predict(x_test) print("预测结果:", predict) # 打印召回率、F1 print(classification_report(y_test, predict, labels=[0, 1, 2], target_names=iris.target_names)) if __name__ == "__main__": svm() ================================================ FILE: sklearn数据集获取.py ================================================ from sklearn.datasets import load_iris, load_boston, fetch_20newsgroups # 加载鸢尾花数据集(分类数据集) def iris_datasets(): iris = load_iris() feature = iris.data # 获取特征值 target = iris.target # 获取目标值 feature_names = iris.feature_names # 获取特征名称 target_names = iris.target_names # 获取目标值名称 print("特征值名称", feature_names) print("特征值", feature) print("目标值名称", target_names) print("目标值", target) print("数据集的描述信息", iris.DESCR) # 获取波士顿房价数据集 def boston_datasets(): boston = load_boston() feature = boston.data target = boston.target feature_names = boston.feature_names #target_names = boston.target_names print("特征值名称", feature_names) print("特征值", feature) #print("目标值名称", target_names) print("目标值", target) print("数据集的描述信息", boston.DESCR) # 获取20newsgroups数据集 def newsgroups(): news = fetch_20newsgroups() feature = news.data target = news.target #feature_names = news.feature_names target_names = news.target_names #print("特征值名称", feature_names) print("特征值", feature) print("目标值名称", target_names) print("目标值", target) print("数据集的描述信息", news.DESCR) if __name__ == "__main__": #iris_datasets() #boston_datasets() newsgroups() ================================================ FILE: 决策树.py ================================================ """ 通过树形结构来实现分类的一种算法,关键在于如何选择最优属性 通常用三种方式:信息增益(ID3)、增益率(C4.5)、基尼系数(CART) """ import pandas as pd import numpy as np from sklearn.feature_extraction import DictVectorizer from sklearn.tree import DecisionTreeClassifier, export_graphviz from sklearn.model_selection import train_test_split def de_tree(): # 加载数据 titan = pd.read_csv("E:/Desktop/机器学习_新/数据集/泰坦尼克数据集/train.csv") # print(titan.shape) # pd.set_option("display.max_columns", 100) # 把dataframe中省略的部分显示出来 # print(titan.head(5)) # 构造特征值和目标值 feature = titan[["Pclass", "Age", "Fare", "Sex"]] target = titan["Survived"] # 特征预处理 # 查看有没有缺失值 print(pd.isnull(feature).any()) # 填充缺失值 Age = feature.pop("Age") # 取出,意思是取出来之后删除原来的 Age = Age.fillna(Age.mean()) # print(feature) # feature.drop("Age", axis=1, inplace=True) # 删除一列 feature.insert(0, "Age", Age) # print(pd.isnull(feature).any()) # 字典特征抽取 dv = DictVectorizer() feature = dv.fit_transform(feature.to_dict(orient="records")) feature = feature.toarray() print(feature) print(dv.get_feature_names()) # 划分数据集 x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25) x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25) print("训练集:", x_train.shape, y_train.shape) print("验证集:", x_val.shape, y_val.shape) print("测试集:", x_test.shape, y_test.shape) # 建立模型 tree = DecisionTreeClassifier(max_depth=5) # 训练 tree.fit(x_train, y_train) # 验证 score = tree.score(x_val, y_val) print("在验证集上的得分:", score) # 预测 score_test = tree.score(x_test, y_test) print("在测试集上的得分:", score_test) predict = tree.predict(x_test) print("测试结果:", predict) # 保存树结构 export_graphviz(tree, out_file="E:/Desktop/开题报告/tree.dot", feature_names=['Age', 'Fare', 'Pclass', 'Sex=female', 'Sex=male']) # 将保存的dot文件转成png文件,查看树结构 # dot -Tpng tree.dot -o tree.png if __name__ == "__main__": de_tree() ================================================ FILE: 划分数据集.py ================================================ from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split # 将数据集划分成训练集、验证集、测试集 def split_datasets(): iris = load_iris() feature = iris.data target = iris.target print("特征值:", type(feature)) print("目标值:", type(target)) x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25) # 传的参数必须是numpy.ndarray或者pandas.dataframes,但是必须是传入特征值和目标值,不能一起传入 x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25) print("训练集:", x_train.shape, y_train.shape) print("验证集:", x_val.shape, y_val.shape) print("测试集:", x_test.shape, y_test.shape) if __name__ == "__main__": split_datasets() ================================================ FILE: 岭回归.py ================================================ """ 带有正则化的线性回归 """ import pandas as pd from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.linear_model import Ridge from sklearn.metrics import mean_squared_error def ridge(): # 加载数据集 boston = load_boston() feature = boston.data target = boston.target # 划分数据集 x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25) # 标准化 std_x = StandardScaler() x_train = std_x.fit_transform(x_train) x_test = std_x.transform(x_test) std_y = StandardScaler() y_train = std_y.fit_transform(y_train.reshape(-1, 1)) y_test = std_y.transform(y_test.reshape(-1, 1)) # 建立模型 rd = Ridge(alpha=1.0) rd.fit(x_train, y_train) # 预测结果 y_predict = rd.predict(x_test) y_predict_inverse = std_y.inverse_transform(y_predict) print(y_predict_inverse) # 均方误差 error = mean_squared_error(y_test, y_predict) print("均分误差:", error) if __name__ == "__main__": ridge() ================================================ FILE: 朴素贝叶斯.py ================================================ """ 贝叶斯分类器的主要思想: 已知一个含有标记的数据集,此时来了一个测试样本,我们知道测试样本的特征,需要预测标记, 若我们能够求出这个样本属于各个类别的概率,那么从中选择概率最大的就可以了,那么就是求P(c|x), 先用全概率公式P(c|x)=P(x,c)/P(x),再用条件概率公式P(c|x)=P(x,c)/P(x)=P(x|c)*P(c)/P(x), 对于同一个测试样本P(x)都是相同的,因此分母不是我们需要关心的。P(c)很好求,就是在数据集当中某个类别出现的概率 最难求的就是P(x|c),朴素贝叶斯的思想就是假设各个特征之间相互独立,那么P(x|c)就等于P(x1|c)*P(x2|c)...,这样就可以求解了 """ import numpy as np from sklearn.datasets import fetch_20newsgroups from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.decomposition import PCA from sklearn.model_selection import train_test_split from sklearn.naive_bayes import MultinomialNB def bayes(): # 加载数据集(文本数据集) news = fetch_20newsgroups() feature = news.data target = news.target print("特征:", len(feature)) print("目标:", len(target)) print("目标值的含义:", news.target_names) # 文本特征抽取 tf = TfidfVectorizer() feature = tf.fit_transform(feature) feature = feature.toarray() print(feature.shape) print(feature.dtype) feature = feature.astype(np.uint8) print(feature.dtype) # 特征降维 # pca pca = PCA(n_components=100) feature = pca.fit_transform(feature) print(feature.shape) # 划分数据集 x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25) x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25) print("训练集:", x_train.shape) print("验证集:", x_val.shape) print("测试集:", x_test.shape) # 建立贝叶斯模型 # alapha是拉普拉斯平滑系数,防止计算的概率是0 mlt = MultinomialNB(alpha=1.0) # 训练 mlt.fit(x_train, y_train) # 验证 score_val = mlt.score(x_val, y_val) print("在验证集上的得分:", score_val) # 预测 score_test = mlt.score(x_test, y_test) print("在测试集上的得分:", score_test) predict = mlt.predict(x_test) print("测试结果:", predict) if __name__ == "__main__": bayes() ================================================ FILE: 特征工程.py ================================================ """ 特征工程: 特征抽取: 字典特征抽取 文本特征抽取 特征预处理: 归一化 标准化 缺失值处理 特征降维: 过滤式 主成分分析 """ import numpy as np import pandas as pd # ndarray与dataframe之间的相互转换 def nd_da(): data = [[1, 2], [4, 5], [7, 8]] print(type(data)) print("列表:", data) # 列表转ndarray nd_data = np.array(data) print(type(nd_data)) print("ndarray:", nd_data) # ndarray转dataframe da_data = pd.DataFrame(nd_data) print(type(da_data)) da_data.columns = ["a", "b"] da_data.index = ["A", "B", "C"] print("DataFrame:", da_data) # dataframe转ndarray np_data = np.array(da_data) print(type(np_data)) print(np_data) # 字典特征抽取:针对特征值是非数值型的特征,进行one_hot编码 from sklearn.feature_extraction import DictVectorizer def dictvec(): data = [["北京", 12], ["上海", 50], ["深圳", 100], ["宣城", 1000]] data = np.array(data) # 将列表转成numpy.ndarray data = pd.DataFrame(data) # 将ndarray转成dataframe data.columns = ["city", "people"] print("dataframe:", data) print(type(data)) dict = DictVectorizer(sparse=False) result = dict.fit_transform(data.to_dict(orient="records")) # 必须这样传,将dataframe转成字典 print("字典特征抽取之后的结果:", result) print(dict.get_feature_names()) # 文本特征抽取:针对特征值是文本的情况 from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfVectorizer def text(): data = ["life is short, I like python", "life is too long, I dislike python"] # 统计次数 cv = CountVectorizer() result = cv.fit_transform(data) # 默认返回稀疏矩阵 result = result.toarray() # 将稀疏矩阵转成密集矩阵 print(result) print(type(result)) print(cv.get_feature_names()) # 统计重要性 tf = TfidfVectorizer() result_tf = tf.fit_transform(data) result_tf = result_tf.toarray() print(result_tf) print(type(result_tf)) print(tf.get_feature_names()) # 特征预处理 # 缺失值处理 def missing_value(): data1 = pd.DataFrame({"一班": [90, 80, 66, 75, 99, 55, 76, 78, 98, None, 90], "二班": [75, 98, 100, None, 77, 45, None, 66, 56, 80, 57], "三班": [45, 89, 77, 67, 65, 100, None, 75, 64, 88, 99], "四班": [45, 89, 77, 67, 65, 100, 45, 75, 64, 88, 99]}) data2 = pd.DataFrame({"一班": [90, 80, 66, 75, 99, 55, 76, 78, 98, np.nan, 90], "二班": [75, 98, 100, np.nan, 77, 45, np.nan, 66, 56, 80, 57], "三班": [45, 89, 77, 67, 65, 100, np.nan, 75, 64, 88, 99], "四班": [45, 89, 77, 67, 65, 100, 45, 75, 64, 88, 99]}) data3 = pd.DataFrame({"一班": [90, 80, 66, 75, 99, 55, 76, 78, 98, "null", 90], "二班": [75, 98, 100, "null", 77, 45, "null", 66, 56, 80, 57], "三班": [45, 89, 77, 67, 65, 100, "null", 75, 64, 88, 99], "四班": [45, 89, 77, 67, 65, 100, 45, 75, 64, 88, 99]}) # 缺失值是None、np.nan都是可以识别出来的,打印的时候显示NaN,但是其他类型的缺失值是无法识别的 # print(data1) # print(data2) # print(data3) # 如果遇到的是data3这种类型的缺失值,那么首先要用np.nan替换掉缺失值 data3.replace("null", np.nan, inplace=True) # 判断有没有缺失值 print(pd.isnull(data3).any()) # 处理缺失值 # 删除 # data3.dropna(axis=0, how="any", inplace=True) # print(data3) # 填充 data3.fillna(data3.mean(), inplace=True) print(data3) # 归一化 from sklearn.preprocessing import MinMaxScaler def min_max(): data = [[1, 2, 3],[4, 5, 6],[7, 8, 9]] mm = MinMaxScaler(feature_range=(0, 1)) result = mm.fit_transform(data) print(result) # 标准化 from sklearn.preprocessing import StandardScaler def standard(): data = [[1, 2, 3],[4, 5, 6],[7, 8, 9]] std = StandardScaler() result = std.fit_transform(data) print(result) # 特征抽取:过滤式 from sklearn.feature_selection import VarianceThreshold def var(): data = [[1, 2, 3],[1, 4, 5],[1, 7, 8]] v = VarianceThreshold(threshold=0) result = v.fit_transform(data) print(result) # 特征降维:PCA from sklearn.decomposition import PCA def pca(): data = [[1, 2, 4, 5], [4, 5, 4, 2], [2, 4, 1, 4]] p = PCA(n_components=0.95) result = p.fit_transform(data) print(result) if __name__ == "__main__": nd_da() dictvec() text() missing_value() min_max() standard() var() pca() ================================================ FILE: 线性回归.py ================================================ """ 线性回归:通过构建线性模型来进行预测的一种回归算法 """ import pandas as pd from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LinearRegression, SGDRegressor from sklearn.metrics import mean_squared_error def linear(): # 加载数据集 boston = load_boston() feature = boston.data target = boston.target # 划分数据集 x_train, x_test, y_train, y_test_ori = train_test_split(feature, target, test_size=0.25) # 标准化 std1 = StandardScaler() x_train = std1.fit_transform(x_train) x_test = std1.transform(x_test) std2 = StandardScaler() y_train = std2.fit_transform(y_train.reshape(-1, 1)) # 必须传二维 y_test = std2.transform(y_test_ori.reshape(-1, 1)) # 正规方程的解法 # 建立模型 lr = LinearRegression() # 通过公式求解 lr.fit(x_train, y_train) # 预测结果 y_predict = lr.predict(x_test) # 这个结果是标准化之后的结果,需要转换 y_predict_inverse = std2.inverse_transform(y_predict) print(y_predict_inverse) # 均方误差 error = mean_squared_error(y_test_ori, y_predict_inverse) print("均方误差:", error) # 梯度下降算法求解 sgd = SGDRegressor() # 通过梯度下降求解 sgd.fit(x_train, y_train) # 预测结果 y_predict_sgd = sgd.predict(x_test) y_predict_sgd_inverse = std2.inverse_transform(y_predict_sgd) # 反归一化 print("sgd预测结果:", y_predict_sgd_inverse) # 均方误差 error_sgd = mean_squared_error(y_test_ori, y_predict_sgd_inverse) print("sgd的均分误差:", error_sgd) if __name__ == "__main__": linear() ================================================ FILE: 逻辑回归.py ================================================ """ 逻辑回归:将线性回归函数的输出,作为Sigmoid函数的输入,然后输出为0-1之间的 """ import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report def LR(): # 加载数据集 cancer = pd.read_csv("E:/Desktop/机器学习_新/数据集/癌症数据集/Prostate_Cancer.csv") pd.set_option("display.max_columns", 100) print(cancer.head(5)) print("特征值名称:", list(cancer.columns)) # 提取特征值和目标值 feature = cancer[list(cancer.columns)[2:]] print(feature.head(5)) target = cancer[list(cancer.columns)[1]] print(target.head(5)) # 将目标值进行0-1化 target.replace("M", 0, inplace=True) target.replace("B", 1, inplace=True) print(target.head(5)) # 划分数据集 x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25) x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25) print("训练集:", x_train.shape, y_train.shape) print("验证集:", x_val.shape, y_val.shape) print("测试集:", x_test.shape, y_test.shape) # 标准化 std = StandardScaler() x_train = std.fit_transform(x_train) x_val = std.transform(x_val) x_test = std.transform(x_test) # 建立模型 lg = LogisticRegression() # 训练 lg.fit(x_train, y_train) # 验证 score_val = lg.score(x_val, y_val) print("在验证集上的得分:", score_val) # 测试 score_test = lg.score(x_test, y_test) print("在测试集上的得分:", score_test) # 预测 predict = lg.predict(x_test) print(predict) # 打印召回率,F1 print(classification_report(y_test, predict, labels=[0, 1], target_names=["良性", "恶性"])) if __name__ == "__main__": LR() ================================================ FILE: 随机森林.py ================================================ """ 随机森林是一种同质的集成学习算法,通过构建多个决策树,然后结合多个决策树的结果,得到更好的预测 """ import pandas as pd from sklearn.feature_extraction import DictVectorizer from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import GridSearchCV def forest(): # 加载数据 titan = pd.read_csv("E:/Desktop/机器学习_新/数据集/泰坦尼克数据集/train.csv") # 构造特征值和目标值 feature = titan[["Pclass", "Age", "Fare", "Sex"]] target = titan["Survived"] # 特征预处理 # 查看有没有缺失值 print(pd.isnull(feature).any()) # 填充缺失值 Age = feature.pop("Age") # 取出,意思是取出来之后删除原来的 Age = Age.fillna(Age.mean()) feature.insert(0, "Age", Age) # 字典特征抽取 dv = DictVectorizer() feature = dv.fit_transform(feature.to_dict(orient="records")) feature = feature.toarray() print(feature) print(dv.get_feature_names()) # 划分数据集 x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25) print("训练集:", x_train.shape, y_train.shape) print("测试集:", x_test.shape, y_test.shape) # 建立模型 rf = RandomForestClassifier() # 超参数搜索 param = {"n_estimators":[10, 20, 30, 40], "max_depth":[25, 35, 45]} gc = GridSearchCV(rf, param_grid=param, cv=5) # 训练 gc.fit(x_train, y_train) # 交叉验证网格搜索的结果 print("在测试集上的准确率:", gc.score(x_test, y_test)) print("在验证集上的准确率:", gc.best_score_) print("最好的模型参数:", gc.best_params_) print("最好的模型:", gc.best_estimator_) if __name__ == "__main__": forest()