[
  {
    "path": "KNN.py",
    "content": "\"\"\"\n    KNN算法也叫做K近邻算法，它的主要思想是：\n        计算测试样本与训练集中各个样本之间的距离，选择与测试样本距离最近的K个，然后统计这K个样本中出现标记最多的那个，\n        将这个标记作为测试样本的标记\n\"\"\"\n\nfrom sklearn.datasets import load_iris\nimport pandas as pd\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.neighbors import KNeighborsClassifier\n\n\ndef knn():\n    # 加载数据集\n    iris = load_iris()\n    feature = iris.data\n    target = iris.target\n    print(\"特征名称：\", iris.feature_names)\n    print(\"目标标记名：\", iris.target_names)\n    print(\"特征：\", feature.shape)\n    print(\"标记：\", target.shape)\n    # 特征预处理\n    # 判断有没有缺失值\n    print(pd.isnull(feature).any())\n    # 标准化\n    std = StandardScaler()\n    feature = std.fit_transform(feature)\n    # 划分数据集\n    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)\n    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)\n    print(\"训练集：\", x_train.shape, y_train.shape)\n    print(\"验证集：\", x_val.shape, y_val.shape)\n    print(\"测试集：\", x_test.shape, y_test.shape)\n    # 建立KNN模型\n    kn = KNeighborsClassifier(n_neighbors=5)\n    # 训练\n    kn.fit(x_train, y_train)\n    # 验证\n    score_val = kn.score(x_val, y_val)\n    print(\"在验证集上的得分：\", score_val)\n    # 测试\n    score_test = kn.score(x_test, y_test)\n    print(\"在测试集上的得分：\", score_test)\n    predict = kn.predict(x_test)\n    print(\"在测试集上的预测结果：\", predict)\n\n\nif __name__ == \"__main__\":\n    knn()\n"
  },
  {
    "path": "README.md",
    "content": "# sklearn_mochine_learning\n使用sklearn实现机器学习的算法，包括了线性回归、岭回归、逻辑回归、朴素贝叶斯、决策树、随机森林\n"
  },
  {
    "path": "SVM.py",
    "content": "\"\"\"\n    支持向量机：通过寻找划分超平面来进行分类的算法，这个划分超平面只由支持向量有关，与其他样本无关\n\"\"\"\n\n\nimport pandas as pd\nfrom sklearn.datasets import load_iris\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.svm import SVC\nfrom sklearn.metrics import classification_report\n\n\ndef svm():\n    # 加载数据集\n    iris = load_iris()\n\n    # 取出特征值和目标值\n    feature = iris.data\n    target = iris.target\n    print(\"特征：\", iris.feature_names)\n    print(\"目标：\", iris.target_names)\n\n    # 划分数据集\n    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)\n    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)\n    print(\"训练集：\", x_train.shape, y_train.shape)\n    print(\"验证集：\", x_val.shape, y_val.shape)\n    print(\"测试集：\", x_test.shape, y_test.shape)\n\n    # 建立模型\n    clf = SVC(kernel=\"linear\", C=0.4)\n    # 训练\n    clf.fit(x_train, y_train)\n    # 验证\n    score_val = clf.score(x_val, y_val)\n    print(\"在验证集上的得分：\", score_val)\n    # 测试\n    score_test = clf.score(x_test, y_test)\n    print(\"在测试集上的得分：\", score_test)\n    # 预测\n    predict = clf.predict(x_test)\n    print(\"预测结果：\", predict)\n    # 打印召回率、F1\n    print(classification_report(y_test, predict, labels=[0, 1, 2], target_names=iris.target_names))\n\n\nif __name__ == \"__main__\":\n    svm()\n\n\n"
  },
  {
    "path": "sklearn数据集获取.py",
    "content": "from sklearn.datasets import load_iris, load_boston, fetch_20newsgroups\n\n\n# 加载鸢尾花数据集（分类数据集）\ndef iris_datasets():\n    iris = load_iris()\n    feature = iris.data  # 获取特征值\n    target = iris.target  # 获取目标值\n    feature_names = iris.feature_names  # 获取特征名称\n    target_names = iris.target_names  # 获取目标值名称\n    print(\"特征值名称\", feature_names)\n    print(\"特征值\", feature)\n    print(\"目标值名称\", target_names)\n    print(\"目标值\", target)\n    print(\"数据集的描述信息\", iris.DESCR)\n\n\n# 获取波士顿房价数据集\ndef boston_datasets():\n    boston = load_boston()\n    feature = boston.data\n    target = boston.target\n    feature_names = boston.feature_names\n    #target_names = boston.target_names\n    print(\"特征值名称\", feature_names)\n    print(\"特征值\", feature)\n    #print(\"目标值名称\", target_names)\n    print(\"目标值\", target)\n    print(\"数据集的描述信息\", boston.DESCR)\n\n\n# 获取20newsgroups数据集\ndef newsgroups():\n    news = fetch_20newsgroups()\n    feature = news.data\n    target = news.target\n    #feature_names = news.feature_names\n    target_names = news.target_names\n    #print(\"特征值名称\", feature_names)\n    print(\"特征值\", feature)\n    print(\"目标值名称\", target_names)\n    print(\"目标值\", target)\n    print(\"数据集的描述信息\", news.DESCR)\n\nif __name__ == \"__main__\":\n    #iris_datasets()\n    #boston_datasets()\n    newsgroups()\n"
  },
  {
    "path": "决策树.py",
    "content": "\"\"\"\n    通过树形结构来实现分类的一种算法，关键在于如何选择最优属性\n    通常用三种方式：信息增益（ID3）、增益率（C4.5）、基尼系数（CART）\n\"\"\"\n\nimport pandas as pd\nimport numpy as np\nfrom sklearn.feature_extraction import DictVectorizer\nfrom sklearn.tree import DecisionTreeClassifier, export_graphviz\nfrom sklearn.model_selection import train_test_split\n\n\ndef de_tree():\n    # 加载数据\n    titan = pd.read_csv(\"E:/Desktop/机器学习_新/数据集/泰坦尼克数据集/train.csv\")\n    # print(titan.shape)\n    # pd.set_option(\"display.max_columns\", 100)  # 把dataframe中省略的部分显示出来\n    # print(titan.head(5))\n\n    # 构造特征值和目标值\n    feature = titan[[\"Pclass\", \"Age\", \"Fare\", \"Sex\"]]\n    target = titan[\"Survived\"]\n\n    # 特征预处理\n    # 查看有没有缺失值\n    print(pd.isnull(feature).any())\n    # 填充缺失值\n    Age = feature.pop(\"Age\")  # 取出，意思是取出来之后删除原来的\n    Age = Age.fillna(Age.mean())\n    # print(feature)\n    # feature.drop(\"Age\", axis=1, inplace=True)  # 删除一列\n    feature.insert(0, \"Age\", Age)\n    # print(pd.isnull(feature).any())\n\n    # 字典特征抽取\n    dv = DictVectorizer()\n    feature = dv.fit_transform(feature.to_dict(orient=\"records\"))\n    feature = feature.toarray()\n    print(feature)\n    print(dv.get_feature_names())\n\n    # 划分数据集\n    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)\n    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)\n    print(\"训练集：\", x_train.shape, y_train.shape)\n    print(\"验证集：\", x_val.shape, y_val.shape)\n    print(\"测试集：\", x_test.shape, y_test.shape)\n\n    # 建立模型\n    tree = DecisionTreeClassifier(max_depth=5)\n\n    # 训练\n    tree.fit(x_train, y_train)\n\n    # 验证\n    score = tree.score(x_val, y_val)\n    print(\"在验证集上的得分：\", score)\n\n    # 预测\n    score_test = tree.score(x_test, y_test)\n    print(\"在测试集上的得分：\", score_test)\n    predict = tree.predict(x_test)\n    print(\"测试结果：\", predict)\n\n    # 保存树结构\n    export_graphviz(tree, out_file=\"E:/Desktop/开题报告/tree.dot\", feature_names=['Age', 'Fare', 'Pclass', 'Sex=female', 'Sex=male'])\n\n    # 将保存的dot文件转成png文件，查看树结构\n    # dot -Tpng tree.dot -o tree.png\n\n\nif __name__ == \"__main__\":\n    de_tree()"
  },
  {
    "path": "划分数据集.py",
    "content": "from sklearn.datasets import load_iris\nfrom sklearn.model_selection import train_test_split\n\n\n# 将数据集划分成训练集、验证集、测试集\ndef split_datasets():\n    iris = load_iris()\n    feature = iris.data\n    target = iris.target\n    print(\"特征值：\", type(feature))\n    print(\"目标值：\", type(target))\n    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)  # 传的参数必须是numpy.ndarray或者pandas.dataframes，但是必须是传入特征值和目标值，不能一起传入\n    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)\n    print(\"训练集：\", x_train.shape, y_train.shape)\n    print(\"验证集：\", x_val.shape, y_val.shape)\n    print(\"测试集：\", x_test.shape, y_test.shape)\n\n\nif __name__ == \"__main__\":\n    split_datasets()\n\n"
  },
  {
    "path": "岭回归.py",
    "content": "\"\"\"\n    带有正则化的线性回归\n\"\"\"\n\n\nimport pandas as pd\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import Ridge\nfrom sklearn.metrics import mean_squared_error\n\n\ndef ridge():\n    # 加载数据集\n    boston = load_boston()\n    feature = boston.data\n    target = boston.target\n\n    # 划分数据集\n    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)\n\n    # 标准化\n    std_x = StandardScaler()\n    x_train = std_x.fit_transform(x_train)\n    x_test = std_x.transform(x_test)\n\n    std_y = StandardScaler()\n    y_train = std_y.fit_transform(y_train.reshape(-1, 1))\n    y_test = std_y.transform(y_test.reshape(-1, 1))\n\n    # 建立模型\n    rd = Ridge(alpha=1.0)\n    rd.fit(x_train, y_train)\n\n    # 预测结果\n    y_predict = rd.predict(x_test)\n    y_predict_inverse = std_y.inverse_transform(y_predict)\n    print(y_predict_inverse)\n\n    # 均方误差\n    error = mean_squared_error(y_test, y_predict)\n    print(\"均分误差：\", error)\n\n\nif __name__ == \"__main__\":\n    ridge()\n"
  },
  {
    "path": "朴素贝叶斯.py",
    "content": "\"\"\"\n    贝叶斯分类器的主要思想：\n        已知一个含有标记的数据集，此时来了一个测试样本，我们知道测试样本的特征，需要预测标记，\n        若我们能够求出这个样本属于各个类别的概率，那么从中选择概率最大的就可以了，那么就是求P(c|x)，\n        先用全概率公式P(c|x)=P(x,c)/P(x)，再用条件概率公式P(c|x)=P(x,c)/P(x)=P(x|c)*P(c)/P(x)，\n        对于同一个测试样本P(x)都是相同的，因此分母不是我们需要关心的。P(c)很好求，就是在数据集当中某个类别出现的概率\n        最难求的就是P(x|c)，朴素贝叶斯的思想就是假设各个特征之间相互独立，那么P(x|c)就等于P(x1|c)*P(x2|c)...，这样就可以求解了\n\"\"\"\n\nimport numpy as np\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.decomposition import PCA\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.naive_bayes import MultinomialNB\n\n\ndef bayes():\n    # 加载数据集（文本数据集）\n    news = fetch_20newsgroups()\n    feature = news.data\n    target = news.target\n    print(\"特征：\", len(feature))\n    print(\"目标：\", len(target))\n    print(\"目标值的含义：\", news.target_names)\n    # 文本特征抽取\n    tf = TfidfVectorizer()\n    feature = tf.fit_transform(feature)\n    feature = feature.toarray()\n    print(feature.shape)\n    print(feature.dtype)\n    feature = feature.astype(np.uint8)\n    print(feature.dtype)\n    # 特征降维\n    # pca\n    pca = PCA(n_components=100)\n    feature = pca.fit_transform(feature)\n    print(feature.shape)\n    # 划分数据集\n    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)\n    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)\n    print(\"训练集：\", x_train.shape)\n    print(\"验证集：\", x_val.shape)\n    print(\"测试集：\", x_test.shape)\n    # 建立贝叶斯模型\n    # alapha是拉普拉斯平滑系数，防止计算的概率是0\n    mlt = MultinomialNB(alpha=1.0)\n    # 训练\n    mlt.fit(x_train, y_train)\n    # 验证\n    score_val = mlt.score(x_val, y_val)\n    print(\"在验证集上的得分：\", score_val)\n    # 预测\n    score_test = mlt.score(x_test, y_test)\n    print(\"在测试集上的得分：\", score_test)\n    predict = mlt.predict(x_test)\n    print(\"测试结果：\", predict)\n\n\nif __name__ == \"__main__\":\n    bayes()\n\n"
  },
  {
    "path": "特征工程.py",
    "content": "\"\"\"\n特征工程：\n    特征抽取：\n        字典特征抽取\n        文本特征抽取\n    特征预处理：\n        归一化\n        标准化\n        缺失值处理\n    特征降维：\n        过滤式\n        主成分分析\n\"\"\"\n\nimport numpy as np\nimport pandas as pd\n\n\n# ndarray与dataframe之间的相互转换\ndef nd_da():\n    data = [[1, 2], [4, 5], [7, 8]]\n    print(type(data))\n    print(\"列表：\", data)\n\n    # 列表转ndarray\n    nd_data = np.array(data)\n    print(type(nd_data))\n    print(\"ndarray：\", nd_data)\n\n    # ndarray转dataframe\n    da_data = pd.DataFrame(nd_data)\n    print(type(da_data))\n    da_data.columns = [\"a\", \"b\"]\n    da_data.index = [\"A\", \"B\", \"C\"]\n    print(\"DataFrame:\", da_data)\n\n    # dataframe转ndarray\n    np_data = np.array(da_data)\n    print(type(np_data))\n    print(np_data)\n\n\n# 字典特征抽取：针对特征值是非数值型的特征,进行one_hot编码\nfrom sklearn.feature_extraction import DictVectorizer\ndef dictvec():\n    data = [[\"北京\", 12], [\"上海\", 50], [\"深圳\", 100], [\"宣城\", 1000]]\n    data = np.array(data)  # 将列表转成numpy.ndarray\n    data = pd.DataFrame(data)  # 将ndarray转成dataframe\n    data.columns = [\"city\", \"people\"]\n    print(\"dataframe：\", data)\n    print(type(data))\n    dict = DictVectorizer(sparse=False)\n    result = dict.fit_transform(data.to_dict(orient=\"records\"))  # 必须这样传，将dataframe转成字典\n    print(\"字典特征抽取之后的结果：\", result)\n    print(dict.get_feature_names())\n\n\n# 文本特征抽取：针对特征值是文本的情况\nfrom sklearn.feature_extraction.text import CountVectorizer\nfrom sklearn.feature_extraction.text import TfidfVectorizer\ndef text():\n    data = [\"life is short, I like python\", \"life is too long, I dislike python\"]\n    # 统计次数\n    cv = CountVectorizer()\n    result = cv.fit_transform(data)  # 默认返回稀疏矩阵\n    result = result.toarray()  # 将稀疏矩阵转成密集矩阵\n    print(result)\n    print(type(result))\n    print(cv.get_feature_names())\n    # 统计重要性\n    tf = TfidfVectorizer()\n    result_tf = tf.fit_transform(data)\n    result_tf = result_tf.toarray()\n    print(result_tf)\n    print(type(result_tf))\n    print(tf.get_feature_names())\n\n\n# 特征预处理\n# 缺失值处理\ndef missing_value():\n    data1 = pd.DataFrame({\"一班\": [90, 80, 66, 75, 99, 55, 76, 78, 98, None, 90],\n                          \"二班\": [75, 98, 100, None, 77, 45, None, 66, 56, 80, 57],\n                          \"三班\": [45, 89, 77, 67, 65, 100, None, 75, 64, 88, 99],\n                          \"四班\": [45, 89, 77, 67, 65, 100, 45, 75, 64, 88, 99]})\n\n    data2 = pd.DataFrame({\"一班\": [90, 80, 66, 75, 99, 55, 76, 78, 98, np.nan, 90],\n                          \"二班\": [75, 98, 100, np.nan, 77, 45, np.nan, 66, 56, 80, 57],\n                          \"三班\": [45, 89, 77, 67, 65, 100, np.nan, 75, 64, 88, 99],\n                          \"四班\": [45, 89, 77, 67, 65, 100, 45, 75, 64, 88, 99]})\n\n    data3 = pd.DataFrame({\"一班\": [90, 80, 66, 75, 99, 55, 76, 78, 98, \"null\", 90],\n                          \"二班\": [75, 98, 100, \"null\", 77, 45, \"null\", 66, 56, 80, 57],\n                          \"三班\": [45, 89, 77, 67, 65, 100, \"null\", 75, 64, 88, 99],\n                          \"四班\": [45, 89, 77, 67, 65, 100, 45, 75, 64, 88, 99]})\n    # 缺失值是None、np.nan都是可以识别出来的，打印的时候显示NaN，但是其他类型的缺失值是无法识别的\n    # print(data1)\n    # print(data2)\n    # print(data3)\n    # 如果遇到的是data3这种类型的缺失值，那么首先要用np.nan替换掉缺失值\n    data3.replace(\"null\", np.nan, inplace=True)\n\n    # 判断有没有缺失值\n    print(pd.isnull(data3).any())\n\n    # 处理缺失值\n    # 删除\n    # data3.dropna(axis=0, how=\"any\", inplace=True)\n    # print(data3)\n    # 填充\n    data3.fillna(data3.mean(), inplace=True)\n    print(data3)\n\n# 归一化\nfrom sklearn.preprocessing import MinMaxScaler\ndef min_max():\n    data = [[1, 2, 3],[4, 5, 6],[7, 8, 9]]\n    mm = MinMaxScaler(feature_range=(0, 1))\n    result = mm.fit_transform(data)\n    print(result)\n\n# 标准化\nfrom sklearn.preprocessing import StandardScaler\ndef standard():\n    data = [[1, 2, 3],[4, 5, 6],[7, 8, 9]]\n    std = StandardScaler()\n    result = std.fit_transform(data)\n    print(result)\n\n\n# 特征抽取：过滤式\nfrom sklearn.feature_selection import VarianceThreshold\ndef var():\n    data = [[1, 2, 3],[1, 4, 5],[1, 7, 8]]\n    v = VarianceThreshold(threshold=0)\n    result = v.fit_transform(data)\n    print(result)\n\n\n# 特征降维：PCA\nfrom sklearn.decomposition import PCA\ndef pca():\n    data = [[1, 2, 4, 5], [4, 5, 4, 2], [2, 4, 1, 4]]\n    p = PCA(n_components=0.95)\n    result = p.fit_transform(data)\n    print(result)\n\n\nif __name__ == \"__main__\":\n    nd_da()\n    dictvec()\n    text()\n    missing_value()\n    min_max()\n    standard()\n    var()\n    pca()\n\n\n"
  },
  {
    "path": "线性回归.py",
    "content": "\"\"\"\n    线性回归：通过构建线性模型来进行预测的一种回归算法\n\"\"\"\n\n\nimport pandas as pd\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LinearRegression, SGDRegressor\nfrom sklearn.metrics import mean_squared_error\n\n\ndef linear():\n    # 加载数据集\n    boston = load_boston()\n    feature = boston.data\n    target = boston.target\n\n    # 划分数据集\n    x_train, x_test, y_train, y_test_ori = train_test_split(feature, target, test_size=0.25)\n\n    # 标准化\n    std1 = StandardScaler()\n    x_train = std1.fit_transform(x_train)\n    x_test = std1.transform(x_test)\n\n    std2 = StandardScaler()\n    y_train = std2.fit_transform(y_train.reshape(-1, 1))  # 必须传二维\n    y_test = std2.transform(y_test_ori.reshape(-1, 1))\n\n    # 正规方程的解法\n    # 建立模型\n    lr = LinearRegression()  # 通过公式求解\n    lr.fit(x_train, y_train)\n\n    # 预测结果\n    y_predict = lr.predict(x_test)  # 这个结果是标准化之后的结果，需要转换\n    y_predict_inverse = std2.inverse_transform(y_predict)\n    print(y_predict_inverse)\n\n    # 均方误差\n    error = mean_squared_error(y_test_ori, y_predict_inverse)\n    print(\"均方误差：\", error)\n\n    # 梯度下降算法求解\n    sgd = SGDRegressor()  # 通过梯度下降求解\n    sgd.fit(x_train, y_train)\n\n    # 预测结果\n    y_predict_sgd = sgd.predict(x_test)\n    y_predict_sgd_inverse = std2.inverse_transform(y_predict_sgd)  # 反归一化\n    print(\"sgd预测结果：\", y_predict_sgd_inverse)\n\n    # 均方误差\n    error_sgd = mean_squared_error(y_test_ori, y_predict_sgd_inverse)\n    print(\"sgd的均分误差：\", error_sgd)\n\n\nif __name__ == \"__main__\":\n    linear()\n"
  },
  {
    "path": "逻辑回归.py",
    "content": "\"\"\"\n    逻辑回归：将线性回归函数的输出，作为Sigmoid函数的输入，然后输出为0-1之间的\n\"\"\"\n\n\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import classification_report\n\n\ndef LR():\n    # 加载数据集\n    cancer = pd.read_csv(\"E:/Desktop/机器学习_新/数据集/癌症数据集/Prostate_Cancer.csv\")\n    pd.set_option(\"display.max_columns\", 100)\n    print(cancer.head(5))\n    print(\"特征值名称：\", list(cancer.columns))\n\n    # 提取特征值和目标值\n    feature = cancer[list(cancer.columns)[2:]]\n    print(feature.head(5))\n    target = cancer[list(cancer.columns)[1]]\n    print(target.head(5))\n\n    # 将目标值进行0-1化\n    target.replace(\"M\", 0, inplace=True)\n    target.replace(\"B\", 1, inplace=True)\n    print(target.head(5))\n\n    # 划分数据集\n    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)\n    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)\n    print(\"训练集：\", x_train.shape, y_train.shape)\n    print(\"验证集：\", x_val.shape, y_val.shape)\n    print(\"测试集：\", x_test.shape, y_test.shape)\n\n    # 标准化\n    std = StandardScaler()\n    x_train = std.fit_transform(x_train)\n    x_val = std.transform(x_val)\n    x_test = std.transform(x_test)\n\n    # 建立模型\n    lg = LogisticRegression()\n    # 训练\n    lg.fit(x_train, y_train)\n    # 验证\n    score_val = lg.score(x_val, y_val)\n    print(\"在验证集上的得分：\", score_val)\n    # 测试\n    score_test = lg.score(x_test, y_test)\n    print(\"在测试集上的得分：\", score_test)\n    # 预测\n    predict = lg.predict(x_test)\n    print(predict)\n    # 打印召回率，F1\n    print(classification_report(y_test, predict, labels=[0, 1], target_names=[\"良性\", \"恶性\"]))\n\n\nif __name__ == \"__main__\":\n    LR()\n"
  },
  {
    "path": "随机森林.py",
    "content": "\"\"\"\n    随机森林是一种同质的集成学习算法，通过构建多个决策树，然后结合多个决策树的结果，得到更好的预测\n\"\"\"\n\nimport pandas as pd\nfrom sklearn.feature_extraction import DictVectorizer\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestClassifier\nfrom sklearn.model_selection import GridSearchCV\n\n\ndef forest():\n    # 加载数据\n    titan = pd.read_csv(\"E:/Desktop/机器学习_新/数据集/泰坦尼克数据集/train.csv\")\n\n    # 构造特征值和目标值\n    feature = titan[[\"Pclass\", \"Age\", \"Fare\", \"Sex\"]]\n    target = titan[\"Survived\"]\n\n    # 特征预处理\n    # 查看有没有缺失值\n    print(pd.isnull(feature).any())\n    # 填充缺失值\n    Age = feature.pop(\"Age\")  # 取出，意思是取出来之后删除原来的\n    Age = Age.fillna(Age.mean())\n    feature.insert(0, \"Age\", Age)\n\n    # 字典特征抽取\n    dv = DictVectorizer()\n    feature = dv.fit_transform(feature.to_dict(orient=\"records\"))\n    feature = feature.toarray()\n    print(feature)\n    print(dv.get_feature_names())\n\n    # 划分数据集\n    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)\n    print(\"训练集：\", x_train.shape, y_train.shape)\n    print(\"测试集：\", x_test.shape, y_test.shape)\n\n    # 建立模型\n    rf = RandomForestClassifier()\n\n    # 超参数搜索\n    param = {\"n_estimators\":[10, 20, 30, 40], \"max_depth\":[25, 35, 45]}\n    gc = GridSearchCV(rf, param_grid=param, cv=5)\n\n    # 训练\n    gc.fit(x_train, y_train)\n\n    # 交叉验证网格搜索的结果\n    print(\"在测试集上的准确率：\", gc.score(x_test, y_test))\n    print(\"在验证集上的准确率：\", gc.best_score_)\n    print(\"最好的模型参数：\", gc.best_params_)\n    print(\"最好的模型：\", gc.best_estimator_)\n\n\nif __name__ == \"__main__\":\n    forest()\n\n"
  }
]