master c25a5ad49001 cached
12 files
18.3 KB
6.7k tokens
20 symbols
1 requests
Download .txt
Repository: goodboyv/Sklearn_Mochine_leanring
Branch: master
Commit: c25a5ad49001
Files: 12
Total size: 18.3 KB

Directory structure:
gitextract_kub6qg91/

├── KNN.py
├── README.md
├── SVM.py
├── sklearn数据集获取.py
├── 决策树.py
├── 划分数据集.py
├── 岭回归.py
├── 朴素贝叶斯.py
├── 特征工程.py
├── 线性回归.py
├── 逻辑回归.py
└── 随机森林.py

================================================
FILE CONTENTS
================================================

================================================
FILE: KNN.py
================================================
"""
    KNN算法也叫做K近邻算法,它的主要思想是:
        计算测试样本与训练集中各个样本之间的距离,选择与测试样本距离最近的K个,然后统计这K个样本中出现标记最多的那个,
        将这个标记作为测试样本的标记
"""

from sklearn.datasets import load_iris
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


def knn():
    """Train and evaluate a 5-nearest-neighbours classifier on the iris data.

    The data is split into training, validation and test subsets, the
    features are standardised, and the accuracy on the validation and test
    subsets is printed together with the predictions for the test subset.

    The scaler is fitted on the training subset only and then applied to the
    other two subsets, so that no statistics from held-out data leak into
    training.
    """
    # Load the data set
    iris = load_iris()
    feature = iris.data
    target = iris.target
    print("特征名称:", iris.feature_names)
    print("目标标记名:", iris.target_names)
    print("特征:", feature.shape)
    print("标记:", target.shape)

    # Report whether the feature matrix contains any missing value
    print(pd.isnull(feature).any())

    # Split first: 25% for testing, then 25% of the remainder for validation
    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)
    print("训练集:", x_train.shape, y_train.shape)
    print("验证集:", x_val.shape, y_val.shape)
    print("测试集:", x_test.shape, y_test.shape)

    # Standardise with the training statistics only, then reuse them for the
    # validation and test subsets
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_val = std.transform(x_val)
    x_test = std.transform(x_test)

    # Build and train the KNN model
    kn = KNeighborsClassifier(n_neighbors=5)
    kn.fit(x_train, y_train)

    # Validate
    score_val = kn.score(x_val, y_val)
    print("在验证集上的得分:", score_val)

    # Test
    score_test = kn.score(x_test, y_test)
    print("在测试集上的得分:", score_test)
    predict = kn.predict(x_test)
    print("在测试集上的预测结果:", predict)


# Run the KNN demo only when this file is executed as a script.
if __name__ == "__main__":
    knn()


================================================
FILE: README.md
================================================
# sklearn_machine_learning
使用sklearn实现机器学习的算法,包括了KNN、SVM、线性回归、岭回归、逻辑回归、朴素贝叶斯、决策树、随机森林


================================================
FILE: SVM.py
================================================
"""
    支持向量机:通过寻找划分超平面来进行分类的算法,这个划分超平面只由支持向量有关,与其他样本无关
"""


import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report


def svm():
    """Fit a linear-kernel SVC on the iris data and print its metrics.

    Prints the feature and class names, the shapes of the three subsets, the
    accuracy on the validation and test subsets, the test predictions and a
    classification report for the test subset.
    """
    dataset = load_iris()
    samples, labels = dataset.data, dataset.target
    print("特征:", dataset.feature_names)
    print("目标:", dataset.target_names)

    # Hold out 25% for testing, then 25% of the remainder for validation
    x_train, x_test, y_train, y_test = train_test_split(samples, labels, test_size=0.25)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)
    subsets = (
        ("训练集:", x_train, y_train),
        ("验证集:", x_val, y_val),
        ("测试集:", x_test, y_test),
    )
    for title, xs, ys in subsets:
        print(title, xs.shape, ys.shape)

    # Build and train the model
    model = SVC(kernel="linear", C=0.4)
    model.fit(x_train, y_train)

    print("在验证集上的得分:", model.score(x_val, y_val))
    print("在测试集上的得分:", model.score(x_test, y_test))

    predicted = model.predict(x_test)
    print("预测结果:", predicted)

    # Precision, recall and F1 per class
    report = classification_report(
        y_test,
        predicted,
        labels=[0, 1, 2],
        target_names=dataset.target_names,
    )
    print(report)


# Run the SVM demo only when this file is executed as a script.
if __name__ == "__main__":
    svm()




================================================
FILE: sklearn数据集获取.py
================================================
from sklearn.datasets import load_iris, load_boston, fetch_20newsgroups


# 加载鸢尾花数据集(分类数据集)
def iris_datasets():
    """Load the iris data set and print its names, values and description."""
    bunch = load_iris()
    # (label, value) pairs, printed in this order
    report = (
        ("特征值名称", bunch.feature_names),
        ("特征值", bunch.data),
        ("目标值名称", bunch.target_names),
        ("目标值", bunch.target),
        ("数据集的描述信息", bunch.DESCR),
    )
    for label, value in report:
        print(label, value)


# 获取波士顿房价数据集
def boston_datasets():
    """Load the Boston housing data set and print its contents.

    Prints the feature names, the feature matrix, the target values and the
    data set description. No target names are printed for this data set.

    NOTE(review): ``load_boston`` was deprecated in scikit-learn 1.0 and
    removed in 1.2, so this function needs an older scikit-learn release.
    """
    bunch = load_boston()
    # (label, value) pairs, printed in this order
    report = (
        ("特征值名称", bunch.feature_names),
        ("特征值", bunch.data),
        ("目标值", bunch.target),
        ("数据集的描述信息", bunch.DESCR),
    )
    for label, value in report:
        print(label, value)


# 获取20newsgroups数据集
def newsgroups():
    """Fetch the 20 newsgroups data set and print its contents.

    Prints the documents, the category names, the target values and the data
    set description. No feature names are printed for this data set.
    """
    bunch = fetch_20newsgroups()
    # (label, value) pairs, printed in this order
    report = (
        ("特征值", bunch.data),
        ("目标值名称", bunch.target_names),
        ("目标值", bunch.target),
        ("数据集的描述信息", bunch.DESCR),
    )
    for label, value in report:
        print(label, value)

if __name__ == "__main__":
    # Only the 20 newsgroups demo is enabled; uncomment a line below to run
    # one of the other loaders instead.
    #iris_datasets()
    #boston_datasets()
    newsgroups()


================================================
FILE: 决策树.py
================================================
"""
    通过树形结构来实现分类的一种算法,关键在于如何选择最优属性
    通常用三种方式:信息增益(ID3)、增益率(C4.5)、基尼系数(CART)
"""

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split


def de_tree(data_path="E:/Desktop/机器学习_新/数据集/泰坦尼克数据集/train.csv",
            dot_path="E:/Desktop/开题报告/tree.dot"):
    """Train a depth-limited decision tree on the Titanic data and export it.

    Uses the ``Pclass``, ``Age``, ``Fare`` and ``Sex`` columns to predict
    ``Survived``. Missing ages are replaced by the mean age, the records are
    vectorised with ``DictVectorizer``, and the accuracy on the validation and
    test subsets is printed. The fitted tree is written as a Graphviz file.

    Args:
        data_path: CSV file to read the Titanic training data from.
        dot_path: Destination of the exported Graphviz ``.dot`` file.
    """
    # Load the data
    titan = pd.read_csv(data_path)

    # Build features and target. The copy makes `feature` an independent
    # frame, so editing it below does not act on a slice of `titan`.
    feature = titan[["Pclass", "Age", "Fare", "Sex"]].copy()
    target = titan["Survived"]

    # Report which columns contain missing values
    print(pd.isnull(feature).any())
    # Fill missing ages with the mean age
    feature["Age"] = feature["Age"].fillna(feature["Age"].mean())

    # Dictionary feature extraction (one-hot encodes the string column)
    dv = DictVectorizer()
    feature = dv.fit_transform(feature.to_dict(orient="records"))
    feature = feature.toarray()
    print(feature)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older releases.
    if hasattr(dv, "get_feature_names_out"):
        feature_names = list(dv.get_feature_names_out())
    else:
        feature_names = dv.get_feature_names()
    print(feature_names)

    # Split: 25% for testing, then 25% of the remainder for validation
    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)
    print("训练集:", x_train.shape, y_train.shape)
    print("验证集:", x_val.shape, y_val.shape)
    print("测试集:", x_test.shape, y_test.shape)

    # Build and train the model
    tree = DecisionTreeClassifier(max_depth=5)
    tree.fit(x_train, y_train)

    # Validate
    score = tree.score(x_val, y_val)
    print("在验证集上的得分:", score)

    # Test
    score_test = tree.score(x_test, y_test)
    print("在测试集上的得分:", score_test)
    predict = tree.predict(x_test)
    print("测试结果:", predict)

    # Save the tree structure, labelled with the vectoriser's own column
    # names so the labels always match the matrix columns.
    export_graphviz(tree, out_file=dot_path, feature_names=feature_names)

    # To view the tree, convert the dot file to an image:
    # dot -Tpng tree.dot -o tree.png


# Run the decision-tree demo only when this file is executed as a script.
if __name__ == "__main__":
    de_tree()

================================================
FILE: 划分数据集.py
================================================
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split


# 将数据集划分成训练集、验证集、测试集
def split_datasets():
    """Split the iris data into training, validation and test subsets.

    Prints the types of the feature and target containers and the shapes of
    the three resulting subsets.
    """
    bunch = load_iris()
    samples, labels = bunch.data, bunch.target
    print("特征值:", type(samples))
    print("目标值:", type(labels))

    # Features and targets are passed as two separate arguments.
    # First hold out 25% for testing, then 25% of the remainder for validation.
    x_rest, x_test, y_rest, y_test = train_test_split(samples, labels, test_size=0.25)
    x_train, x_val, y_train, y_val = train_test_split(x_rest, y_rest, test_size=0.25)

    subsets = (
        ("训练集:", x_train, y_train),
        ("验证集:", x_val, y_val),
        ("测试集:", x_test, y_test),
    )
    for title, xs, ys in subsets:
        print(title, xs.shape, ys.shape)


# Run the splitting demo only when this file is executed as a script.
if __name__ == "__main__":
    split_datasets()



================================================
FILE: 岭回归.py
================================================
"""
    带有正则化的线性回归
"""


import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


def ridge():
    """Fit ridge regression on the Boston housing data and print its error.

    Features and target are standardised with statistics taken from the
    training split. Predictions are converted back to the original target
    units and printed, and the mean squared error is computed in those same
    units so that it matches the printed predictions.

    NOTE(review): ``load_boston`` was removed in scikit-learn 1.2, so this
    demo needs an older scikit-learn release.
    """
    # Load the data set
    boston = load_boston()
    feature = boston.data
    target = boston.target

    # Split; y_test is kept unscaled for the error computation below
    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)

    # Standardise the features
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)

    # Standardise the training target; the scaler needs a 2-D column
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))

    # Build and train the model
    rd = Ridge(alpha=1.0)
    rd.fit(x_train, y_train)

    # Predict, then undo the target scaling
    y_predict = rd.predict(x_test)
    y_predict_inverse = std_y.inverse_transform(y_predict.reshape(-1, 1))
    print(y_predict_inverse)

    # Mean squared error in the original target units
    error = mean_squared_error(y_test, y_predict_inverse.ravel())
    print("均方误差:", error)


# Run the ridge-regression demo only when this file is executed as a script.
if __name__ == "__main__":
    ridge()


================================================
FILE: 朴素贝叶斯.py
================================================
"""
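    The main idea of the Bayes classifier:
        Given a labelled data set and a new test sample whose features x are known, we want to predict its label.
        If we can compute the probability that the sample belongs to each class c, we simply pick the most probable class, so we need P(c|x).
        By the definition of conditional probability, P(c|x) = P(x,c) / P(x); applying the product rule P(x,c) = P(x|c) * P(c) gives Bayes' theorem, P(c|x) = P(x|c) * P(c) / P(x).
        For a given test sample P(x) is the same for every class, so the denominator can be ignored. P(c) is easy to estimate: it is the frequency of class c in the data set.
        The hard part is P(x|c). Naive Bayes assumes the features are mutually independent given the class, so P(x|c) = P(x1|c) * P(x2|c) * ..., which makes it tractable.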
"""

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


def bayes():
    """Classify the 20 newsgroups documents with multinomial naive Bayes.

    The documents are converted to TF-IDF features and split into training,
    validation and test subsets; the accuracy on the validation and test
    subsets and the test predictions are printed.

    The TF-IDF matrix is passed to the classifier as produced by the
    vectoriser:

    * it is kept sparse, because converting the full document-term matrix to
      a dense array can exhaust memory;
    * it is not cast to ``uint8``, because TF-IDF weights are fractional and
      an integer cast truncates them to zero;
    * it is not reduced with PCA, because PCA output contains negative values
      and ``MultinomialNB`` only accepts non-negative features.
    """
    # Load the (text) data set
    news = fetch_20newsgroups()
    feature = news.data
    target = news.target
    print("特征:", len(feature))
    print("目标:", len(target))
    print("目标值的含义:", news.target_names)

    # Text feature extraction
    tf = TfidfVectorizer()
    feature = tf.fit_transform(feature)
    print(feature.shape)
    print(feature.dtype)

    # Split: 25% for testing, then 25% of the remainder for validation
    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)
    print("训练集:", x_train.shape)
    print("验证集:", x_val.shape)
    print("测试集:", x_test.shape)

    # Build the model; alpha is the Laplace smoothing coefficient, which
    # keeps estimated probabilities from being exactly zero
    mlt = MultinomialNB(alpha=1.0)
    mlt.fit(x_train, y_train)

    # Validate
    score_val = mlt.score(x_val, y_val)
    print("在验证集上的得分:", score_val)

    # Test
    score_test = mlt.score(x_test, y_test)
    print("在测试集上的得分:", score_test)
    predict = mlt.predict(x_test)
    print("测试结果:", predict)


# Run the naive Bayes demo only when this file is executed as a script.
if __name__ == "__main__":
    bayes()



================================================
FILE: 特征工程.py
================================================
"""
特征工程:
    特征抽取:
        字典特征抽取
        文本特征抽取
    特征预处理:
        归一化
        标准化
        缺失值处理
    特征降维:
        过滤式
        主成分分析
"""

import numpy as np
import pandas as pd


# ndarray与dataframe之间的相互转换
def nd_da():
    """Demonstrate conversions between list, ndarray and DataFrame.

    Prints the type and content of the data at every stage:
    list -> ndarray -> DataFrame (with named rows and columns) -> ndarray.
    """
    rows = [[1, 2], [4, 5], [7, 8]]
    print(type(rows))
    print("列表:", rows)

    # list -> ndarray
    matrix = np.array(rows)
    print(type(matrix))
    print("ndarray:", matrix)

    # ndarray -> DataFrame, labelling columns and rows at construction time
    frame = pd.DataFrame(matrix, index=["A", "B", "C"], columns=["a", "b"])
    print(type(frame))
    print("DataFrame:", frame)

    # DataFrame -> ndarray
    restored = np.array(frame)
    print(type(restored))
    print(restored)


# 字典特征抽取:针对特征值是非数值型的特征,进行one_hot编码
from sklearn.feature_extraction import DictVectorizer
def dictvec():
    """Demonstrate dictionary feature extraction with ``DictVectorizer``.

    Builds a small city/population table, converts it to a list of record
    dictionaries and vectorises it: the string column ``city`` is one-hot
    encoded while the numeric column ``people`` is kept as a number. Prints
    the table, the resulting matrix and the generated feature names.
    """
    data = [["北京", 12], ["上海", 50], ["深圳", 100], ["宣城", 1000]]
    # Build the frame straight from the list. A round trip through np.array
    # would coerce the populations to strings, and the vectoriser would then
    # one-hot encode them as well.
    data = pd.DataFrame(data, columns=["city", "people"])
    print("dataframe:", data)
    print(type(data))

    vectorizer = DictVectorizer(sparse=False)
    # DictVectorizer consumes a list of {column: value} dictionaries
    result = vectorizer.fit_transform(data.to_dict(orient="records"))
    print("字典特征抽取之后的结果:", result)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)


# 文本特征抽取:针对特征值是文本的情况
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
def text():
    """Demonstrate text feature extraction on two short sentences.

    Vectorises the sentences first with ``CountVectorizer`` (term counts) and
    then with ``TfidfVectorizer`` (TF-IDF weights). For each vectoriser the
    dense matrix, its type and the learned vocabulary are printed.
    """

    def _vocabulary(vectorizer):
        """Return the learned feature names on both old and new scikit-learn."""
        # get_feature_names() was removed in scikit-learn 1.2
        if hasattr(vectorizer, "get_feature_names_out"):
            return list(vectorizer.get_feature_names_out())
        return vectorizer.get_feature_names()

    data = ["life is short, I like python", "life is too long, I dislike python"]

    # Term counts
    cv = CountVectorizer()
    result = cv.fit_transform(data)  # a sparse matrix by default
    result = result.toarray()  # convert to a dense array for printing
    print(result)
    print(type(result))
    print(_vocabulary(cv))

    # Term importance (TF-IDF)
    tf = TfidfVectorizer()
    result_tf = tf.fit_transform(data)
    result_tf = result_tf.toarray()
    print(result_tf)
    print(type(result_tf))
    print(_vocabulary(tf))


# 特征预处理
# 缺失值处理
def missing_value():
    """Demonstrate detecting and filling missing values in a DataFrame.

    Three versions of the same score table are built, differing only in how
    a missing score is written: ``None``, ``np.nan`` and the string
    ``"null"``. The string version is converted to ``np.nan`` first, then the
    per-column missing flags and the table filled with column means are
    printed.
    """

    def _scores(marker):
        """Build the score table with `marker` in every missing position."""
        return pd.DataFrame({"一班": [90, 80, 66, 75, 99, 55, 76, 78, 98, marker, 90],
                             "二班": [75, 98, 100, marker, 77, 45, marker, 66, 56, 80, 57],
                             "三班": [45, 89, 77, 67, 65, 100, marker, 75, 64, 88, 99],
                             "四班": [45, 89, 77, 67, 65, 100, 45, 75, 64, 88, 99]})

    # None and np.nan are both recognised as missing and display as NaN;
    # other placeholders such as the string "null" are not recognised.
    data1 = _scores(None)
    data2 = _scores(np.nan)
    data3 = _scores("null")
    # print(data1)
    # print(data2)
    # print(data3)

    # A placeholder like the one in data3 must be replaced by np.nan first
    data3.replace("null", np.nan, inplace=True)

    # Report which columns contain missing values
    has_missing = pd.isnull(data3).any()
    print(has_missing)

    # Handling option 1 - drop the affected rows:
    # data3.dropna(axis=0, how="any", inplace=True)
    # print(data3)
    # Handling option 2 - fill with the column means:
    column_means = data3.mean()
    data3.fillna(column_means, inplace=True)
    print(data3)

# 归一化
from sklearn.preprocessing import MinMaxScaler
def min_max():
    """Rescale a small 3x3 matrix to the range 0..1 and print it."""
    samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    scaler = MinMaxScaler(feature_range=(0, 1))
    print(scaler.fit_transform(samples))

# 标准化
from sklearn.preprocessing import StandardScaler
def standard():
    """Standardise a small 3x3 matrix with StandardScaler and print it."""
    samples = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    scaler = StandardScaler()
    print(scaler.fit_transform(samples))


# Feature dimensionality reduction: filter method (variance threshold)
from sklearn.feature_selection import VarianceThreshold
def var():
    """Apply a variance-threshold filter (threshold 0) to a small matrix.

    The first column of the sample matrix is constant. The filtered matrix
    is printed.
    """
    samples = [[1, 2, 3], [1, 4, 5], [1, 7, 8]]
    selector = VarianceThreshold(threshold=0)
    print(selector.fit_transform(samples))


# 特征降维:PCA
from sklearn.decomposition import PCA
def pca():
    """Reduce a small 3x4 matrix with PCA(n_components=0.95) and print it."""
    samples = [[1, 2, 4, 5], [4, 5, 4, 2], [2, 4, 1, 4]]
    reducer = PCA(n_components=0.95)
    print(reducer.fit_transform(samples))


# Run every feature-engineering demo in sequence when executed as a script.
if __name__ == "__main__":
    nd_da()
    dictvec()
    text()
    missing_value()
    min_max()
    standard()
    var()
    pca()




================================================
FILE: 线性回归.py
================================================
"""
    线性回归:通过构建线性模型来进行预测的一种回归算法
"""


import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error


def linear():
    """Fit linear regression on the Boston housing data in two ways.

    The model is solved once with ``LinearRegression`` and once with
    ``SGDRegressor`` (gradient descent). Features and target are standardised
    with statistics from the training split; predictions are converted back
    to the original target units before they are printed and before the mean
    squared error is computed.

    NOTE(review): ``load_boston`` was removed in scikit-learn 1.2, so this
    demo needs an older scikit-learn release.
    """
    # Load the data set
    boston = load_boston()
    feature = boston.data
    target = boston.target

    # Split; y_test_ori keeps the test target in its original units
    x_train, x_test, y_train, y_test_ori = train_test_split(feature, target, test_size=0.25)

    # Standardise the features
    std1 = StandardScaler()
    x_train = std1.fit_transform(x_train)
    x_test = std1.transform(x_test)

    # Standardise the training target; the scaler needs a 2-D column
    std2 = StandardScaler()
    y_train = std2.fit_transform(y_train.reshape(-1, 1))

    # --- Closed-form solution ---
    lr = LinearRegression()
    lr.fit(x_train, y_train)

    # The prediction is on the standardised scale and must be converted back
    y_predict = lr.predict(x_test)
    y_predict_inverse = std2.inverse_transform(y_predict)
    print(y_predict_inverse)

    # Mean squared error in the original units
    error = mean_squared_error(y_test_ori, y_predict_inverse)
    print("均方误差:", error)

    # --- Gradient descent ---
    # SGDRegressor expects a 1-D target, so flatten the column
    sgd = SGDRegressor()
    sgd.fit(x_train, y_train.ravel())

    # The prediction is 1-D, while the scaler works on 2-D input: reshape it
    # to one column for the inverse transform, then flatten it again.
    y_predict_sgd = sgd.predict(x_test)
    y_predict_sgd_inverse = std2.inverse_transform(y_predict_sgd.reshape(-1, 1)).ravel()
    print("sgd预测结果:", y_predict_sgd_inverse)

    # Mean squared error in the original units
    error_sgd = mean_squared_error(y_test_ori, y_predict_sgd_inverse)
    print("sgd的均方误差:", error_sgd)


# Run the linear-regression demo only when this file is executed as a script.
if __name__ == "__main__":
    linear()


================================================
FILE: 逻辑回归.py
================================================
"""
    逻辑回归:将线性回归函数的输出,作为Sigmoid函数的输入,然后输出为0-1之间的
"""


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


def LR(data_path="E:/Desktop/机器学习_新/数据集/癌症数据集/Prostate_Cancer.csv"):
    """Train and evaluate logistic regression on a cancer diagnosis CSV.

    The second column of the file is used as the target and every column
    after it as a feature. The target labels ``"M"`` and ``"B"`` are encoded
    as 0 and 1. Features are standardised with statistics from the training
    subset; the accuracy on the validation and test subsets, the test
    predictions and a classification report are printed.

    Args:
        data_path: CSV file to read the data from.
    """
    # Load the data set
    cancer = pd.read_csv(data_path)
    pd.set_option("display.max_columns", 100)
    print(cancer.head(5))
    columns = list(cancer.columns)
    print("特征值名称:", columns)

    # Extract features and target
    feature = cancer[columns[2:]]
    print(feature.head(5))
    target = cancer[columns[1]]
    print(target.head(5))

    # Encode the target as 0/1. map() returns a new Series, so the source
    # frame is not modified through a column taken from it.
    target = target.map({"M": 0, "B": 1})
    print(target.head(5))

    # Split: 25% for testing, then 25% of the remainder for validation
    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25)
    print("训练集:", x_train.shape, y_train.shape)
    print("验证集:", x_val.shape, y_val.shape)
    print("测试集:", x_test.shape, y_test.shape)

    # Standardise with the training statistics only
    std = StandardScaler()
    x_train = std.fit_transform(x_train)
    x_val = std.transform(x_val)
    x_test = std.transform(x_test)

    # Build and train the model
    lg = LogisticRegression()
    lg.fit(x_train, y_train)

    # Validate
    score_val = lg.score(x_val, y_val)
    print("在验证集上的得分:", score_val)

    # Test
    score_test = lg.score(x_test, y_test)
    print("在测试集上的得分:", score_test)

    # Predict
    predict = lg.predict(x_test)
    print(predict)

    # Precision, recall and F1 per class. The names follow the encoding
    # above: 0 is "M" and 1 is "B".
    # NOTE(review): assumes "M" means malignant and "B" benign - confirm
    # against the data set's documentation.
    print(classification_report(y_test, predict, labels=[0, 1], target_names=["恶性", "良性"]))


# Run the logistic-regression demo only when this file is executed as a script.
if __name__ == "__main__":
    LR()


================================================
FILE: 随机森林.py
================================================
"""
    随机森林是一种同质的集成学习算法,通过构建多个决策树,然后结合多个决策树的结果,得到更好的预测
"""

import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


def forest(data_path="E:/Desktop/机器学习_新/数据集/泰坦尼克数据集/train.csv"):
    """Tune and evaluate a random forest on the Titanic data.

    Uses the ``Pclass``, ``Age``, ``Fare`` and ``Sex`` columns to predict
    ``Survived``. Missing ages are replaced by the mean age and the records
    are vectorised with ``DictVectorizer``. A 5-fold cross-validated grid
    search over ``n_estimators`` and ``max_depth`` is run on the training
    subset; the test accuracy, the best cross-validation score, the best
    parameters and the best estimator are printed.

    Args:
        data_path: CSV file to read the Titanic training data from.
    """
    # Load the data
    titan = pd.read_csv(data_path)

    # Build features and target. The copy makes `feature` an independent
    # frame, so editing it below does not act on a slice of `titan`.
    feature = titan[["Pclass", "Age", "Fare", "Sex"]].copy()
    target = titan["Survived"]

    # Report which columns contain missing values
    print(pd.isnull(feature).any())
    # Fill missing ages with the mean age
    feature["Age"] = feature["Age"].fillna(feature["Age"].mean())

    # Dictionary feature extraction (one-hot encodes the string column)
    dv = DictVectorizer()
    feature = dv.fit_transform(feature.to_dict(orient="records"))
    feature = feature.toarray()
    print(feature)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement when it exists and fall back for older releases.
    if hasattr(dv, "get_feature_names_out"):
        feature_names = list(dv.get_feature_names_out())
    else:
        feature_names = dv.get_feature_names()
    print(feature_names)

    # Split: 25% for testing; validation is handled by cross-validation
    x_train, x_test, y_train, y_test = train_test_split(feature, target, test_size=0.25)
    print("训练集:", x_train.shape, y_train.shape)
    print("测试集:", x_test.shape, y_test.shape)

    # Build the model
    rf = RandomForestClassifier()

    # Hyper-parameter search with 5-fold cross-validation
    param = {"n_estimators": [10, 20, 30, 40], "max_depth": [25, 35, 45]}
    gc = GridSearchCV(rf, param_grid=param, cv=5)

    # Train
    gc.fit(x_train, y_train)

    # Results of the cross-validated grid search
    print("在测试集上的准确率:", gc.score(x_test, y_test))
    print("在验证集上的准确率:", gc.best_score_)
    print("最好的模型参数:", gc.best_params_)
    print("最好的模型:", gc.best_estimator_)


# Run the random-forest demo only when this file is executed as a script.
if __name__ == "__main__":
    forest()

Download .txt
gitextract_kub6qg91/

├── KNN.py
├── README.md
├── SVM.py
├── sklearn数据集获取.py
├── 决策树.py
├── 划分数据集.py
├── 岭回归.py
├── 朴素贝叶斯.py
├── 特征工程.py
├── 线性回归.py
├── 逻辑回归.py
└── 随机森林.py
Download .txt
SYMBOL INDEX (20 symbols across 11 files)

FILE: KNN.py
  function knn (line 14) | def knn():

FILE: SVM.py
  function svm (line 13) | def svm():

FILE: sklearn数据集获取.py
  function iris_datasets (line 5) | def iris_datasets():
  function boston_datasets (line 19) | def boston_datasets():
  function newsgroups (line 33) | def newsgroups():

FILE: 决策树.py
  function de_tree (line 13) | def de_tree():

FILE: 划分数据集.py
  function split_datasets (line 6) | def split_datasets():

FILE: 岭回归.py
  function ridge (line 14) | def ridge():

FILE: 朴素贝叶斯.py
  function bayes (line 18) | def bayes():

FILE: 特征工程.py
  function nd_da (line 20) | def nd_da():
  function dictvec (line 45) | def dictvec():
  function text (line 61) | def text():
  function missing_value (line 81) | def missing_value():
  function min_max (line 116) | def min_max():
  function standard (line 124) | def standard():
  function var (line 133) | def var():
  function pca (line 142) | def pca():

FILE: 线性回归.py
  function linear (line 14) | def linear():

FILE: 逻辑回归.py
  function LR (line 13) | def LR():

FILE: 随机森林.py
  function forest (line 12) | def forest():
Condensed preview — 12 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (24K chars).
[
  {
    "path": "KNN.py",
    "chars": 1413,
    "preview": "\"\"\"\n    KNN算法也叫做K近邻算法,它的主要思想是:\n        计算测试样本与训练集中各个样本之间的距离,选择与测试样本距离最近的K个,然后统计这K个样本中出现标记最多的那个,\n        将这个标记作为测试样本的标记\n\""
  },
  {
    "path": "README.md",
    "chars": 78,
    "preview": "# sklearn_mochine_learning\n使用sklearn实现机器学习的算法,包括了线性回归、岭回归、逻辑回归、朴素贝叶斯、决策树、随机森林\n"
  },
  {
    "path": "SVM.py",
    "chars": 1253,
    "preview": "\"\"\"\n    支持向量机:通过寻找划分超平面来进行分类的算法,这个划分超平面只由支持向量有关,与其他样本无关\n\"\"\"\n\n\nimport pandas as pd\nfrom sklearn.datasets import load_iris"
  },
  {
    "path": "sklearn数据集获取.py",
    "chars": 1246,
    "preview": "from sklearn.datasets import load_iris, load_boston, fetch_20newsgroups\n\n\n# 加载鸢尾花数据集(分类数据集)\ndef iris_datasets():\n    iri"
  },
  {
    "path": "决策树.py",
    "chars": 2025,
    "preview": "\"\"\"\n    通过树形结构来实现分类的一种算法,关键在于如何选择最优属性\n    通常用三种方式:信息增益(ID3)、增益率(C4.5)、基尼系数(CART)\n\"\"\"\n\nimport pandas as pd\nimport numpy a"
  },
  {
    "path": "划分数据集.py",
    "chars": 705,
    "preview": "from sklearn.datasets import load_iris\nfrom sklearn.model_selection import train_test_split\n\n\n# 将数据集划分成训练集、验证集、测试集\ndef s"
  },
  {
    "path": "岭回归.py",
    "chars": 1074,
    "preview": "\"\"\"\n    带有正则化的线性回归\n\"\"\"\n\n\nimport pandas as pd\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_selection impor"
  },
  {
    "path": "朴素贝叶斯.py",
    "chars": 1869,
    "preview": "\"\"\"\n    贝叶斯分类器的主要思想:\n        已知一个含有标记的数据集,此时来了一个测试样本,我们知道测试样本的特征,需要预测标记,\n        若我们能够求出这个样本属于各个类别的概率,那么从中选择概率最大的就可以了,那么"
  },
  {
    "path": "特征工程.py",
    "chars": 4303,
    "preview": "\"\"\"\n特征工程:\n    特征抽取:\n        字典特征抽取\n        文本特征抽取\n    特征预处理:\n        归一化\n        标准化\n        缺失值处理\n    特征降维:\n        过滤式"
  },
  {
    "path": "线性回归.py",
    "chars": 1556,
    "preview": "\"\"\"\n    线性回归:通过构建线性模型来进行预测的一种回归算法\n\"\"\"\n\n\nimport pandas as pd\nfrom sklearn.datasets import load_boston\nfrom sklearn.model_"
  },
  {
    "path": "逻辑回归.py",
    "chars": 1712,
    "preview": "\"\"\"\n    逻辑回归:将线性回归函数的输出,作为Sigmoid函数的输入,然后输出为0-1之间的\n\"\"\"\n\n\nimport pandas as pd\nfrom sklearn.model_selection import train_t"
  },
  {
    "path": "随机森林.py",
    "chars": 1515,
    "preview": "\"\"\"\n    随机森林是一种同质的集成学习算法,通过构建多个决策树,然后结合多个决策树的结果,得到更好的预测\n\"\"\"\n\nimport pandas as pd\nfrom sklearn.feature_extraction import "
  }
]

About this extraction

This page contains the full source code of the goodboyv/Sklearn_Mochine_leanring GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 12 files (18.3 KB), approximately 6.7k tokens, and a symbol index with 20 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!