10 days ago

1.导入数据集

from sklearn.datasets import fetch_20newsgroups
# Load the 20 Newsgroups corpus once at module level (subset="all" asks for
# every available document); the functions below read its ``data`` and
# ``target`` attributes.
news = fetch_20newsgroups(subset="all")

2.实现KNN算法

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
def knn(news, n_neighbors=5, test_size=0.3, random_state=None):
    """Score a k-nearest-neighbours classifier on one random split of ``news``.

    The documents are split into a training and a test part, converted to
    TF-IDF features, and a ``KNeighborsClassifier`` is fitted on the training
    part and scored on the test part.

    :param news: dataset object exposing ``data`` (the documents) and
        ``target`` (their labels).
    :param n_neighbors: number of neighbours handed to
        ``KNeighborsClassifier``.
    :param test_size: forwarded to ``train_test_split``; the default 0.3
        holds out 30% of the documents for scoring.
    :param random_state: seed forwarded to ``train_test_split``. The default
        ``None`` keeps the original behaviour of an unseeded split per call.
    :return: the value of ``score`` on the held-out documents.
    """
    # Hold out part of the corpus for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(
        news.data, news.target, test_size=test_size, random_state=random_state
    )

    # Fit the TF-IDF vocabulary and weights on the training documents only.
    vectorizer = TfidfVectorizer()
    x_train = vectorizer.fit_transform(x_train)

    # Use transform(), not fit_transform(): the test documents must be mapped
    # into the feature space that was learned from the training documents.
    x_test = vectorizer.transform(x_test)

    # Named ``classifier`` rather than ``knn`` so the local variable does not
    # shadow this function's own name.
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(x_train, y_train)

    return classifier.score(x_test, y_test)

3.实现朴素贝叶斯算法

from sklearn.naive_bayes import MultinomialNB
def nbcls(news, alpha=1.0, test_size=0.3, random_state=None):
    """Score a multinomial naive Bayes classifier on one random split of ``news``.

    The documents are split into a training and a test part, converted to
    TF-IDF features, and a ``MultinomialNB`` model is fitted on the training
    part and scored on the test part.

    :param news: dataset object exposing ``data`` (the documents) and
        ``target`` (their labels).
    :param alpha: smoothing parameter handed to ``MultinomialNB``.
    :param test_size: forwarded to ``train_test_split``; the default 0.3
        holds out 30% of the documents for scoring.
    :param random_state: seed forwarded to ``train_test_split``. The default
        ``None`` keeps the original behaviour of an unseeded split per call.
    :return: the value of ``score`` on the held-out documents.
    """
    # Hold out part of the corpus for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(
        news.data, news.target, test_size=test_size, random_state=random_state
    )

    # Fit the TF-IDF vocabulary and weights on the training documents only.
    vectorizer = TfidfVectorizer()
    x_train = vectorizer.fit_transform(x_train)

    # Use transform(), not fit_transform(): the test documents must be mapped
    # into the feature space that was learned from the training documents.
    x_test = vectorizer.transform(x_test)

    classifier = MultinomialNB(alpha=alpha)
    classifier.fit(x_train, y_train)

    return classifier.score(x_test, y_test)

4.多次迭代运算求取平均值

import matplotlib.pyplot as plt
def score_avg(func, num):
    """Call ``func(news)`` ``num`` times and plot the running mean of the scores.

    After every call the mean of all scores so far is recorded; the final
    mean is printed and the whole sequence of running means is plotted
    against the run number.

    :param func: callable that accepts the dataset and returns a numeric
        score.
    :param num: number of runs; must be at least 1.
    :return: the mean of all ``num`` scores (the same value that is printed).
    :raises ValueError: if ``num`` is smaller than 1.
    """
    if num < 1:
        # With no runs there is no average to report; fail with a clear
        # message instead of hitting an unbound variable further down.
        raise ValueError("num must be at least 1, got {!r}".format(num))

    run_numbers = []
    running_means = []
    score_sum = 0
    for i in range(1, num + 1):
        # ``news`` is the dataset defined at module level.
        score_sum += func(news)
        run_numbers.append(i)
        running_means.append(score_sum / i)

    # Report the final average.
    final_mean = running_means[-1]
    print(final_mean)

    # Show how the running average evolved over the runs.
    plt.plot(run_numbers, running_means)
    plt.grid()
    plt.show()

    return final_mean

5.运行结果比较

1)运行KNN算法

# Use 100 runs, like the Naive Bayes experiment: the result reported for this
# experiment describes convergence from about the 60th run, which a 10-run
# experiment cannot show.
num = 100
score_avg(knn, num)


从第60次开始准确率收敛到0.786附近
2)运行朴素贝叶斯(Naive Bayes)算法

# Average the naive Bayes score over 100 independent runs.
num = 100
score_avg(nbcls, num)


从第55次开始准确率收敛到0.845附近

6.结论

在本实验中,朴素贝叶斯(Naive Bayes)算法的平均准确率(约0.845)高于KNN算法(约0.786),因此文本分类采用朴素贝叶斯算法更加合适

← 【机器学习】KNN算法的使用与参数优化 【机器学习】决策树的简单实现与树的结构导出 →