Python3文本聚类怎样进行分类操作 - 行业资讯 - 肥雀云

　　介绍

这篇文章给大家分享的是有关 Python3 文本聚类怎样进行分类操作的内容。小编觉得挺实用的,因此分享给大家做个参考。一起跟随小编过来看看吧。

VSM(向量空间模型)

"""Cluster article titles with a bag-of-words vector space model and k-means.

Pipeline: segment each title with jieba, drop stop words, turn the segmented
titles into a term-count matrix, cluster the rows with KMeans and write one
cluster label per line to a text file.
"""
# Base libraries, copied from an online example; apart from numpy, pandas and
# jieba the others are probably not needed.
import numpy as np
import pandas as pd
import re
import os
import codecs

# The files live on the desktop; change these paths to suit your machine.
TITLE_FILE = "C:/Users/KangB/Desktop/wechat7/title.txt"
SEGMENTED_FILE = "C:/Users/KangB/Desktop/wechat7/title_fenci.txt"
STOPWORDS_FILE = "C:/Users/KangB/Desktop/wechat7/stopwords.txt"
CLUSTERS_FILE = "C:/Users/KangB/Desktop/wechat7/title_clusters.txt"


def get_custom_stopwords(stop_words_file):
    """Load a stop-word list (found online) from a UTF-8 text file.

    Args:
        stop_words_file: Path of a file holding one stop word per line.

    Returns:
        The file content split on ``'\\n'`` as a list of strings. A file that
        ends with a newline therefore yields a trailing empty string.
    """
    with open(stop_words_file, encoding='utf-8') as f:
        stopwords = f.read()
    return stopwords.split('\n')


def main():
    """Run segmentation, vectorisation and clustering, then save the labels."""
    # Third-party imports are deferred so that get_custom_stopwords() can be
    # imported without jieba / scikit-learn being installed.
    import jieba
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.cluster import KMeans

    # Segment every title and write the space-joined tokens to a new file.
    with open(TITLE_FILE, "r", encoding='gb2312', errors='ignore') as f1, \
            open(SEGMENTED_FILE, 'w', encoding='gb2312', errors='ignore') as f2:
        for line in f1:
            seg_list = jieba.cut(line, cut_all=False)
            f2.write(" ".join(seg_list).replace("\t\t\t", "\t"))

    # Read the segmented titles back: a list with one segmented title per
    # element. Inspect `titles` / `len(titles)` here to sanity-check it.
    with open(SEGMENTED_FILE, encoding='gb2312', errors='ignore') as f:
        titles = f.read().split('\n')

    # Load the stop words (also a plain list).
    stopwords = get_custom_stopwords(STOPWORDS_FILE)

    # Build the term-count matrix: the segmented titles minus stop words, in a
    # form that KMeans accepts.
    count_vec = CountVectorizer(stop_words=stopwords)
    km_matrix = count_vec.fit_transform(titles)
    print(km_matrix.shape)
    # Use print(km_matrix.toarray()) to look at the vectors themselves.

    # Clustering.
    num_clusters = 4  # four clusters; adjust as needed
    km = KMeans(n_clusters=num_clusters)
    km.fit(km_matrix)
    # One label per title; len(clusters) should equal len(titles).
    clusters = km.labels_.tolist()

    # Finally write the cluster labels, one per line, to a new text file.
    with open(CLUSTERS_FILE, 'w', encoding='gb2312', errors='ignore') as f3:
        for i in clusters:
            f3.write(str(i))
            f3.write("\n")


if __name__ == "__main__":
    main()