新闻分类器

更新时间:2024-04-28 18:39:01 阅读量: 综合文库 文档下载

说明:文章内容仅供预览,部分内容可能不全。下载后的文档,内容与下面显示的完全一致。下载之前请确认下面内容是否您想要的,是否完整无缺。

import pandas as pd
import jieba  # pip install jieba

# Load the news dataset: a tab-separated file with four columns.
df_news = pd.read_table(
    './data/val.txt',
    names=['category', 'theme', 'URL', 'content'],
    encoding='utf-8',
)
df_news = df_news.dropna()  # drop rows with any missing field
df_news.head()

df_news.shape

分词:使用结巴(jieba)分词器

# Tokenize every article with jieba.
content = df_news.content.values.tolist()
print(content[1000])

content_S = []
for line in content:
    current_segment = jieba.lcut(line)
    # Keep articles that produced more than one token and are not a bare
    # newline. BUG FIX: the original compared the token *list* to the
    # string '\r\n' (always unequal, so the check never fired); the
    # intended comparison is against the raw line.
    if len(current_segment) > 1 and line != '\r\n':
        content_S.append(current_segment)

content_S[1000]

df_content = pd.DataFrame({'content_S': content_S})
df_content.head()

# Load the stopword list: one word per line; quoting=3 (QUOTE_NONE) keeps
# quote characters as literal tokens.
# NOTE(review): the path and most keyword arguments were lost in extraction;
# this tutorial conventionally ships 'stopwords.txt' — confirm the real path.
stopwords = pd.read_csv(
    'stopwords.txt',
    index_col=False,
    sep='\t',
    quoting=3,
    names=['stopword'],
    encoding='utf-8',
)
stopwords.head(20)

def drop_stopwords(contents, stopwords):
    """Remove stopwords from tokenized documents.

    Parameters
    ----------
    contents : list[list[str]]
        Tokenized documents (one token list per document).
    stopwords : list[str]
        Words to filter out.

    Returns
    -------
    tuple[list[list[str]], list[str]]
        ``(contents_clean, all_words)`` — the cleaned documents, and a flat
        list of every surviving token (stringified) for frequency counting.
    """
    # Set gives O(1) membership tests; the original scanned a list per token.
    stopword_set = set(stopwords)
    contents_clean = []
    all_words = []
    for line in contents:
        line_clean = []
        for word in line:
            if word in stopword_set:
                continue
            line_clean.append(word)
            all_words.append(str(word))
        contents_clean.append(line_clean)
    return contents_clean, all_words

# Run stopword removal over the segmented corpus.
contents = df_content.content_S.values.tolist()
stopwords = stopwords.stopword.values.tolist()

contents_clean, all_words = drop_stopwords(contents, stopwords)

# df_content.content_S.isin(stopwords.stopword)
# df_content = df_content[~df_content.content_S.isin(stopwords.stopword)]
# df_content.head()

df_content = pd.DataFrame({'contents_clean': contents_clean})
df_content.head()

df_all_words = pd.DataFrame({'all_words': all_words})
df_all_words.head()

# Count token frequencies and sort descending.
# NOTE(review): the original call was truncated by extraction; it used the
# dict form .agg({"count": numpy.size}), which modern pandas has removed —
# named aggregation below is the supported equivalent.
words_count = (
    df_all_words.groupby(by=['all_words'])['all_words']
    .agg(count='size')
    .reset_index()
    .sort_values(by=['count'], ascending=False)
)
words_count.head()

from wordcloud import WordCloud
import matplotlib.pyplot as plt
# %matplotlib inline  # Jupyter magic — only valid inside a notebook cell
import matplotlib

matplotlib.rcParams['figure.figsize'] = (10.0, 5.0)

# NOTE(review): the WordCloud arguments were truncated in extraction; a CJK
# font (e.g. SimHei) is required to render Chinese glyphs — confirm the path.
wordcloud = WordCloud(
    font_path='./data/simhei.ttf',
    background_color='white',
    max_font_size=80,
)

# Build {word: count} from the 100 most frequent tokens and draw the cloud.
word_frequence = {x[0]: x[1] for x in words_count.head(100).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)

TF-IDF :提取关键词

import jieba.analyse

index = 2400
print(df_news['content'][index])

# NOTE(review): these two lines were truncated in extraction; joining the
# tokens back into one string and extracting the top-5 TF-IDF keywords is
# the standard form of this step — confirm topK against the original.
content_S_str = ''.join(content_S[index])
print('  '.join(jieba.analyse.extract_tags(content_S_str, topK=5, withWeight=False)))

LDA :主题模型

# LDA topic model. Required input format: list of token lists — the whole
# segmented, stopword-cleaned corpus.
from gensim import corpora, models, similarities
import gensim
# http://radimrehurek.com/gensim/

# Build the token -> id mapping (bag-of-words dictionary).
dictionary = corpora.Dictionary(contents_clean)
corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]

# num_topics is picked by hand, like K in K-means.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
)

# Topic #1: its top-5 weighted words.
print(lda.print_topic(1, topn=5))

for topic in lda.print_topics(num_topics=20, num_words=5):
    # NOTE(review): the loop body was lost in extraction; printing each
    # (topic_id, word-weight string) pair is the usual intent — confirm.
    print(topic)

本文来源:https://www.bwwdw.com/article/7azg.html

Top