Remove stopwords from, and segment, text that mixes Chinese and English. The stopword list can be one you define yourself or one taken from the web. This is a preprocessing step for text classification and sentiment analysis.
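For reference, the stopwordslist function below reads one stopword per line, so the stopword file is just a plain text file along these lines (the entries here are only illustrative; use whichever list suits your corpus):

的
了
是
the
a
of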
from collections import Counter
import jieba
# jieba.load_userdict('userdict.txt')

# build the stopword list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]  # assumes the stopword file is saved as UTF-8
    return stopwords

# segment a sentence and drop its stopwords
def seg_sentence(sentence):
    sentence_seged = jieba.cut(sentence.strip())  # jieba segmentation object (a generator of tokens)
    stopwords = stopwordslist('E:\\pythonimg\\stopword.txt')  # path of the stopword file; extra custom stopwords can be added there
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    return outstr

inputs = open('E:\\pythonimg\\comment.txt', 'r', encoding='utf-8')  # path of the file to process
outputs = open('E:\\pythonimg\\已去除停用词.txt', 'w', encoding='utf-8')  # path of the processed output file
for line in inputs:
    line_seg = seg_sentence(line)  # the return value here is a string
    outputs.write(line_seg)
outputs.close()
inputs.close()
#wordcount
with open('E:\\pythonimg\\已去除停用词.txt', 'r', encoding='utf-8') as fr:  # read back the file that already has stopwords removed
    data = jieba.cut(fr.read())
data = dict(Counter(data))
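data now maps each token to its count. A minimal follow-up sketch for inspecting the result (the variable name top_words and the cutoff of 10 are my own choices, not part of the original script):

# sort the counts in descending order and print the 10 most frequent tokens
top_words = sorted(data.items(), key=lambda kv: kv[1], reverse=True)[:10]
for word, freq in top_words:
    print(word, freq)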
Test example
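A minimal sketch of such a test, assuming stopword.txt exists at the path hard-coded in seg_sentence (the sample sentence is made up for illustration):

# run a single mixed Chinese/English sentence through seg_sentence
sample = '这部电影的plot很精彩，但是结尾有点weak'
print(seg_sentence(sample))  # segmented text with stopwords removed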