词云:
import jieba
from imageio import imread
from numpy import unicode
from wordcloud import WordCloud,ImageColorGenerator
import matplotlib.pyplot as plt
jieba.load_userdict("stoplist.txt")
"""seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list)) # 全模式seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list)) # 精确模式seg_list = jieba.cut("他来到了网易杭研大厦") # 默认是精确模式
print(", ".join(seg_list))seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") # 搜索引擎模式
print(", ".join(seg_list))"""
#***************************-------test***************
back_color=imread('girl.jpg') #导入背景图片wc=WordCloud(background_color='white', #背景yansemax_words=100, #允许最大词数mask=back_color, #忽略width和heightmax_font_size=100, #显示字体的最大值font_path="D:\\pythonProject2\\simhei.ttf", #解决显示口字型乱码问题random_state=42, #为每个词返回一个PIL颜色)
text=open('data_m_content.txt').read() #打开词源文本def stop_words(texts):words_list=[]#for i in range(0,len(texts)):word_generator=jieba.cut_for_search(texts) #分词with open('stoplist.txt',encoding='utf-8') as f:str_text=f.read()unicode_text=unicode(str_text)f.close()for word in word_generator:if word.strip() not in unicode_text: #去除停用词words_list.append(word)return ' '.join(words_list)text=stop_words(text)wc.generate(text)image_colors=ImageColorGenerator(back_color) #基于彩色图像生成相应彩色
plt.imshow(wc) #显示图片
plt.axis('off') #关闭坐标轴
plt.figure() #绘制词云
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis('off')
wc.to_file('data_m.png') #保存图片
线状图:
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
path="D:\\pythonProject2\\all.csv"
df=pd.read_csv(path)
df.dropna()
data1=[]matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif' #设置中文问题
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = Falsefor x in range(0,len(df['times'])):if df["times"][x][9:11]==' ':data1.append('blank')else:data1.append(df["times"][x][9:11].strip(":"))df['times']=data1
df.value_counts()
path1="D:\pythonProject2\content_hour.txt"
#count={}
with open(path1,encoding='utf-8') as f:lines=f.readlines()#print(lines)for i in range(0,len(lines)):lines[i]=lines[i].strip(' ')if len(lines[i])<3:lines[32]='no 8'#pd1=pd.DataFrame(lines)
#print(pd1)counts={'0':0,'1':0,'2':0,'3':0,'4':0,'5':0,'6':0,'7':0,'8':0,'9':0,'10':0,'11':0,'12':0,'13':0,'14':0,'15':0,'16':0,'17':0,'18':0,'19':0,'20':0,'21':0,'22':0,'23':0}data3=[]
flag=0def count(flag):a=0data2 = []for i in range(0,len(lines)):#print(lines[i][0:2])if lines[i][0:2]=='no':continueif eval(lines[i][0:2])==flag:#print(eval(lines[i][2:-1]))data2.append(eval(lines[i][2:]))#print(data2)for n in range(0,len(data2)):a=a+data2[n]return a#eval(lines[i][2:])
#print(lines)
#print(count(3))
for x in range(0,24):data3.append(count(x))
print(data3)df1=pd.DataFrame(index=counts,columns=['counts'] )
df1['counts']=data3plt.plot(df1)for m, n in zip(df1.index,df1['counts']):plt.text(m,n,n,ha='center',va='bottom',fontsize=8)#print(df1)
#print(count(1))
#print(data2)
#print(data2)
#plt.xticks(rotation=90)
plt.xlabel("时间(小时)")
plt.ylabel("评论(数量)")
plt.show()
柱状图:
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy import unicode
import jiebapd.set_option('display.max_rows',None)path="D:\\pythonProject2\\all.csv"
df=pd.read_csv(path)
df.dropna()
data1=[]matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif' #设置中文问题
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = Falsedf1=df['citys'].value_counts().drop(['blank']) #blank是我清洗数据时给城市为空的赋值,因为也许其他列还有用所以没有直接删
df2=pd.DataFrame(df1[:10])df2.plot(kind='bar')#plt.plot(df2)#print(df1)
#print(data2)
#print(data2)
#plt.xticks(rotation=90)
plt.xlabel("城市")
plt.ylabel("评论(数量)")
plt.xticks(rotation=45)
plt.show()
箱型图:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import pandas as pd
from bokeh.plotting import figure, show, output_file
#北京 828
#上海 495
#广东广州 144
#浙江杭州matplotlib.rcParams['font.sans-serif'] = ['SimHei']
matplotlib.rcParams['font.family']='sans-serif' #设置中文问题
#解决负号'-'显示为方块的问题
matplotlib.rcParams['axes.unicode_minus'] = Falsedata1=[]
data2=[]
data3=[]
data4=[]df= pd.read_csv('all.csv')
#df1=df['citys'].value_counts().drop(['blank']) #blank是我清洗数据时给城市为空的赋值,因为也许其他列还有用所以没有直接删
#df2=pd.DataFrame(df1[:4])
for x in range(0,len(df['scores'])): # 再次清洗去除列表数据中的''if df['citys'][x]=='北京':data1.append(float(df['scores'][x])/50)if df['citys'][x]=='上海':data2.append(float(df['scores'][x]) / 50)if df['citys'][x] == '广东广州':data3.append(float(df['scores'][x]) / 50)if df['citys'][x]=='浙江杭州':data4.append(float(df['scores'][x]) / 50)
"""for y in range(0,len(data1)):if len(data2)<len(data1):data2.append(' ')if len(data3) < len(data1):data3.append(' ')if len(data4) < len(data1):data4.append(' ')"""
print(data3)
df1=pd.DataFrame({'北京':data1})
df2=pd.DataFrame({'上海':data2})
df3=pd.DataFrame({'广东广州':data3})
df4=pd.DataFrame({'浙江杭州':data4})
print(df3)
plt.figure(figsize=(10,4))
# 创建图表、数据
def draw(df1):f = df1.boxplot(return_type='dict')plt.title('城市')for box in f['boxes']:box.set( color='b', linewidth=1) # 箱体边框颜色box.set( alpha=0.5) # 箱体内部填充颜色for whisker in f['whiskers']:whisker.set(color='k', linewidth=0.5,linestyle='-')for cap in f['caps']:cap.set(color='gray', linewidth=2)for median in f['medians']:median.set(color='DarkBlue', linewidth=2)for flier in f['fliers']:flier.set(marker='o', color='y', alpha=0.5)plt.show()
draw(df3)
# boxes, 箱线
# medians, 中位值的横线,
# whiskers, 从box到error bar之间的竖线.
# fliers, 异常值
# caps, error bar横线
# means, 均值的横线