# 爬取代码 (scraping code):
import requests
from bs4 import BeautifulSoup
import re
import urllib.request,urllib.error
import xlwt
import sqlite3
# NOTE: `from fake_useragent import UserAgent` with
# `{'User-Agent': str(UserAgent().random)}` was left commented out in the
# original as an alternative to the fixed User-Agent used below.
def askURL(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    On success the HTTP status code is printed. If ``urllib`` raises a
    ``URLError`` (``HTTPError`` is a subclass), the error's ``code`` and/or
    ``reason`` are printed and an empty string is returned.
    """
    # The original literal had stray spaces inside the product tokens
    # ("Mozilla / 5.0(...", "likeGecko", "...89Safari / 537.36"), which is not
    # a well-formed browser User-Agent; this is the normalised form.
    head = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/84.0.4147.89 Safari/537.36"
        )
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # A single request: the status is read from this response. The
        # original issued an extra `requests.get(url, head)` just to print the
        # status, which passed the headers as query parameters and doubled
        # the traffic per page.
        with urllib.request.urlopen(request) as response:
            print(response.status)
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def getData(baseurl, pages=range(0, 60)):
    """Collect review rows from the paginated listing at *baseurl*.

    Args:
        baseurl: URL prefix; each page number is appended to it as text.
        pages: Iterable of page numbers to fetch. Defaults to
            ``range(0, 60)``, the range the original hard-coded.

    Returns:
        A list of rows ``[stars, review_date, review, review_useful]``.
        ``stars`` and ``review_useful`` are the (possibly empty) lists
        returned by ``re.findall``; ``review_date`` and ``review`` are strings.
    """
    datalist = []
    for i in pages:
        print("di%d" % (i))
        url = baseurl + str(i)
        html = askURL(url)
        # 2. Parse each page.
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="a-section celwidget"):
            # The find_* patterns are the module-level compiled regexes; they
            # are applied to the element's HTML text.
            item = str(item)
            dates = re.findall(find_review_date, item)
            spans = re.findall(find_review, item)
            # The review text is taken from the second bare <span> match, as
            # in the original. A matched div without a date or without two
            # spans is skipped instead of raising IndexError and aborting
            # the whole crawl.
            if not dates or len(spans) < 2:
                continue
            data = [
                re.findall(find_star, item),
                dates[0],
                spans[1],
                re.findall(find_review_useful, item),
            ]
            datalist.append(data)
    return datalist
# Compiled patterns applied by getData to each review element's HTML.
# They are defined before the entry-point guard at the bottom so that they
# exist by the time main() runs.
find_star = re.compile(r'<span class="a-icon-alt">(.*),最多 5 颗星</span>')  # star rating
find_review_date = re.compile(r'<span class="a-size-base a-color-secondary review-date" data-hook="review-date">(.*)在.*?</span>')
find_review = re.compile(r'<span>(.*?)</span>', re.S)
find_review_useful = re.compile(r'<span class="a-size-base a-color-tertiary cr-vote-text" data-hook="helpful-vote-statement">(.*?)</span>', re.S)


def saveData(datalist, savepath):
    """Write the scraped rows to an .xls workbook at *savepath*.

    Args:
        datalist: Sequence of rows; the first four entries of each row are
            written under the four header columns.
        savepath: Destination path passed to ``Workbook.save``.
    """
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('亚马逊商品评论', cell_overwrite_ok=True)
    col = ("评论等级", "评论时间", "评论内容", "评论点赞")
    for i, title in enumerate(col):
        sheet.write(0, i, title)
    # Iterate over the rows actually collected. The original looped over a
    # fixed range(0, 600), which raised IndexError with fewer rows and
    # silently dropped any beyond 600.
    for i, data in enumerate(datalist):
        print("第%d条" % (i + 1))
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)


def main():
    """Scrape the product's review pages and save them to a spreadsheet."""
    baseurl = "https://www.amazon.com/-/zh/product-reviews/B010OVNAFQ/ref=cm_cr_getr_d_paging_btm_next_62?ie=UTF8&reviewerType=all_reviews&pageNumber="
    # 1. Scrape the pages.
    datalist = getData(baseurl)
    savepath = "亚马逊商品评论.xls"
    # 3. Save the data.
    saveData(datalist, savepath)


if __name__ == "__main__":
    main()
# 对爬取的评论进行词性分析 (part-of-speech analysis of the scraped reviews)
import xlrd
from nltk import word_tokenize, pos_tag
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

# Part-of-speech frequency chart for the scraped reviews: read the review
# column from the workbook, tag every token, and plot the ten most frequent
# tags.
# NOTE(review): word_tokenize / pos_tag presumably need NLTK's tokenizer and
# tagger data to be installed beforehand - confirm the environment setup.

excel = xlrd.open_workbook("亚马逊商品评论.xls")  # open the Excel file
sheet = excel.sheet_by_index(0)  # first worksheet
rows: list = sheet.row_values(0)  # header row
index = rows.index('评论内容')  # column index of the review-text column
# Start at row 1 so the header label itself is not analysed as review text.
listindes = sheet.col_values(index, start_rowx=1)

# Join with a space (and replace line breaks with a space) so the last word
# of one review or line is not fused with the first word of the next.
# str() guards against non-text cells.
text = " ".join(str(cell) for cell in listindes)
text = text.replace('\n', ' ').replace('</br>', ' ').replace('<br/>', ' ')

tokens = word_tokenize(text)  # split the text into word tokens
tagged = pos_tag(tokens)  # (word, tag) pairs

tags = [tag for _, tag in tagged]
print(set(tags))

# Count how often each tag occurs.
tag_counts = Counter(tags)
# Drop the punctuation tags; a default avoids KeyError when one is absent.
tag_counts.pop('.', None)
tag_counts.pop(',', None)

dic2 = dict(tag_counts.most_common(10))  # ten most frequent tags, descending
x = list(dic2.keys())
y = list(dic2.values())
sns.barplot(x=x, y=y)
plt.xticks(rotation=45)
plt.show()