A quick analysis
URL to scrape: https://movie.douban.com/subject/26588308/comments
Goal: separate the comments into positive, neutral, and negative reviews:
- From the address bar we can see that the review type is controlled by the percent_type query parameter (see the example URLs after this list):
  - positive reviews: h
  - neutral reviews: m
  - negative reviews: l
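For instance, the filtered list pages would be fetched with URLs along these lines. This is only a sketch: the percent_type values come from the address-bar observation above, and the other query parameters are assumed to be the same ones used by the full script later in this post.

# Sketch: one URL per review type, using the percent_type values observed above
base = "https://movie.douban.com/subject/26588308/comments"
for percent_type in ("h", "m", "l"):  # positive / neutral / negative
    print(base + "?percent_type=" + percent_type + "&limit=20&status=P&sort=new_score")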
However, we want to pick out the positive, neutral, and negative reviews from within the full comment list, so the percent_type filter is not what we need.
We notice, though, that every comment carries a star rating, and the number of stars is enough to tell the review types apart:
- positive: 5 or 4 stars
- neutral: 3 stars
- negative: 2 stars or 1 star
The stars live in a span tag whose class encodes the star count (allstar50, allstar40, and so on).
So we can grab every comment-item, check which allstarXX class appears inside it, and sort the comment into the right bucket.
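As a quick illustration of that idea, here is a minimal sketch that classifies a single hand-written comment-item fragment. The class names comment-item, allstarXX, and short are the same ones used by the full script below, but the HTML sample itself is made up rather than copied from Douban.

from bs4 import BeautifulSoup

# hand-written sample of a single comment-item (not real Douban markup)
sample = '''
<div class="comment-item">
  <span class="allstar40 rating"></span>
  <span class="short">Pretty good movie.</span>
</div>
'''

soup = BeautifulSoup(sample, 'lxml')
for item in soup.find_all("div", attrs={"class": "comment-item"}):
    text = item.find("span", attrs={"class": "short"}).text
    if item.find("span", attrs={"class": "allstar50"}) or item.find("span", attrs={"class": "allstar40"}):
        print("positive:", text)  # 5 or 4 stars
    elif item.find("span", attrs={"class": "allstar30"}):
        print("neutral:", text)   # 3 stars
    elif item.find("span", attrs={"class": "allstar20"}) or item.find("span", attrs={"class": "allstar10"}):
        print("negative:", text)  # 2 or 1 star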
Code
# Find the positive, neutral, and negative reviews across the first 10 pages of comments
import urllib.request
from bs4 import BeautifulSoup
import time

absolute = "https://movie.douban.com/subject/26588308/comments"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36',
}
comment_list_h = []  # positive reviews
comment_list_m = []  # neutral reviews
comment_list_l = []  # negative reviews

# parse the HTML of one comment page
def get_data(html):
    soup = BeautifulSoup(html, 'lxml')
    if soup.string is not None:
        # the page collapsed to a single string, so there are no comments to parse
        return 0
    else:
        div = soup.findAll(name="div", attrs={"class": "comment-item"})  # using find_all did not work here
        for each in div:
            if each.find("span", attrs={"class": "allstar50"}) or each.find("span", attrs={"class": "allstar40"}):
                textword_h = each.find("span", attrs={"class": "short"}).text
                comment_list_h.append(textword_h)
            if each.find("span", attrs={"class": "allstar30"}):
                textword_m = each.find("span", attrs={"class": "short"}).text
                comment_list_m.append(textword_m)
            if each.find("span", attrs={"class": "allstar20"}) or each.find("span", attrs={"class": "allstar10"}):
                textword_l = each.find("span", attrs={"class": "short"}).text
                comment_list_l.append(textword_l)

# fetch the HTML of one comment page
def get_html(absolute, i):
    url = absolute + '?start=' + str(i) + '&limit=20&status=P&sort=new_score'
    print(url)
    request = urllib.request.Request(url=url, headers=headers)
    html = urllib.request.urlopen(request).read().decode("UTF-8")
    flag = get_data(html)
    if flag == 0:
        return 0

# write the classified reviews to a file
def save_txt(h, m, l):
    with open("comment_type.txt", "w", newline='', encoding="utf-8") as f:
        j = 1
        f.write('好评:')
        f.write("\n")
        for i in h:
            f.write('(' + str(j) + ')' + i)
            f.write("\n")
            j += 1
        f.write('一般:')
        f.write("\n")
        k = 1
        for i in m:
            f.write('(' + str(k) + ')' + i)
            f.write("\n")
            k += 1
        f.write('差评:')
        f.write("\n")
        p = 1
        for i in l:
            f.write('(' + str(p) + ')' + i)
            f.write("\n")
            p += 1


if __name__ == '__main__':
    i = 0
    for j in range(0, 10):
        flag = get_html(absolute, i)
        time.sleep(2)
        i += 20
        if flag == 0:
            break
    save_txt(comment_list_h, comment_list_m, comment_list_l)