Python实现基于皮尔森系数的协同过滤电影推荐。
- 爬虫获取用户数据
# -*- coding: utf-8 -*-
"""
Crawl the reviewers of one Douban movie and save their ratings as JSON.

The script walks the "hot comments" pages of a single movie, collects the
home page of every commenting user, then walks each user's "collect"
(watched) pages and records the rating and short comment given to every
movie.  The result is written to ``movie_data.json`` with the layout::

    {user_id: {"people_url": str,
               "movies": {title: {"movie_rate": str, "movie_comment": str}}}}

A user for whom no rated movie was found has no "movies" key.

NOTE(review): the original note says the reviewers come from the top 100
hot comments of "The Shawshank Redemption"; the subject id hard-coded in
the URL below is 27010768 and 10 pages of 20 comments are requested --
confirm both against the intended movie and user count.
"""
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import json
import urllib
import urllib.request
import requests  # kept from the original; requests.get() is an alternative fetch

# Number of comment pages to walk and comments per page.  The original
# comment claimed "5*20 = 100 users" but the loop has always requested 10
# pages; change USER_PAGES to alter the number of users.
USER_PAGES = 10
USERS_PER_PAGE = 20
# 6 pages * 15 entries = the first 90 rated movies of every user.
REVIEW_PAGES = 6
REVIEWS_PER_PAGE = 15

people_names = []
people_urls = []
# Extracts the user id from a profile URL (the part after ".../people/").
r = re.compile(r'e/(.+)/')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3724.8 Safari/537.36',
    'Referer': 'https://movie.douban.com/subject/26100958/comments',
    'Connection': 'keep-alive',
}


def fetch_soup(url):
    """Download *url* with the crawler headers and return the parsed page."""
    req = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(req) as response:
        html = response.read().decode('utf-8')
    return BeautifulSoup(html, 'html.parser')


print("爬取用户中 ...")
for i in range(USER_PAGES):
    url = ("https://movie.douban.com/subject/27010768/comments?"
           "start=" + str(i * USERS_PER_PAGE)
           + "&limit=20&sort=new_score&status=P&percent_type=")
    bs = fetch_soup(url)
    comments = bs.findAll("div", {"class": "comment"})
    # Remember the home page of every commenting user.
    for comment in comments:
        # The second link of a comment is the author's profile; it is
        # rewritten from the "www" host to the "movie" host.
        people_url = comment.findAll("a")[1].attrs["href"].replace("www", "movie")
        name = re.findall(r, people_url)[0]
        people_names.append(name)
        people_urls.append(people_url)
print("爬取用户完成")

final_data = {}
for name, people_url in zip(people_names, people_urls):
    final_data.setdefault(name, {})
    final_data[name]["people_url"] = people_url

print("爬取用户影评中...")
for user_count, people_name in enumerate(final_data, start=1):
    print("正在爬取第" + str(user_count) + "位用户" + people_name + "的影评信息")
    for i in range(REVIEW_PAGES):
        comment_url_suffix = ("collect?start=" + str(i * REVIEWS_PER_PAGE)
                              + "&sort=time&rating=all"
                              "&filter=all&mode=grid")
        comment_url = final_data[people_name]["people_url"] + comment_url_suffix
        bs = fetch_soup(comment_url)
        grid = bs.find("div", {"class": "grid-view"})
        if grid is None:
            # No movie grid on this page (e.g. nothing to show); skip it
            # instead of aborting the whole crawl with an AttributeError.
            continue
        for info in grid.findAll("div", {"class": "info"}):
            movie_name = info.em.get_text()  # the title lives in the <em> tag
            try:
                # The rating is the digit inside the CSS class of the span
                # in the third <li>; entries without a rating are skipped.
                rating_class = info.findAll("li")[2].span.attrs["class"][0]
                movie_rate = re.search("[0-9]", rating_class).group()
            except (AttributeError, IndexError, KeyError, TypeError):
                continue
            comment_tag = info.find("span", {"class": "comment"})
            movie_comment = comment_tag.get_text() if comment_tag is not None else ""
            movies = final_data[people_name].setdefault("movies", {})
            entry = movies.setdefault(movie_name, {})
            entry["movie_rate"] = movie_rate
            entry["movie_comment"] = movie_comment
print("爬取用户影评完成")

with open('movie_data.json', 'w', encoding='utf-8') as file:
    json.dump(final_data, file, ensure_ascii=False)

# Read the file back once so a broken dump is noticed immediately.
with open('movie_data.json', 'r', encoding='utf-8') as file:
    s = json.load(file)
- 爬虫获取待推荐用户数据(默认自己):
# -*- coding: utf-8 -*-
"""
Crawl the ratings of the user who wants recommendations (by default: you).

The user's "collect" (watched) pages on Douban are walked and every rated
movie is merged into the existing ``movie_data.json``, using the same
layout as the other users in that file, so that users with a similar taste
can be looked up afterwards.
"""
import json
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import urllib
import urllib.request

# 6 pages * 15 entries = the first 90 rated movies of the user.
REVIEW_PAGES = 6
REVIEWS_PER_PAGE = 15

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/74.0.3724.8 Safari/537.36',
    'Referer': 'https://movie.douban.com/subject/26100958/comments',
    'Connection': 'keep-alive',
}

with open('movie_data.json', 'r', encoding='utf-8') as file:
    movie_data = json.load(file)

# Put your own Douban user id here.
people_name = "204331023"
url = "https://movie.douban.com/people/" + people_name + "/"
movie_data.setdefault(people_name, {})
movie_data[people_name]["people_url"] = url

for i in range(REVIEW_PAGES):
    comment_url_suffix = ("collect?start=" + str(i * REVIEWS_PER_PAGE)
                          + "&sort=time&rating=all"
                          "&filter=all&mode=grid")
    comment_url = movie_data[people_name]["people_url"] + comment_url_suffix
    req = urllib.request.Request(url=comment_url, headers=headers)
    with urllib.request.urlopen(req) as response:
        comment_data = response.read().decode('utf-8')
    bs = BeautifulSoup(comment_data, 'html.parser')
    grid = bs.find("div", {"class": "grid-view"})
    if grid is None:
        # No movie grid on this page; skip it instead of crashing with an
        # AttributeError on None.
        continue
    for info in grid.findAll("div", {"class": "info"}):
        movie_name = info.em.get_text()  # the title lives in the <em> tag
        try:
            # The rating is the digit inside the CSS class of the span in
            # the third <li>; entries without a rating are skipped.
            rating_class = info.findAll("li")[2].span.attrs["class"][0]
            movie_rate = re.search("[0-9]", rating_class).group()
        except (AttributeError, IndexError, KeyError, TypeError):
            continue
        comment_tag = info.find("span", {"class": "comment"})
        movie_comment = comment_tag.get_text() if comment_tag is not None else ""
        movies = movie_data[people_name].setdefault("movies", {})
        entry = movies.setdefault(movie_name, {})
        entry["movie_rate"] = movie_rate
        entry["movie_comment"] = movie_comment

with open('movie_data.json', 'w', encoding='utf-8') as file:
    json.dump(movie_data, file, ensure_ascii=False)
- 实现电影推荐
# -*- coding: utf-8 -*-
"""
Recommend movies with user-based collaborative filtering.

Users whose ratings correlate with mine (Pearson coefficient) are looked
up, and the movies they rated that I have not rated are ranked by a
similarity-weighted average of their ratings.

NOTE(review): the original note promises 20 recommendations, while the
entry point at the bottom of this script requests n=5.
"""
import json
from math import sqrt

# The two crawler scripts write 'movie_data.json'; the original code read
# 'movie_data1.json' here, which does not exist when the three scripts are
# simply run one after another.
with open('movie_data.json', 'r', encoding='utf-8') as file:
    movie_data = json.load(file)

# Put your Douban user id here.
my_name = "204331023"

# sim_pearson returns the Pearson correlation coefficient of p1 and p2,
# i.e. how similar the taste of the two users is.
def sim_pearson(data, p1, p2):
    """Return the Pearson correlation of the ratings of two users.

    Only movies rated by both users are considered.

    :param data: crawled data, ``{user: {"movies": {title: {"movie_rate": ...}}}}``
    :param p1: key of the first user in *data*
    :param p2: key of the second user in *data*
    :return: correlation in [-1, 1]; 0 when the users share no movie, when
        either user has no "movies" entry, or when one of them gave the
        same rating to every shared movie (correlation undefined)
    :raises KeyError: if *p1* or *p2* is not a key of *data*
    """
    # A user for whom the crawler found no rated movie has no "movies" key.
    movies1 = data[p1].get("movies", {})
    movies2 = data[p2].get("movies", {})
    shared = [item for item in movies1 if item in movies2]
    if not shared:
        return 0

    n = len(shared)
    # Ratings are stored as digit strings; convert them once.
    rates1 = [int(movies1[item]["movie_rate"]) for item in shared]
    rates2 = [int(movies2[item]["movie_rate"]) for item in shared]

    sum1 = sum(rates1)
    sum2 = sum(rates2)
    sum1_sq = sum(rate * rate for rate in rates1)
    sum2_sq = sum(rate * rate for rate in rates2)
    p_sum = sum(a * b for a, b in zip(rates1, rates2))

    # Pearson coefficient: covariance over the product of the deviations.
    num = p_sum - (sum1 * sum2 / n)
    spread = (sum1_sq - pow(sum1, 2) / n) * (sum2_sq - pow(sum2, 2) / n)
    if spread <= 0:
        # Zero spread means a constant rating list; "<" only guards sqrt()
        # against a negative rounding artefact.
        return 0
    return num / sqrt(spread)


def top_matches(data, person, similarity=sim_pearson, min_sim=0.5):
    """Return *person* plus every user whose similarity reaches *min_sim*.

    :param data: crawled data, keyed by user
    :param person: key of the user to compare everybody else with
    :param similarity: callable ``(data, user_a, user_b) -> number``
    :param min_sim: minimum similarity a user needs to be kept
    :return: dict with the entry of *person* and the entries of all
        sufficiently similar users (same value objects as in *data*)
    """
    sorted_data = {person: data[person]}
    for other in data:
        if other == person:
            continue
        if similarity(data, person, other) >= min_sim:
            sorted_data[other] = data[other]
            print(other, sorted_data[other])
    return sorted_data


def get_recommendations(data1, person, n=5, similarity=sim_pearson):
    """Rank the movies *person* has not rated by weighted neighbour ratings.

    :param data1: crawled data, keyed by user
    :param person: key of the user to recommend for
    :param n: maximum number of recommendations to return
    :param similarity: callable ``(data, user_a, user_b) -> number``; it is
        used both to select the similar users and to weight their ratings
    :return: list of ``(predicted_rating, title)`` tuples, best first
    """
    totals = {}
    sim_sum = {}
    # Hand the caller's similarity on; the original always fell back to
    # sim_pearson for the neighbour selection.
    data = top_matches(data1, person, similarity=similarity)
    seen = data[person].get("movies", {})

    for other in data:
        if other == person:  # only compare with the other users
            continue
        sim = similarity(data, person, other)
        print(sim)
        # Ignore users who are not positively correlated.
        if sim <= 0:
            continue
        for item, info in data[other].get("movies", {}).items():
            # Only movies I have not rated.  The original compared the
            # whole movie record with 0, which can never be true; compare
            # the stored rating instead.
            if item in seen and int(seen[item].get("movie_rate", 0)) != 0:
                continue
            # Similarity * score, and the sum of similarities per movie.
            totals[item] = totals.get(item, 0) + int(info["movie_rate"]) * sim
            sim_sum[item] = sim_sum.get(item, 0) + sim

    # Weighted average rating per movie, best first.
    rankings = [(total / sim_sum[item], item) for item, total in totals.items()]
    rankings.sort(reverse=True)
    return rankings[0:n]


if __name__ == '__main__':
    # Print the recommendations.
    for res in get_recommendations(movie_data, my_name, n=5):
        print(res)
使用过程中,逐一运行三个文件即可。
运行结果:
参考: https://blog.csdn.net/XYYxyy55/article/details/80487007