这篇文章我们就来讲讲怎样爬取网易云音乐的歌单,并按播放量从高到低排列。下面是爬取结果。
一 核心代码如下
1.需要导入的包有
from urllib import parse
from lxml import etree
from urllib3 import disable_warnings
import requests
2.设置请求头部信息,获取header
# Request headers sent with every page fetch: a desktop Firefox User-Agent,
# a Referer pointing at the site root, and Upgrade-Insecure-Requests.
self.header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
    "Referer": "https://music.163.com/",
    "Upgrade-Insecure-Requests": '1',
}
3.设置请求参数(表单信息)
def set_froms(self):
    """Build the query parameters for the playlist listing request.

    Reads ``self.types``, ``self.years``, ``self.limit`` and ``self.offset``.
    Stores the percent-encoded category in ``self.key`` and the parameter
    mapping in ``self.froms``.

    Returns:
        The ``self.froms`` dict with keys ``cat``, ``order``, ``limit``
        and ``offset``.
    """
    # Percent-encode the (possibly non-ASCII) category name.
    self.key = parse.quote(self.types)
    self.froms = {
        "cat": self.key,
        "order": self.years,
        "limit": self.limit,
        "offset": self.offset,
    }
    return self.froms
4.解析代码,获取有用的数据
def parsing_codes(self):
    """Extract playlist data from the page HTML in ``self.code`` and print it.

    Stores the XPath results in ``self.title``, ``self.author``,
    ``self.listen`` and ``self.link``, prints one entry per playlist, then
    prints a centered footer showing ``self.pages``.
    """
    page = etree.HTML(self.code)
    # Playlist titles
    self.title = page.xpath('//div[@class="u-cover u-cover-1"]/a[@title]/@title')
    # Playlist authors
    self.author = page.xpath('//p/a[@class="nm nm-icn f-thide s-fc3"]/text()')
    # Play counts
    self.listen = page.xpath('//span[@class="nb"]/text()')
    # Playlist links (href values as they appear in the page)
    self.link = page.xpath('//div[@class="u-cover u-cover-1"]/a[@href]/@href')
    # Print each playlist; zip() stops at the shortest of the four lists.
    for title, link, author, listen in zip(self.title, self.link, self.author, self.listen):
        # urljoin avoids the "//" that plain concatenation produces when the
        # href already starts with "/".
        full_link = parse.urljoin("https://music.163.com/", link)
        print("[歌单名称]:{}\n[发布作者]:{}\n[总播放量]:{}\n[歌单链接]:{}\n".format(
            title, author, listen, full_link))
    print('第{}页'.format(self.pages).center(50, '='))
二 完整代码
from urllib import parse
from lxml import etree
from urllib3 import disable_warnings
import requests


class Wangyiyun(object):
    """Scraper for the playlist listing pages of music.163.com.

    Typical use per page: ``multi(page)`` -> ``set_header()`` ->
    ``set_froms()`` -> ``get_code()`` -> ``parsing_codes()``.
    """

    def __init__(self, **kwargs):
        """Store the scrape settings.

        Keyword Args:
            types: Playlist category (music style) to list. Required.
            years: Listing order, "hot" (hottest) or "new" (newest). Required.
            pages: Optional 1-based page number to start from; defaults to 1.

        Raises:
            KeyError: If ``types`` or ``years`` is missing.
        """
        # Playlist category (music style)
        self.types = kwargs['types']
        # Listing order
        self.years = kwargs['years']
        # Current page number. Taken from kwargs rather than a module-level
        # global so the class also works when imported.
        self.pages = kwargs.get('pages', 1)
        # Paging parameters sent with the request
        self.limit = 35
        self.offset = self.limit * self.pages - self.limit
        # Base URL of the playlist listing
        self.url = "https://music.163.com/discover/playlist/?"

    def set_header(self):
        """Build the request headers, store them in ``self.header`` and return them.

        Could be extended to rotate between different User-Agent strings.
        """
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
            "Referer": "https://music.163.com/",
            "Upgrade-Insecure-Requests": '1',
        }
        return self.header

    def set_froms(self):
        """Build the query parameters, store them in ``self.froms`` and return them.

        The percent-encoded category is also kept in ``self.key``.
        """
        self.key = parse.quote(self.types)
        self.froms = {
            "cat": self.key,
            "order": self.years,
            "limit": self.limit,
            "offset": self.offset,
        }
        return self.froms

    def parsing_codes(self):
        """Extract playlist data from ``self.code`` and print it.

        Stores the XPath results in ``self.title``, ``self.author``,
        ``self.listen`` and ``self.link``, prints one entry per playlist,
        then prints a centered footer showing the current page number.
        """
        page = etree.HTML(self.code)
        # Playlist titles
        self.title = page.xpath('//div[@class="u-cover u-cover-1"]/a[@title]/@title')
        # Playlist authors
        self.author = page.xpath('//p/a[@class="nm nm-icn f-thide s-fc3"]/text()')
        # Play counts
        self.listen = page.xpath('//span[@class="nb"]/text()')
        # Playlist links (href values as they appear in the page)
        self.link = page.xpath('//div[@class="u-cover u-cover-1"]/a[@href]/@href')
        # zip() stops at the shortest of the four lists.
        for title, link, author, listen in zip(self.title, self.link, self.author, self.listen):
            # urljoin avoids the "//" that plain concatenation produces when
            # the href already starts with "/".
            full_link = parse.urljoin("https://music.163.com/", link)
            print("[歌单名称]:{}\n[发布作者]:{}\n[总播放量]:{}\n[歌单链接]:{}\n".format(
                title, author, listen, full_link))
        print('第{}页'.format(self.pages).center(50, '='))

    def get_code(self):
        """Fetch the listing page and store its HTML text in ``self.code``.

        Requires ``set_header()`` and ``set_froms()`` to have been called
        first. The final URL is kept in ``self.new_url``.
        """
        # Certificate verification is disabled below, so silence urllib3's
        # insecure-request warning (once is enough).
        disable_warnings()
        # set_froms() stored an already percent-encoded category and
        # urlencode() encodes again; restore the raw value to avoid
        # double-encoding.
        self.froms['cat'] = self.types
        self.new_url = self.url + parse.urlencode(self.froms)
        # The parameters travel in the query string, so no request body is
        # sent. A timeout keeps a stalled connection from hanging forever.
        self.code = requests.get(
            url=self.new_url,
            headers=self.header,
            timeout=10,
            verify=False,  # NOTE(review): TLS verification is off - confirm this is intended
        ).text

    def multi(self, page):
        """Switch to 1-based page number ``page`` before the next request.

        Updates ``self.pages`` (so the printed footer shows the page being
        scraped) and recomputes ``self.offset``.
        """
        self.pages = page
        self.offset = self.limit * page - self.limit


if __name__ == '__main__':
    # Playlist category (music style)
    types = "说唱"
    # Listing order: hottest = hot, newest = new
    years = "hot"
    # Number of pages to scrape
    pages = 3
    # Example: scrape pages 1..pages
    music = Wangyiyun(
        types=types,
        years=years,
    )
    for page in range(1, pages + 1):  # pages are 1-based; there is no page 0
        music.multi(page)      # set the current page and refresh the offset
        music.set_header()     # build the request headers
        music.set_froms()      # build the query parameters
        music.get_code()       # download the HTML of the current page
        music.parsing_codes()  # parse the HTML and print the playlist data