import requests
from parsel import Selector
import json
import os
import subprocess
import time


class GetBv():
    """Download the parts of one Bilibili video as merged ``.mp4`` files.

    Constructing an instance fetches the video page once to read the part
    names, the video title and the number of parts, and creates a folder
    named after the title in the current working directory.

    Attributes:
        bvid: Video id string, e.g. ``'BV1hE411N7q2'``.
        page: Number of the part ("P") that ``get_mp4`` will download.
        pg_dic: Mapping of part number to part name.
        file_name: Name of the output folder (the title, made path-safe).
        all_page: Total number of parts reported by the page.
        headers: HTTP headers sent with the media download requests.
    """

    # Bytes requested per streamed chunk.
    _CHUNK_SIZE = 1024
    _BYTES_PER_MB = 1024 * 1024
    # Number of block characters in a complete progress bar.
    _BAR_WIDTH = 30
    # Characters that cannot be part of a single path component on this OS.
    # Kept platform-specific so names that already worked stay unchanged.
    _FORBIDDEN_CHARS = '\\/:*?"<>|' if os.name == 'nt' else '/'

    def __init__(self, bvid, page=1):
        """Store the ids and load the part list for *bvid*.

        Args:
            bvid: Video id string, e.g. ``'BV1hE411N7q2'``.
            page: Part number to download; defaults to 1.
        """
        self.bvid = bvid
        self.page = page
        # Request headers used for the media downloads.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
            'Referer': 'https://www.bilibili.com',
        }
        # Part-name table, output folder name, total number of parts.
        self.pg_dic, self.file_name, self.all_page = self.get_name_pages()

    @classmethod
    def _safe_name(cls, name):
        """Return *name* with characters invalid in a file name replaced by '_'."""
        return ''.join('_' if ch in cls._FORBIDDEN_CHARS else ch for ch in name)

    def get_video_audio_urls(self):
        """Return ``(video_url, audio_url)`` for the current part.

        The URLs are the ``baseUrl`` of the first video entry and of the
        first audio entry in the ``data.dash`` object embedded in the page.

        Raises:
            IndexError: If the expected ``<script>`` element is not found.
            ValueError: If the script text holds no parsable JSON object.
            KeyError: If the JSON lacks the expected keys.
        """
        url = f'https://www.bilibili.com/video/{self.bvid}?p={self.page}'
        response = requests.get(url)
        selector = Selector(text=response.text)
        script = selector.xpath('/html/head/script[5]/text()').getall()[0]
        # json.loads needs the text to start at the opening brace, so skip
        # whatever precedes it. Parse once and reuse for both streams.
        dash = json.loads(script[script.index('{'):])['data']['dash']
        video_url = dash['video'][0]['baseUrl']
        audio_url = dash['audio'][0]['baseUrl']
        return video_url, audio_url

    def get_name_pages(self):
        """Read the part table from the video page and create the output folder.

        Returns:
            A tuple ``(page_name_dic, file_name, all_page)``: the mapping of
            part number to part name, the folder name derived from the video
            title, and the total number of parts.

        Raises:
            IndexError: If the expected ``<script>`` element is not found.
            ValueError: If the script text holds no parsable JSON object.
            KeyError: If the JSON lacks the expected keys.
        """
        url = f'https://www.bilibili.com/video/{self.bvid}'
        response = requests.get(url)
        selector = Selector(text=response.text)
        script = selector.xpath('/html/head/script[6]/text()').getall()[0]
        # Keep only the span from the first '{' up to the '}' of the first
        # '};' so that json.loads receives a bare object.
        video_message = json.loads(script[script.index('{'):script.index('};') + 1])
        video_data = video_message['videoData']

        all_page = video_data['videos']  # total number of parts
        page_name_dic = {part['page']: part['part'] for part in video_data['pages']}

        # The title becomes a folder name, so strip characters the file
        # system would reject before creating it.
        file_name = self._safe_name(video_data['title'])
        os.makedirs(file_name, exist_ok=True)
        return page_name_dic, file_name, all_page

    def _download(self, url, dest, label):
        """Stream *url* into the file *dest* while printing a progress bar.

        Args:
            url: Media URL to fetch with ``self.headers``.
            dest: Path of the file to write.
            label: Name shown in the "file size" line.
        """
        start = time.time()
        size = 0
        # stream=True defers the body download until iter_content is
        # consumed; the context manager releases the connection afterwards.
        with requests.get(url, headers=self.headers, stream=True) as response:
            # Do not save an HTTP error page as if it were media data.
            response.raise_for_status()
            content_size = int(response.headers['content-length'])
            print(f"{label} ,文件大小:{content_size / self._BYTES_PER_MB:.2f}MB")
            with open(dest, 'wb') as f:
                for data in response.iter_content(chunk_size=self._CHUNK_SIZE):
                    f.write(data)
                    size += len(data)
                    bar = int(size / content_size * self._BAR_WIDTH) * "█"
                    # '\r' redraws the same console line; end="" avoids a newline.
                    print(
                        f"\r已经下载:{bar}"
                        f" 【{size / self._BYTES_PER_MB:.2f}MB】"
                        f" 【{size / content_size:.2%}】",
                        end="",
                    )
        print(" 总耗时:%.2f秒" % (time.time() - start))

    def get_mp4(self):
        """Download the current part and merge it into ``<folder>/<part>.mp4``.

        Does nothing except print a notice when the target file already
        exists. Otherwise the video and audio streams are saved as ``.m4s``
        files, merged with the external ``ffmpeg`` program, and the ``.m4s``
        files are removed once the merge has succeeded.
        """
        bv_name = self.pg_dic[self.page]  # name of the part
        path = os.path.join(self.file_name, self._safe_name(bv_name))
        output_path = f'{path}.mp4'
        if os.path.exists(output_path):
            print(f'已存在第{self.page}P视频——{bv_name}.mp4')
            return

        video_url, audio_url = self.get_video_audio_urls()
        video_path = f'{path}_video.m4s'
        audio_path = f'{path}_audio.m4s'

        print(f"第{self.page}P--{bv_name}")
        self._download(video_url, video_path, f'{bv_name}_video.m4s')
        self._download(audio_url, audio_path, f'{bv_name}_audio.m4s')

        # An argument list (no shell) keeps spaces, quotes and other special
        # characters in the names from being interpreted by a shell.
        command = [
            'ffmpeg',
            '-i', video_path,
            '-i', audio_path,
            '-codec', 'copy',
            output_path,
        ]
        try:
            merged = subprocess.run(command).returncode == 0
        except FileNotFoundError:
            # The ffmpeg executable is not available on PATH.
            merged = False
        if not merged:
            # Keep the downloaded streams so the merge can be retried.
            print(f'合并失败,已保留{bv_name}的.m4s文件')
            return

        # The separate streams are no longer needed after a successful merge.
        os.remove(audio_path)
        os.remove(video_path)
        print(f'合并为{bv_name}.mp4')


if __name__ == '__main__':
    bvid = 'BV1C5411H7dz'  # id of the video to download
    downloader = GetBv(bvid)
    # Parts are numbered from 1, hence the +1 on the upper bound.
    for page in range(1, downloader.all_page + 1):
        downloader.page = page
        downloader.get_mp4()
    print(f'爬取完成,请在"{os.path.join(os.getcwd(), downloader.file_name)}"路径下查看')
# 我打包成了 exe 程序,有需要的朋友请到 GitHub 下载:github–Blibli-video