使用bs4,requests,re库完成对百度文库部分格式文件的爬取
案例中的目标文档地址: https://wenku.baidu.com/view/cbb4af8b783e0912a3162a89.html?from=search
案例中的ppt文档地址: https://wenku.baidu.com/view/f74952272bf90242a8956bec0975f46527d3a703.html?from=search
效果展示:
代码:
import requests
import bs4
import re
import os
import json

# Shared HTTP session used by every request this script makes.
session = requests.session()


# Send a request and fetch the content
def fetch_url(url, timeout=10):
    """Fetch *url* with the shared session and return the body as text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request may block indefinitely on a stalled host.

    Returns:
        The raw response body decoded as GBK.

    Raises:
        UnicodeDecodeError: If the body is not valid GBK.
    """
    response = session.get(url, timeout=timeout)
    return response.content.decode('gbk')


# Get the document id
def get_doc_id(url):
    """Extract the document id from a Wenku ``view/<id>.html`` URL.

    Args:
        url: Document page URL containing a ``view/<id>.html`` segment.

    Returns:
        The text between ``view/`` and the first ``.html``.

    Raises:
        IndexError: If *url* has no ``view/<id>.html`` segment.
    """
    # Non-greedy capture plus an escaped dot: a greedy ``.*`` would run on to
    # the LAST "html" in the URL (e.g. one inside the query string), and a
    # bare ``.`` would accept any character in place of the literal dot.
    return re.findall(r'view/(.*?)\.html', url)[0]


# Get the document type
def parser_type(content):
    """Return the first ``docType`` value found in the page source.

    The value is the single-quoted string that follows ``docType`` and a
    colon, and is itself followed by a comma.

    Args:
        content: Page source text to search.

    Raises:
        IndexError: If no such ``docType`` entry is present.
    """
    doc_type_pattern = re.compile(r"docType.*?\:.*?\'(.*?)\'\,")
    matches = doc_type_pattern.findall(content)
    return matches[0]


# Get the title
def parser_title(content):
    """Return the first ``title`` value found in the page source.

    The value is the single-quoted string that follows ``title`` and a
    colon, and is itself followed by a comma.

    Args:
        content: Page source text to search.

    Raises:
        IndexError: If no such ``title`` entry is present.
    """
    title_pattern = re.compile(r"title.*?\:.*?\'(.*?)\'\,")
    matches = title_pattern.findall(content)
    return matches[0]


# md5
def parser_txt(doc_id):
    """Download the text of a txt document and return it as one string.

    Queries the ``getdocinfo`` API for the document's ``md5sum``,
    ``totalPageNum`` and ``rsign`` fields, uses them to build the text
    retrieval URL, then joins the ``c`` field of every paragraph in the
    returned JSON.

    Args:
        doc_id: Document id as extracted from the page URL.

    Returns:
        The concatenated paragraph text, with literal ``\\r`` / ``\\n``
        sequences turned into real carriage returns / newlines.

    Raises:
        IndexError: If one of the expected fields is missing from the
            ``getdocinfo`` response.
    """
    content_url = 'https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=' + doc_id
    content = fetch_url(content_url)
    # md5 value
    md5 = re.findall('"md5sum":"(.*?)"', content)[0]
    # Page count
    pn = re.findall('"totalPageNum":"(.*?)"', content)[0]
    print(pn)
    # rsign
    rsign = re.findall('"rsign":"(.*?)"', content)[0]
    # NOTE(review): md5 is appended with no "&key=" separator; presumably the
    # API value already starts with "&md5sum=..." -- confirm against a live
    # response.
    content_urls = (
        'https://wkretype.bdimg.com/retype/text/' + doc_id
        + '?rn=' + pn + '&type=txt' + md5 + '&rsign=' + rsign
    )
    pages = json.loads(fetch_url(content_urls))
    # join() instead of repeated += keeps the concatenation linear.
    return ''.join(
        parag['c'].replace('\\r', '\r').replace('\\n', '\n')
        for page in pages
        for parag in page['parags']
    )


def save_file(filename, content):
    """Write *content* to *filename* as UTF-8 text and print the saved path.

    Args:
        filename: Destination path; an existing file is overwritten.
        content: Text to write.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
    print('已保存为: ' + filename)


# Save files of other formats to local disk as images
def parse_other(doc_id):
    """Download every page image of a non-txt document into a local folder.

    Asks the ``getbcsurl`` endpoint for the page list, extracts each
    ``zoom`` URL, and saves the downloaded bytes as ``<index>.jpg`` inside a
    directory named after *doc_id* (created if it does not exist).

    Args:
        doc_id: Document id as extracted from the page URL; also used as the
            output directory name.
    """
    content_url = "http://wenku.baidu.com/browse/getbcsurl?doc_id=" + doc_id + "&pn=1&rn=99999&type=ppt"
    content = fetch_url(content_url)
    url_list = re.findall('{"zoom":"(.*?)","page"', content)
    # The URLs arrive with escaping backslashes; strip them.
    url_list = [item.replace("\\", '') for item in url_list]
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(doc_id, exist_ok=True)
    for index, url in enumerate(url_list):
        # A timeout keeps one stalled image from hanging the whole run.
        image_bytes = session.get(url, timeout=10).content
        path = os.path.join(doc_id, str(index) + '.jpg')
        with open(path, 'wb') as f:
            f.write(image_bytes)
    print("图片保存在" + doc_id + "文件夹")


# https://wkretype.bdimg.com/retype/text/cbb4af8b783e0912a3162a89?
# md5sum=6e7a10b16f3ad8d3b40ecd0dfe8d1b67&
# sign=3283bb4e6e&callback=cb&pn=1&rn=4&type=txt&rsign=p_4-r_0-s_460f0&_=1581649897081
# https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id=cbb4af8b783e0912a3162a89&t=1581650253651&_=1581650253480
# Main function
def main(url=None):
    """Download one Wenku document: text for txt files, images otherwise.

    Args:
        url: Document page URL. Defaults to the sample txt document used in
            this walkthrough.
    """
    if url is None:
        url = 'https://wenku.baidu.com/view/cbb4af8b783e0912a3162a89.html?from=search'  # txt
        # Alternative sample (a ppt document):
        # url = 'https://wenku.baidu.com/view/f74952272bf90242a8956bec0975f46527d3a703.html?from=search'

    # Request the page
    content = fetch_url(url)
    # Get the document id
    doc_id = get_doc_id(url)
    # Get the document type (named doc_type so the builtin `type` is not shadowed)
    doc_type = parser_type(content)
    # Get the title
    title = parser_title(content)

    if doc_type == 'txt':
        result = parser_txt(doc_id)
        # The title is scraped from the page; replace characters that are
        # path separators or illegal in file names so it cannot escape the
        # current directory or make open() fail.
        safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
        save_file(safe_title + '.txt', result)
    else:
        parse_other(doc_id)


if __name__ == "__main__":
    main()