文章目录
- 一、前期准备
- 二、代码实施
一、前期准备
- 观察页面确定爬取步骤
本节打算使用前节学习的 selenium 实现自动爬取
观察页面确定搜索框和搜索按钮,通过键入“蔡徐坤 篮球”,跳转到我们需要爬取的页面
from bs4 import BeautifulSoup
from selenium import webdriver # 引入浏览器驱动
from selenium.webdriver.common.by import By
# 这两个用于实现显式等待
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as Ec
# Obtain the browser object (starts a Chrome session) and open the home page.
browser = webdriver.Chrome()
browser.get('https://www.bilibili.com/')
# Upper bound for every explicit wait below (WebDriverWait takes seconds).
WAIT = WebDriverWait(browser, 10)
# By.XPATH and By.CSS_SELECTOR are interchangeable here; the CSS equivalent
# of the search-box locator would be:
#   (By.CSS_SELECTOR, '#banner_link > div > div > form > input')
# Named search_input rather than input so the builtin input() is not shadowed.
search_input = WAIT.until(Ec.presence_of_element_located((By.XPATH, '//*[@id="banner_link"]/div/div/form/input')))
submit = WAIT.until(Ec.element_to_be_clickable((By.XPATH, '//*[@id="banner_link"]/div/div/form/button')))
# Type the query first, then press the search button.
search_input.send_keys('蔡徐坤 篮球')
submit.click()
如果这里突然弹出一个登录框,把搜索按钮挡住了,
可以手动刷新一下页面,也可以在代码中加一个点击“首页”的操作来刷新页面,这样同样可以完成接下来的搜索操作
import requests
from bs4 import BeautifulSoup
from selenium import webdriver # 引入浏览器驱动
from selenium.webdriver.common.by import By
# 这两个用于实现显式等待
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as Ec# 拿到浏览器对象
# Obtain the browser object (starts a Chrome session) and open the home page.
browser = webdriver.Chrome()
browser.get('https://www.bilibili.com/')
# Upper bound for every explicit wait below (WebDriverWait takes seconds).
WAIT = WebDriverWait(browser, 10)
# If the login pop-up covers the search button, clicking the "首页" (home)
# entry refreshes the page so the search controls can be used again.
index = WAIT.until(Ec.element_to_be_clickable((By.CSS_SELECTOR, '#primary_menu > ul > li.home > a > div')))
index.click()
# Named search_input rather than input so the builtin input() is not shadowed.
search_input = WAIT.until(Ec.presence_of_element_located((By.XPATH, '//*[@id="banner_link"]/div/div/form/input')))
submit = WAIT.until(Ec.element_to_be_clickable((By.XPATH, '//*[@id="banner_link"]/div/div/form/button')))
# Type the query first, then press the search button.
search_input.send_keys('蔡徐坤 篮球')
submit.click()
- 确定爬取内容
标题,视频链接,观看次数,弹幕数量,上传时间,up 主
二、代码实施
- 需要引用的模块
from bs4 import BeautifulSoup
# A page may fail to load in time, so the timeout exception is imported to be caught.
from selenium.common.exceptions import TimeoutException
from selenium import webdriver  # browser driver
from selenium.webdriver.common.by import By  # locator strategies (By.XPATH, By.CSS_SELECTOR)
# These two provide explicit waits.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as Ec
# Reading and writing Excel workbooks.
import xlwt
import xlrd
from xlutils.copy import copy
- 拿到总共有多少视频页数,同时获取第一页的网页内容
def search(url, name, keyword='蔡徐坤 篮球'):
    """Run a search from the home page, save the first result page, return the page count.

    Args:
        url: Address loaded into the shared ``browser``.
        name: Path of the Excel workbook; forwarded to ``get_source``.
        keyword: Text typed into the search box. Defaults to the query the
            tutorial originally hard-coded.

    Returns:
        The text of the "last page" pagination button converted to ``int``.

    A ``TimeoutException`` raised by any explicit wait restarts the whole
    sequence with the same arguments.
    """
    try:
        print('开始访问b 站....')
        browser.get(url)
        # If the login pop-up covers the search button, clicking the home
        # entry refreshes the page so the search controls become usable.
        index = WAIT.until(Ec.element_to_be_clickable((By.CSS_SELECTOR, '#primary_menu > ul > li.home > a > div')))
        index.click()
        # Equivalent CSS locator: '#banner_link > div > div > form > input'.
        # Named search_input so the builtin input() is not shadowed.
        search_input = WAIT.until(Ec.presence_of_element_located((By.XPATH, '//*[@id="banner_link"]/div/div/form/input')))
        submit = WAIT.until(Ec.element_to_be_clickable((By.XPATH, '//*[@id="banner_link"]/div/div/form/button')))
        search_input.send_keys(keyword)
        submit.click()
        # The results open in a new window. Switch to the last handle: on the
        # first attempt that is the second window, and on a retry it is the
        # window just opened (assumes handles are listed in creation order,
        # as the original all_h[1] already did).
        print('跳转到新窗口')
        browser.switch_to.window(browser.window_handles[-1])
        # Wait for the pagination control before parsing, so the page source
        # is read only once the result list is present and a timeout here
        # cannot leave first-page rows written twice after the retry.
        total = WAIT.until(Ec.presence_of_element_located((By.CSS_SELECTOR, '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.last > button')))
        total_pages = int(total.text)
        get_source(browser.page_source, name)
        return total_pages
    # A slow connection can make a wait time out; retry with the same
    # arguments (the original retried with none, which raised TypeError).
    except TimeoutException:
        return search(url, name, keyword)
- 获取下一页内容
def next_page(page, name):
    """Advance the result list by one page and hand its source to ``get_source``.

    Args:
        page: Page number expected to be active after clicking "next".
        name: Path of the Excel workbook; forwarded to ``get_source``.

    On ``TimeoutException`` the browser is refreshed and the call is repeated
    with the same arguments.
    """
    next_locator = (
        By.CSS_SELECTOR,
        '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button',
    )
    active_locator = (
        By.CSS_SELECTOR,
        '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.active > button',
    )
    try:
        print(f'获取第{page}页数据:')
        WAIT.until(Ec.element_to_be_clickable(next_locator)).click()
        # Block until the highlighted pagination button shows the requested
        # page number; the expected text is passed as a string.
        WAIT.until(Ec.text_to_be_present_in_element(active_locator, str(page)))
        get_source(browser.page_source, name)
    # On a slow connection a page may not finish loading: refresh and recurse.
    except TimeoutException:
        browser.refresh()
        return next_page(page, name)
这里需要注意:要判断传入的页码和页面实际所在的页码是否匹配
Ec.text_to_be_present_in_element 比较的是文本,所以要确保传入的 page 是 str 类型,因此用 str(page) 强制转换一下
Ec.text_to_be_present_in_element 用来检测某段文本是否存在于某个元素中
4. 完整代码
from bs4 import BeautifulSoup
# 防止由于页面刷新不出来,所以引入超时模块捕获异常
from selenium.common.exceptions import TimeoutException
from selenium import webdriver # 引入浏览器驱动
from selenium.webdriver.common.by import By # 借助 By 模块进行使用
# 这两个用于实现显式等待
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as Ec
# 使用Excel进行读写操作
import xlwt
import xlrd
from xlutils.copy import copy

# Shared Chrome session used by every function below.
browser = webdriver.Chrome()
# Upper bound for every explicit wait (WebDriverWait takes seconds).
WAIT = WebDriverWait(browser, 10)
# Run the search and collect the first page of results.
def search(url, name, keyword='蔡徐坤 篮球'):
    """Run a search from the home page, save the first result page, return the page count.

    Args:
        url: Address loaded into the shared ``browser``.
        name: Path of the Excel workbook; forwarded to ``get_source``.
        keyword: Text typed into the search box. Defaults to the query the
            tutorial originally hard-coded.

    Returns:
        The text of the "last page" pagination button converted to ``int``.

    A ``TimeoutException`` raised by any explicit wait restarts the whole
    sequence with the same arguments.
    """
    try:
        print('开始访问b 站....')
        browser.get(url)
        # If the login pop-up covers the search button, clicking the home
        # entry refreshes the page so the search controls become usable.
        index = WAIT.until(Ec.element_to_be_clickable((By.CSS_SELECTOR, '#primary_menu > ul > li.home > a > div')))
        index.click()
        # Equivalent CSS locator: '#banner_link > div > div > form > input'.
        # Named search_input so the builtin input() is not shadowed.
        search_input = WAIT.until(Ec.presence_of_element_located((By.XPATH, '//*[@id="banner_link"]/div/div/form/input')))
        submit = WAIT.until(Ec.element_to_be_clickable((By.XPATH, '//*[@id="banner_link"]/div/div/form/button')))
        search_input.send_keys(keyword)
        submit.click()
        # The results open in a new window. Switch to the last handle: on the
        # first attempt that is the second window, and on a retry it is the
        # window just opened (assumes handles are listed in creation order,
        # as the original all_h[1] already did).
        print('跳转到新窗口')
        browser.switch_to.window(browser.window_handles[-1])
        # Wait for the pagination control before parsing, so the page source
        # is read only once the result list is present and a timeout here
        # cannot leave first-page rows written twice after the retry.
        total = WAIT.until(Ec.presence_of_element_located((By.CSS_SELECTOR, '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.last > button')))
        total_pages = int(total.text)
        get_source(browser.page_source, name)
        return total_pages
    # A slow connection can make a wait time out; retry with the same
    # arguments (the original retried with none, which raised TypeError).
    except TimeoutException:
        return search(url, name, keyword)


def next_page(page, name):
    """Advance the result list by one page and hand its source to ``get_source``.

    Args:
        page: Page number expected to be active after clicking "next".
        name: Path of the Excel workbook; forwarded to ``get_source``.

    On ``TimeoutException`` the browser is refreshed and the call is repeated
    with the same arguments.
    """
    next_locator = (
        By.CSS_SELECTOR,
        '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button',
    )
    active_locator = (
        By.CSS_SELECTOR,
        '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.active > button',
    )
    try:
        print(f'获取第{page}页数据:')
        WAIT.until(Ec.element_to_be_clickable(next_locator)).click()
        # Block until the highlighted pagination button shows the requested
        # page number; the expected text is passed as a string.
        WAIT.until(Ec.text_to_be_present_in_element(active_locator, str(page)))
        get_source(browser.page_source, name)
    # On a slow connection a page may not finish loading: refresh and recurse.
    except TimeoutException:
        browser.refresh()
        return next_page(page, name)


# Parse the scraped page source.
def get_source(html, name):
    """Extract one record per ``<li>`` of the video list and append them to the workbook.

    Args:
        html: Page source of a search-result page.
        name: Path of the Excel workbook; forwarded to ``write_excel``.
    """
    soup = BeautifulSoup(html, 'lxml')
    video_list = soup.find(class_='video-list clearfix')
    boards = []
    for item in video_list.find_all('li'):
        link = item.find('a')
        # Key order is kept as-is: write_excel prints each dict.
        boards.append({
            'title': link.get('title'),
            'href': link.get('href'),
            'watch-num': item.find(class_='so-icon watch-num').text,
            'icon-time': item.find(class_='so-icon time').text,
            'icon-hide': item.find(class_='so-icon hide').text,
            'up-name': item.find(class_='up-name').string,
        })
    write_excel(boards, name)


def write_excel(boards, name):
    """Append ``boards`` below the rows already present in the first sheet of ``name``.

    Args:
        boards: Sequence of dicts carrying the keys listed in ``columns``.
        name: Path of an existing ``.xls`` workbook; it is re-saved in place.
    """
    columns = ('title', 'href', 'watch-num', 'icon-time', 'icon-hide', 'up-name')
    source_book = xlrd.open_workbook(name)
    first_sheet = source_book.sheet_by_name(source_book.sheet_names()[0])
    existing_rows = first_sheet.nrows
    # xlrd workbooks are read-only here; xlutils' copy() yields a writable one.
    writable_book = copy(source_book)
    target_sheet = writable_book.get_sheet(0)
    for row_index, board in enumerate(boards, start=existing_rows):
        print(board)
        for col_index, key in enumerate(columns):
            target_sheet.write(row_index, col_index, board[key])
    writable_book.save(name)


# Initialise the Excel workbook.
def excel_write(str):
    """Create a new workbook at path ``str`` containing only the header row.

    Args:
        str: Destination path of the ``.xls`` file. The parameter name shadows
            the builtin inside this function; it is kept so existing callers
            are unaffected.
    """
    # Set up the Excel sheet the results will be stored in.
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('蔡徐坤打篮球')
    into = ['名称', '网页链接', '观看次数', '上传时间', '弹幕数量', 'UP 主']
    for col, header in enumerate(into):
        sheet.write(0, col, header)
    book.save(str)


def main(name):
    """Scrape every result page into the workbook ``name``, then shut the browser down.

    Args:
        name: Path of the Excel workbook, already initialised by ``excel_write``.
    """
    url = 'https://www.bilibili.com/'
    try:
        total = search(url, name)
        print("一共:" + str(total) + "页")
        # Page 1 was saved by search(); continue from page 2 up to the last page.
        for i in range(2, int(total) + 1):
            next_page(i, name)
    finally:
        # quit() rather than close(): the search opens a second window, and
        # close() would only close the current one, leaving the first window
        # and the driver session behind.
        browser.quit()


if __name__ == '__main__':
    Excel_name = '蔡徐坤打篮球.xls'
    excel_write(Excel_name)
    main(Excel_name)