python学习第九节:爬虫实战-抓取地址库
话不多说,直接上代码;下面的代码是从统计局抓取地址库并保存为json格式和excel格式。大家拿到代码直接运行即可。
#coding=utf-8
#加入上面这行代码表示可以在下面代码中包含中文
import json  # serialize the address tree to real JSON
import re  # 正则表达式,进行文字匹配
import sqlite3  # 进行SQLite数据库操作
import time  # 引入time模块
import urllib.error  # 获取网页数据时的错误处理
import urllib.parse  # urllib.parse模块是一个用于解析URL的工具包,支持各种对URL的操作,包括拆分、拼接、编码、解码等。
import urllib.request  # 制定URL,获取网页数据

import bs4  # 网页解析,获取数据
import xlwt  # 进行excel操作
from bs4 import BeautifulSoup  # 解析网页
#定义全局变量
durl="https://www.stats.gov.cn/sj/tjbz/tjyqhdmhcxhfdm/2023/"
file_path="E://地址库"+str(time.time())+".json"
excel_path="E://地址库"+str(time.time())+".xls"# 爬取网页
def getHtml():
    """Fetch the index page of the division-code site and return its HTML.

    Returns:
        The decoded page body, or an empty string when the request fails.
        Errors are printed rather than raised so the caller can decide
        how to proceed.
    """
    response = None
    html = ""
    try:
        # Request the index page of the 2023 dataset.
        response = urllib.request.urlopen(durl + "index.html")
        # Read and decode the response body.
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):  # HTTP status code, when the server answered
            print(e.code)
        if hasattr(e, "reason"):  # underlying failure reason
            print(e.reason)
    # BUG FIX: when urlopen raises, `response` stays None and the original
    # `response.status` access raised AttributeError. Guard against None first.
    if response is None or response.status != 200:
        print("---访问失败!")
    # Print the fetched page content (original diagnostic behavior).
    print(html)
    return html
def paseHtml(html):
    """Parse the index page and build the full division tree.

    Args:
        html: HTML of the index page (as returned by getHtml).

    Returns:
        A list of province nodes, each a dict of the form
        {"code": ..., "name": ..., "child": [...]} where "child"
        recursively holds the lower administrative levels.
    """
    print("*" * 20, "开始解析网页", "*" * 20)
    bs = BeautifulSoup(html, "html.parser")
    # On the index page every province is an <a> directly inside a <td>.
    provinces = bs.select("td > a")
    data = []
    for provinceTag in provinces:
        name = provinceTag.text
        url = provinceTag.attrs["href"]
        # BUG FIX: the original pattern ".html" left the dot unescaped, so it
        # matched any character followed by "html". Escape and anchor it, then
        # pad the province prefix to the full 12-digit division code.
        code = re.sub(r"\.html$", "", url) + "0000000000"
        print("省级名称:", name, " url=", url, " code=", code)
        child = getChild(durl, url)
        data.append({"code": code, "name": name, "child": child})
    return data
def getChild(purl, url):
    """Recursively fetch one level of the administrative-division tree.

    Args:
        purl: base URL of the directory containing the page (ends with "/").
        url:  relative href of the page to fetch.

    Returns:
        A list of {"code", "name", "child"} dicts; empty on any fetch failure
        (best effort — a page that cannot be fetched simply yields no children).
    """
    child = []
    html = ""
    qurl = purl + url
    print("开始获取", qurl, "的信息")
    try:
        response = urllib.request.urlopen(qurl)
        html = response.read().decode("utf-8")
    except urllib.error.URLError:
        return child
    if response.status != 200:
        print("---访问" + url + "失败!")
        return child
    bs = BeautifulSoup(html, "html.parser")
    # One selector covers the city / county / town / village row classes.
    childtrs = bs.select(".citytr,.countytr,.towntr,.villagetr")
    for childtr in childtrs:
        url = ""
        code = ""
        name = ""
        tds = childtr.find_all("td")  # hoisted: the original re-queried per cell
        if len(tds[0].find_all("a")) == 0:
            # Row without links, e.g.
            # <tr class="countytr"><td>130101000000</td><td>市辖区</td></tr>
            # BUG FIX: the original tested childtr.select(".villagetr"), but
            # select() only matches descendants, never the row itself, so the
            # village branch was unreachable and village names were read from
            # the wrong column. Inspect the row's own class list instead.
            if "villagetr" not in childtr.get("class", []):
                # Not the last (village) level: columns are code, name.
                code = tds[0].text
                name = tds[1].text
            else:
                # Village level: columns are code, urban-rural class, name.
                code = tds[0].text
                name = tds[2].text
        else:
            if "href" in tds[0].a.attrs:
                url = tds[0].a.attrs["href"]
            code = tds[0].a.text
            name = tds[1].a.text
        child2 = []
        if url != "":
            # Child hrefs are relative to the directory of the current page;
            # trimming back to the final "/" is idempotent across iterations.
            qurl = qurl[0:qurl.rindex("/") + 1]
            child2 = getChild(qurl, url)
        child.append({"code": code, "name": name, "child": child2})
    return child
def save_file(data, path=None):
    """Serialize the division tree to a JSON file.

    Args:
        data: nested list/dict structure produced by paseHtml.
        path: optional output path; defaults to the module-level file_path
              (backward-compatible — existing callers pass only data).
    """
    print("*" * 20, "开始写入文件", "*" * 20)
    target = file_path if path is None else path
    # BUG FIX: the original wrote str(data) — a Python repr with single
    # quotes, which is not valid JSON despite the .json extension. Use
    # json.dump with ensure_ascii=False so Chinese names stay readable,
    # and a context manager so the file is closed even on error.
    with open(target, "w", encoding="utf-8") as fo:
        json.dump(data, fo, ensure_ascii=False)
    print("*" * 20, "写入文件完成", "*" * 20)
def _write_row(sheet, row, code, name, level, parent_code):
    # Write one flattened division record at (1-based) data row `row`.
    sheet.write(row, 0, code)
    sheet.write(row, 1, name)
    sheet.write(row, 2, level)
    sheet.write(row, 3, parent_code)


def save_excel(data):
    """Flatten the 4-level division tree into an .xls workbook.

    Columns: code, name, level (1=province .. 4=town), parent code.
    The workbook is saved to the module-level excel_path.
    """
    print("*" * 20, "开始写入excel", "*" * 20)
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    # cell_overwrite_ok=True lets a cell be rewritten without raising.
    sheet = book.add_sheet("2024地址库", cell_overwrite_ok=True)
    # Header row.
    col = ("地区代码", "地区名称", "等级", "父级代码")
    for c in range(len(col)):
        sheet.write(0, c, col[c])
    i = 0  # number of data rows written so far (header occupies row 0)
    for p in data:  # province, level 1, root parent "0"
        pcode = p["code"]
        _write_row(sheet, i + 1, pcode, p["name"], 1, "0")
        i += 1
        for city in p["child"]:  # city, level 2
            ccode = city["code"]
            _write_row(sheet, i + 1, ccode, city["name"], 2, pcode)
            i += 1
            for dist in city["child"]:  # county/district, level 3
                dcode = dist["code"]
                _write_row(sheet, i + 1, dcode, dist["name"], 3, ccode)
                i += 1
                # BUG FIX: the original re-checked len(cchild) here (always
                # non-empty inside its own loop) where dchild was intended;
                # iterating the child list directly handles the empty case.
                for town in dist["child"]:  # town, level 4
                    _write_row(sheet, i + 1, town["code"], town["name"], 4, dcode)
                    i += 1
    book.save(excel_path)
    print("*" * 20, "写入excel完成", "*" * 20)


def run():
    """Full pipeline: fetch -> parse -> save JSON -> save Excel."""
    html = getHtml()   # fetch the index page
    data = paseHtml(html)  # parse into a nested division tree
    save_file(data)    # persist as JSON
    save_excel(data)   # persist as .xls


if __name__ == '__main__':
    run()
    print("爬虫结束")