beautifulsoup4的使用

文本内容

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>

获取标签

多个标签的情况下只返回第一个标签

soup = BeautifulSoup(open("Index.html"),"lxml")
>>> soup.title
<title>The Dormouse's story</title>
>>> soup.title.name
'title'
>>> soup.title.string
"The Dormouse's story"

获取父标签

>>> soup.title.parent
<head><title>The Dormouse's story</title></head>
>>> soup.title.parent.name
'head'
>>> soup.title.parent.string
"The Dormouse's story"

只返回第一个标签

>>> soup.p
<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>
>>> soup.p.name
'p'
>>> soup.p.string
"The Chapter 1 of the Dormouse's story"
>>> soup.p.parent
<body>
<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p></body>

>>> soup.a
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> soup.a.name
'a'
>>> soup.a.string
'Elsie'

获取标签的属性值

>>> soup.p["class"]
['title']
>>> soup.a["href"]
'http://example.com/elsie'
>>> soup.a["id"]
'link1'

修改和删除标签的属性值

soup.a["href"] = "how can I see you ?" 
print(soup.a["href"])
# how can I see you ?
del soup.a["href"]
print(soup.a)
# <a class="sister" id="link1">Elsie</a>

根据属性查找标签

>>> soup.find(href="http://example.com/elsie")
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> soup.find(id="link2")
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
>>> soup.find(class_="sister")
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

find_all() 查找所有标签

>>> soup.find_all("title")
[<title>The Dormouse's story</title>]
>>> soup.find_all("p")
[<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p>, 
<p class="story">...</p>]
>>> soup.find_all("a")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

for link in soup.find_all("a"):
    print(link["id"],"--",link["class"],"--", link["href"])
"""
link1 -- ['sister'] -- http://example.com/elsie
link2 -- ['sister'] -- http://example.com/lacie 
link3 -- ['sister'] -- http://example.com/tillie
"""

soup.get_text()
"""
The Dormouse's storyThe Chapter 1 of the Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""

三个对象

>>> type(soup)
<class 'bs4.BeautifulSoup'>
>>> type(soup.p)
<class 'bs4.element.Tag'>
>>> type(soup.p.string)
<class 'bs4.element.NavigableString'>

数组获取子标签

>>> soup.head.contents
[<title>The Dormouse's story</title>]
>>> soup.body.contents[3].contents
['Once upon a time there were three little sisters; and their names were\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, ',\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
>>> soup.body.contents
"""
['\n', <p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>, '\n', <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, '\n', <p class="story">...</p>]
"""

children获取子标签

print(soup.body.children," ",type(soup.body.children)) 
# <list_iterator object at 0x0000026C0D7EDBD0>   <class 'list_iterator'>
p_3 = soup.body.contents[3].contents
print(p_3)
a_1=p_3[1]
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 标签内的文本也是子节点
for child in a_1.children:
    print(child) # Elsie
for cc in a_1.contents:   # a_1.contents==['Elsie']
    print(cc)    # Elsie

descendants获取子孙标签

for child in soup.head.children:
    print(child)
# <title>The Dormouse's story</title>
for dd in soup.head.descendants:
    print(dd)
# <title>The Dormouse's story</title>
# The Dormouse's story

CSS选择

1. 标签名查找

print(soup.select("title"))  #[<title>The Dormouse's story</title>]
print(soup.select("b"))      #[<b>The Dormouse's story</b>]

2. 类名查找

print(soup.select(".sister")) 
"""
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
"""

3. id名查找

print(soup.select("#link1"))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

4. 组合查找

print(soup.select("p #link3"))
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

子标签查找

print(soup.select("p > #link3"))
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

把pascal voc(.xml)格式转为yolo(.txt)格式

单个文件转换

from bs4 import BeautifulSoup
import os


def pasvcal_voc_to_yolo(xml_file, class_mapping, output_path):
    """Convert one Pascal VOC annotation file (.xml) into a YOLO label file (.txt).

    Every ``<object>`` whose ``<name>`` is a key of ``class_mapping`` becomes
    one output line of the form
    ``<class_index> <x_center> <y_center> <bbox_width> <bbox_height>``,
    where the four box values are divided by the image width/height read
    from the ``<size>`` element. Objects whose name is not in
    ``class_mapping`` are skipped.

    Args:
        xml_file: Path of the Pascal VOC XML annotation to read.
        class_mapping: Dict mapping class name -> class index.
        output_path: Directory in which the label file is written. The file
            takes the XML file's base name with its extension replaced by
            ``.txt``.

    Returns:
        None. The result is written to disk.
    """
    # Read as bytes so the parser can honour the encoding declared in the XML
    # itself rather than depending on the platform's default text encoding.
    with open(xml_file, "rb") as f:
        soup = BeautifulSoup(f, "xml")

    # float() accepts both "640" and "640.0"; for integer input the results
    # below are identical to those produced with int().
    width = float(soup.size.width.string)
    height = float(soup.size.height.string)

    yolo_format = []
    for obj in soup.find_all("object"):
        # obj.name.string does not work here: `.name` on a tag is the tag's
        # own name (a plain str), not its <name> child element.
        # obj.find("name").text would work as well.
        class_name = obj.find("name").string
        if class_name not in class_mapping:
            continue
        class_index = class_mapping[class_name]

        # Equivalent alternative: obj.find('bndbox').find('xmin').text
        xmin = float(obj.bndbox.xmin.string)
        ymin = float(obj.bndbox.ymin.string)
        xmax = float(obj.bndbox.xmax.string)
        ymax = float(obj.bndbox.ymax.string)

        x_center = (xmin + xmax) / 2 / width
        y_center = (ymin + ymax) / 2 / height
        bbox_width = (xmax - xmin) / width
        bbox_height = (ymax - ymin) / height
        yolo_format.append(
            f"{class_index} {x_center} {y_center} {bbox_width} {bbox_height}"
        )

    # splitext swaps only the final extension. A plain str.replace(".xml", ...)
    # would leave a name such as "0.XML" unchanged, so the label file could
    # overwrite the annotation it was generated from.
    stem, _ = os.path.splitext(os.path.basename(xml_file))
    output_filename = os.path.join(output_path, stem + ".txt")
    with open(output_filename, "w", encoding="utf-8") as f:
        f.writelines(f"{label}\n" for label in yolo_format)


# Demo: convert ./0.xml and write 0.txt into the current working directory.
class_mapping = {"car": 0}
src = "./0.xml"
dist = os.getcwd()
pasvcal_voc_to_yolo(src, class_mapping, dist)

本文来自互联网用户投稿，该文观点仅代表作者本人，不代表本站立场。本站仅提供信息存储空间服务，不拥有所有权，不承担相关法律责任。如若转载，请注明出处：http://www.rhkb.cn/news/457796.html

如若内容造成侵权/违法违规/事实不符，请联系长河编程网进行投诉反馈email:809451989@qq.com，一经查实，立即删除！