文本内容
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p><p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p><p class="story">...</p>
获取标签
多个标签的情况下只返回第一个标签
soup = BeautifulSoup(open("Index.html"),"lxml")
>>> soup.title
<title>The Dormouse's story</title>
>>> soup.title.name
'title'
>>> soup.title.string
"The Dormouse's story"
获取父标签
>>> soup.title.parent
<head><title>The Dormouse's story</title></head>
>>> soup.title.parent.name
'head'
>>> soup.title.parent.string
"The Dormouse's story"
只返回第一个标签
>>> soup.p
<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>
>>> soup.p.name
'p'
>>> soup.p.string
"The Chapter 1 of the Dormouse's story"
>>> soup.p.parent
<body>
<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p></body>
>>> soup.a
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> soup.a.name
'a'
>>> soup.a.string
'Elsie'
获取标签的属性值
>>> soup.p["class"]
['title']
>>> soup.a["href"]
'http://example.com/elsie'
>>> soup.a["id"]
'link1'
修改和删除标签的属性值
soup.a["href"] = "how can I see you ?"
print(soup.a["href"])
# how can I see you ?
del soup.a["href"]
print(soup.a)
# <a class="sister" id="link1">Elsie</a>
根据属性查找标签
>>> soup.find(href="http://example.com/elsie")
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
>>> soup.find(id="link2")
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
>>> soup.find(class_="sister")
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
find_all() 查找所有标签
>>> soup.find_all("title")
[<title>The Dormouse's story</title>]
>>> soup.find_all("p")
[<p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>, <p class="story">Once upon a time there were three little sisters; and their names were<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;and they lived at the bottom of a well.</p>,
<p class="story">...</p>]
>>> soup.find_all("a")
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
for link in soup.find_all("a"):print(link["id"],"--",link["class"],"--", link["href"])
"""
link1 -- ['sister'] -- http://example.com/elsie
link2 -- ['sister'] -- http://example.com/lacie
link3 -- ['sister'] -- http://example.com/tillie
"""
soup.get_text()
"""
The Dormouse's storyThe Chapter 1 of the Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
三个对象
>>> type(soup)
<class 'bs4.BeautifulSoup'>
>>> type(soup.p)
<class 'bs4.element.Tag'>
>>> type(soup.p.string)
<class 'bs4.element.NavigableString'>
数组获取子标签
>>> soup.head.contents
[<title>The Dormouse's story</title>]
>>> soup.body.contents[3].contents
['Once upon a time there were three little sisters; and their names were\n', <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, ',\n', <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, ' and\n', <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>, ';\nand they lived at the bottom of a well.']
>>> soup.body.contents
"""
['\n', <p class="title"><b>The Chapter 1 of the Dormouse's story</b></p>, '\n', <p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>, '\n', <p class="story">...</p>]
"""
children获取子标签
print(soup.body.children," ",type(soup.body.children))
# <list_iterator object at 0x0000026C0D7EDBD0> <class 'list_iterator'>
p_3 = soup.body.contents[3].contents
print(p_3)
a_1=p_3[1]
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# 标签内的文本也是子节点
for child in a_1.children:
    print(child) # Elsie
for cc in a_1.contents: # a_1.contents==['Elsie']
    print(cc) # Elsie
descendants获取子孙标签
for child in soup.head.children:
    print(child)
# <title>The Dormouse's story</title>
for dd in soup.head.descendants:
    print(dd)
# <title>The Dormouse's story</title>
# The Dormouse's story
CSS选择
1. 标签名查找
print(soup.select("title")) #[<title>The Dormouse's story</title>]
print(soup.select("b")) #[<b>The Dormouse's story</b>]
2. 类名查找
print(soup.select(".sister"))
"""
[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
"""
3. id名查找
print(soup.select("#link1"))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
4. 组合查找
print(soup.select("p #link3"))
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
子标签查找
print(soup.select("p > #link3"))
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
把pascal voc(.xml)格式转为yolo(.txt)格式
单个文件转换
import os

from bs4 import BeautifulSoup


def pasvcal_voc_to_yolo(xml_file, class_mapping, output_path):
    """Convert one Pascal VOC annotation file (.xml) into a YOLO label file (.txt).

    For every ``<object>`` whose ``<name>`` is a key of ``class_mapping``, one
    line ``"<class_index> <x_center> <y_center> <width> <height>"`` is written,
    with the box centre and size divided by the image width/height read from
    the ``<size>`` element.  Objects whose name is not in ``class_mapping``
    are skipped.

    Args:
        xml_file: Path of the Pascal VOC XML annotation to read.
        class_mapping: Dict mapping class name to the class index to emit.
        output_path: Directory in which the ``.txt`` file is created.  The
            file keeps the base name of ``xml_file`` with its extension
            replaced by ``.txt``.

    Returns:
        None.  The result is the file written under ``output_path``.
    """
    with open(xml_file, "r") as f:
        soup = BeautifulSoup(f, "xml")

    width = int(soup.size.width.string)
    height = int(soup.size.height.string)

    yolo_format = []
    for obj in soup.find_all("object"):
        # `obj.name.string` raises: on a bs4 Tag, `.name` is the tag's own
        # name (a plain str), not the <name> child element.
        # `obj.find("name").text` would work as well.
        class_name = obj.find("name").string
        if class_name not in class_mapping:
            continue
        class_index = class_mapping[class_name]

        # Equivalent spelling: int(obj.find('bndbox').find('xmin').text)
        xmin = int(obj.bndbox.xmin.string)
        ymin = int(obj.bndbox.ymin.string)
        xmax = int(obj.bndbox.xmax.string)
        ymax = int(obj.bndbox.ymax.string)

        # Box centre and size, normalised by the image dimensions.
        x_center = (xmin + xmax) / 2 / width
        y_center = (ymin + ymax) / 2 / height
        bbox_width = (xmax - xmin) / width
        bbox_height = (ymax - ymin) / height
        yolo_format.append(
            f"{class_index} {x_center} {y_center} {bbox_width} {bbox_height}"
        )

    # Swap only the real extension.  A plain str.replace(".xml", ".txt") would
    # rewrite every ".xml" inside the name and would leave the name unchanged
    # for inputs without a lowercase ".xml" suffix.
    stem = os.path.splitext(os.path.basename(xml_file))[0]
    output_filename = os.path.join(output_path, stem + ".txt")
    with open(output_filename, "w") as f:
        for label in yolo_format:
            f.write(f"{label}\n")


class_mapping = {"car": 0}
src = "./0.xml"
dist = os.getcwd()
pasvcal_voc_to_yolo(src, class_mapping, dist)