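'''
XPath quick reference

//      match nodes at any depth below the context (a leading // searches the whole document)
/       step to the direct children of the current node (a leading / starts at the document root)
.       the current node
[]      predicate that filters the preceding step, e.g. //p[3] or //a[@class="bai"]
@       selects an attribute of the current element
text()  selects the text nodes of the current element
@*      selects all attributes of the current element
/@href  selects the href attribute of the current element
'''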
from lxml import etree
import re
# Sample page used by the basic XPath queries below.
html = '''
<html>
<head>
<title>测试</title>
<style>
#one {
color: red;
}
</style>
</head>
<body>
<div id="one">
hello
</div>
<a href="http://www.baidu.com" class="bai">百度</a>
<a href="http://www.baidu.com" class="tao">百度</a>
<span>python</span>
<span>java</span>
<span>c++</span>
<p class="one">123</p>
<p class="two">456</p>
<p class="three">789</p>
</body>
</html>
'''
# Parse once; every query below runs against this tree and returns a list.
e = etree.HTML(html)
con, href, span = (
    e.xpath('//div/text()'),             # text nodes of every <div>
    e.xpath('//a[@class="bai"]/@href'),  # href of the link whose class is "bai"
    e.xpath('//span[2]/text()'),         # second <span> among its siblings -> ['java']
)
# The third <p> can be addressed by position or by class; both yield ['789'].
p = e.xpath('//p[3]/text()')
p = e.xpath('//p[@class="three"]/text()')
# contains(): substring matching on attribute values
html = '''
<div name="user1">python</div>
<div name="user2">html</div>
<div title="user3">css</div>
<div title="rooter4">js</div>
<div title="hello5">xpath</div>
'''
e = etree.HTML(html)
# Test each attribute separately. contains() converts a node-set argument to
# the string value of its first node only, so `@title|@name` would silently
# ignore one attribute on an element that carries both.
con = e.xpath('//div[contains(@title, "er") or contains(@name, "er")]/text()')
print(con) # ['python', 'html', 'css', 'js']
print(e.xpath('//div[5]/@title')) # ['hello5'] - fifth <div> among its siblings
print(e.xpath('//div[text()="xpath"]/@title')) # ['hello5'] - matched by exact text
# Serialising an element back to its HTML source
html = '''
<html>
<head>
<title>测试</title>
<style>
#one {
color: red;
}
</style>
</head>
<body>
<div id="one">
<div>123</div>
<div>456</div>
<div>789</div>
<span>abc</span>
<span>def</span>
<span>ghi</span>
</div>
<div class="two">hello</div>
</body>
</html>
'''
# Locate the <div id="one"> container and dump its markup, children included.
# NOTE: this only serialises the element; the text of the nested <div>s would
# be selected with '//div[@id="one"]/div/text()'.
e = etree.HTML(html)
div = e.xpath('//div[@id="one"]')
# encoding='unicode' makes tostring() return str directly, so no bytes
# encode/decode round trip is needed.
res = etree.tostring(div[0], encoding='unicode', pretty_print=True)
print(res)
# Compact the markup by dropping whitespace *between* tags only. Removing every
# whitespace character would also delete the space inside `<div id="one">` and
# emit the invalid tag `<divid="one">`.
sr = re.sub(r'>\s+<', '><', res).strip()
print(sr)
# lxml
# Published 2023-07-19 10:04:17, author: hacker_dvd