# lxml
#
# Published: 2023-07-19 10:04:17 | Author: hacker_dvd
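'''
XPath quick reference

//  at the start of an expression: select matching nodes anywhere in the
    document (descendants at any depth), not only direct children
/   select among the direct children of the current node
    (a leading / starts from the document root)
.   the current node

[]      predicate that filters the current tag, e.g. //span[2] or //a[@class="bai"]
@       refers to an attribute of the current tag
text()  the text content of the current tag
@*      all attributes of the current tag
/@href  the href attribute of the current tag

'''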
from lxml import etree
import re

# Sample page used by the basic XPath queries below.
html = """
<html>

<head>
    <title>测试</title>
    <style>
        #one {
            color: red;
        }
    </style>
</head>

<body>
    <div id="one">
        hello
    </div>
    <a href="http://www.baidu.com" class="bai">百度</a>
    <a href="http://www.baidu.com" class="tao">百度</a>

    <span>python</span>
    <span>java</span>
    <span>c++</span>

    <p class="one">123</p>
    <p class="two">456</p>
    <p class="three">789</p>
</body>

</html>
"""

# Parse the markup into an element tree; every query below runs against it.
e = etree.HTML(html)

# text() returns the text nodes directly inside each matching <div>.
con = e.xpath("//div/text()")

# An attribute predicate picks the <a class="bai"> link; /@href reads its URL.
href = e.xpath('//a[@class="bai"]/@href')

# A positional predicate is 1-based and counts among siblings with the same tag.
span = e.xpath("//span[2]/text()")  # ['java']

# Two ways to reach the same <p>: by position, then by its class attribute.
p = e.xpath("//p[3]/text()")  # ['789']
p = e.xpath('//p[@class="three"]/text()')  # ['789']

# contains(): substring matching on attribute values
html = '''

<div name="user1">python</div>
<div name="user2">html</div>
<div title="user3">css</div>
<div title="rooter4">js</div>
<div title="hello5">xpath</div>

'''

e = etree.HTML(html)
# Test each attribute on its own and combine the results with `or`.
# Writing contains(@title|@name, "er") looks equivalent, but XPath 1.0 converts
# a node-set argument to a string using only its first node in document order,
# so an element carrying both attributes would have just one of them checked.
con = e.xpath('//div[contains(@title, "er") or contains(@name, "er")]/text()')
print(con)  # ['python', 'html', 'css', 'js']

# Select by position: the fifth <div> among its siblings.
print(e.xpath('//div[5]/@title'))  # ['hello5']
# Select by exact text content.
print(e.xpath('//div[text()="xpath"]/@title'))  # ['hello5']

# Serialising an element back to its HTML source
html = '''
<html>

<head>
    <title>测试</title>
    <style>
        #one {
            color: red;
        }
    </style>
</head>

<body>
    <div id="one">
        <div>123</div>
        <div>456</div>
        <div>789</div>
        <span>abc</span>
        <span>def</span>
        <span>ghi</span>
    </div>
    <div class="two">hello</div>
</body>

</html>
'''

# Goal: dump the markup of <div id="one">, including all of its children.
e = etree.HTML(html)
div = e.xpath('//div[@id="one"]')
# encoding='unicode' makes tostring() return a str directly, so no
# encode-to-bytes / decode round trip is needed.
res = etree.tostring(div[0], encoding='unicode', pretty_print=True)

print(res)
# Compact the markup by removing whitespace *between* tags only. Stripping every
# whitespace character would also delete the space inside a tag and turn
# <div id="one"> into the invalid <divid="one">.
sr = re.sub(r'>\s+<', '><', res).strip()
print(sr)