xpath、BeautifulSoup、pyquery、jsonpath 解析 HTML 与 JSON 字符串

发布时间: 2023-07-25 20:51:56  作者: 看一百次夜空里的深蓝

XPath与jsonpath

 1 import json
 2 from jsonpath import jsonpath
 3 
 4 def json_test():
 5     str1 = '{"name":"埃里克森"}'
 6     # 将字符串转为Python dict对象
 7     js_obj = json.loads(str1)
 8     print( type(js_obj) )
 9 
10     # 将dict转成,json字符串
11     json_str = json.dumps(js_obj, ensure_ascii=False)
12     print( type(json_str) )
13 
14     # 将json对象保存到文件
15     json.dump(js_obj, open('test.txt', 'w', encoding='utf-8'), ensure_ascii=False )
16 
17     # 从文件中读取json对象
18     jso = json.load(open('test.txt', 'r', encoding='utf-8'))
19     print( type(jso) )
20 
def jsonpath_test():
    """Evaluate a set of sample JsonPath queries on a small bookstore document.

    JsonPath is to JSON what XPath is to XML. Operator comparison:

        XPath | JsonPath         | Meaning
        ------+------------------+------------------------------------------
        /     | $                | document root
        .     | @                | current element
        /     | . or []          | child element
        ..    | N/A              | parent element (not supported by JsonPath)
        //    | ..               | recursive descent over all descendants
        *     | *                | wildcard, matches any child element
        @     | N/A              | attribute access (not supported by JsonPath)
        []    | []               | subscript; XPath indexes from 1, JsonPath from 0
        |     | [,]              | union; combines several results into one
                                   array, by index or by name
        N/A   | [start:end:step] | array slice
        []    | ?()              | filter expression
        N/A   | ()               | script expression, evaluated by the
                                   underlying script engine (not in XPath)
        ()    | N/A              | grouping (not supported by JsonPath)

    Every query below is evaluated, but only the result of the last one
    is printed.
    """
    raw_document = '''
        {
            "store": {
                "book": [{
                        "category": "reference",
                        "author": "Nigel Rees",
                        "title": "Sayings of the Century",
                        "price": 8.95
                    }, {
                        "category": "fiction",
                        "author": "Evelyn Waugh",
                        "title": "Sword of Honour",
                        "price": 12.99
                    }, {
                        "category": "fiction",
                        "author": "Herman Melville",
                        "title": "Moby Dick",
                        "isbn": "0-553-21311-3",
                        "price": 8.99
                    }, {
                        "category": "fiction",
                        "author": "J. R. R. Tolkien",
                        "title": "The Lord of the Rings",
                        "isbn": "0-395-19395-8",
                        "price": 22.99
                    }
                ],
                "bicycle": {
                    "color": "red",
                    "price": 19.95
                }
            }
        }
    '''
    document = json.loads(raw_document)

    # Queries in evaluation order; each comment gives the XPath counterpart.
    expressions = (
        '$.store.book[*].author',   # author of every book      (/store/book/author)
        '$..author',                # every author node         (//author)
        '$.store.*',                # everything under store: the book array
                                    # and the bicycle node      (/store/*)
        '$.store..price',           # every price under store   (/store//price)
        '$..book[2]',               # the third book            (//book[3])
        '$..book[(@.length-1)]',    # the last book             (//book[last()])
        '$..book[-1:]',             # the last book, via a slice
        '$..book[0,1]',             # the first two books, via a union
        '$..book[:2]',              # the first two books       (//book[position()<3])
        '$..book[?(@.price<10)]',   # books with price < 10     (//book[price<10])
        '$..*',                     # every descendant node     (//*)
    )
    results = [jsonpath(document, expression) for expression in expressions]
    print(results[-1])
84 
# Run the JsonPath demo only when this listing is executed as a script.
if __name__ == "__main__":
    jsonpath_test()
BeautifulSoup
 1 from bs4 import BeautifulSoup, Comment
 2 
 3 # 具体更详细的用法找官网
 4 str1 = '''
 5 <title id='title1'>世界和平</title>
 6 <div class='info' float='left'>Welcome to My Space</div>
 7 <div class='info' float='right'>
 8     <span>Good good study</span>
 9     <a href='www.baidu.com'>baidu</a>
10     <strong><!--这是一段注释-->test</strong>
11 </div>
12 '''
13 
14 soup = BeautifulSoup(str1, 'lxml')
15 print(soup.title)
16 print(soup.div)
17 
18 print(soup.div.attrs)
19 print(soup.div.get('class'))
20 print(soup.div['float'])
21 print(soup.a['href'])
22 
23 print(soup.div.string)
24 print(soup.div.text)
25 
26 print(type(soup.strong.string))
27 if type(soup.strong.string) == Comment:
28     print(soup.strong.string)
29     print(soup.strong.pretify())
30 else:
31     print(soup.strong.text)
32 
33 print('===========find_all()===========')
34 print(soup.find_all('title'))
35 print(soup.find_all(id='title'))
36 print(soup.find_all(class_='info'))
37 print(soup.find_all(attrs={'float':'left'}))
38 
39 
40 print('===========css()===========')
41 print(soup.select('title'))
42 # print(soup.select(id='title'))  报错
43 print(soup.select('#title1'))
44 # print(soup.select(class_='info')) 报错
45 print(soup.select('.info')[0])
46 print(soup.select('div > span'))
47 print(soup.select('div')[1].select('a'))
PyQuery
 1 from pyquery import PyQuery
 2 
 3 def main():
 4     # 可加载一段HTML字符串,或一个HTML文件,或是一个url地址
 5     # doc = PyQuery('<html><title>test</title><head></head><body></body></html>')
 6     # doc = PyQuery(filename='path_to_html_file')
 7     # doc = PyQuery(url='https://www.baidu.com/')
 8 
 9     # html()和text() 获取相应的html块或文本块
10     doc = PyQuery('<html><head><title>test</title></head><head></head><body></body></html>')
11     doc('head').html() # 返回:<title>test</title>
12     doc('head').text()  # 返回:test
13 
14     # 根据html标签来获取元素
15     doc = PyQuery('<html><title>test</title><head></head><body><p>1</p><p>2</p></body></html>')
16     print( doc('p') ) # 返回:<p>1</p><p>2</p>
17     print( doc('p').html() ) # 返回: 1   # 注意:当获取的到的元素不只一个时,html()只返回首个元素内容
18     # eq(index)  根据给定的索引号得到指定元素,若想得到第二个p标签的内容,则可以:
19     print( doc('p').eq(1).html() ) # 返回: 2
20 
21     # filter() 根据类名,id名得到指定的元素
22     doc = PyQuery(r'<html><title>test</title><head></head><body><p id="p_id">1</p><p class="p_class">2</p></body></html>')
23     print( doc('p').filter('#p_id') ) # 返回: <p id="p_id">1</p>
24     print( doc('p').filter('.p_class') ) # 返回: <p class="p_class">2</p>
25 
26     # find() 查找嵌套元素
27     doc = PyQuery(r'<div><p id="p_id">1</p><p class="p_class">2</p></div>')
28     print( doc('div').find('p') )  # 返回: <p id="p_id">1</p><p class="p_class">2</p>
29     print( doc('div').find('p').eq(0) ) # 返回: <p id="p_id">1</p>
30 
31     # 直接根据类名获取元素
32     doc = PyQuery(r'<div><p id="p_id">1</p><p class="p_class">2</p></div>')
33     print( doc('#p_id').html() ) # 返回: 1
34     print( doc('.p_class').html() ) # 返回: 2
35 
36     # 获取属性值
37     doc = PyQuery(r'<div><p id="p_id">1</p><p class="p_class">2</p></div>')
38     print( doc('p').eq(0).attr('id') )
39     # 修改属性值
40     doc('p').eq(1).attr('class', 'p_class test')
41     print( doc('div').html() )
42     # add_class() 为元素增加类
43     doc('p').eq(0).add_class('p_class_id')
44     print(doc('div').html())
45     # hasClass(classname) 判断元素是否包含类
46     print( doc('p').eq(1).has_class('p_class') )
47 
48     # 获取子元素
49     doc = PyQuery(r'<body><div><p id="p_id">1</p><p class="p_class">2<span>tspan</span></p></div></body>')
50     print( doc('div').children() )
51     print( doc('div').children('#p_id') )
52     print( doc('div').children('span') ) # 不能跨多层级获取元素
53     # 获取父元素
54     print(doc('#p_id').parent().eq(0))
55 
56     # clone() 获取节点的拷贝
57     # empty() 移除节点内容
58 
59     # next_all() 返回后面全部元素
60     doc = PyQuery(r'<div><p id="p_id">1</p><p class="p_class">2<span>tspan</span></p></div>')
61     print( doc('div').children('#p_id').next_all() )
62     print(doc('div').children('p:first').next_all())
63     print(doc('div').children('p:last').next_all())
64     # 返回不匹配选择器的元素
65     print( doc('p').not_('#p_id') )
66 
67 
68 
# Run the PyQuery demo only when this listing is executed as a script.
if __name__ == "__main__":
    main()