scrapy框架之选择器

发布时间: 2023-12-08 15:17:42 | 作者: 木屐呀
 1 from scrapy.selector import Selector, HtmlXPathSelector
 2 from scrapy.http import HtmlResponse
 3 html = """<!DOCTYPE html>
 4 <html>
 5     <head lang="en">
 6         <meta charset="UTF-8">
 7         <title></title>
 8     </head>
 9     <body>
10         <ul>
11             <li class="item-"><a id='i1' href="link.html">first item</a></li>
12             <li class="item-0"><a id='i2' href="llink.html">first item</a></li>
13             <li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
14         </ul>
15         <div><a href="llink2.html">second item</a></div>
16     </body>
17 </html>
18 """
19 response = HtmlResponse(url='http:example.com',body=html,encoding='utf-8')
20 # hxs = HtmlXPathSelector(response=response)
21 # print(hxs)
22 
23 # hxs = Selector(response=response).xpath('//a')
24 # print(hxs)
25 
26 # hxs = Selector(response).xpath('//a[1]')
27 # print(hxs)
28 
29 # hxs = Selector(response).xpath('//a[@id]')
30 # print(hxs)
31 
32 # hxs = Selector(response).xpath('//a[@id="i1"]')
33 # print(hxs)
34 
35 # hxs = Selector(response).xpath('//a[@id="i1"]')
36 # print(hxs)
37 
38 # hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]') #两个属性
39 # print(hxs)
40 
41 # hxs = Selector(response=response).xpath('//a[contains(@href,"link")]') #包含
42 # print(hxs)
43 
44 # hxs = Selector(response=response).xpath('//a[starts-with(@href,"link")]') #开头
45 # print(hxs)
46 
47 # hxs = Selector(response=response).xpath('//a[re:test(@href,"llink\d+")]')
48 # print(hxs)
49 
50 # hxs = Selector(response=response).xpath('//a[re:test(@href,"llink\d+")]/text()').extract() #文本
51 # print(hxs)
52 
53 # hxs = Selector(response=response).xpath('//a[re:test(@href,"llink\d+")]/@href').extract() #href连接
54 # print(hxs)
55 
56 # hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract() #一层一层递进
57 # print(hxs)
58 
59 # hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first() #取第一个
60 # print(hxs)
61 
62 # ul_list = Selector(response=response).xpath('//body/ul/li')
63 # for item in ul_list:
64 #     v = item.xpath('./a/span')
65 #     # 或
66 #     # v = item.xpath('a/span')
67 #     # 或
68 #     # v = item.xpath('*/a/span')
69 #     print(v)