"""Scratch examples of Scrapy XPath selectors run against an inline HTML page.

Only the fixture (``html``) and the ``response`` built from it are live code.
Each commented-out snippet below is an independent example: uncomment one at
a time to see what the expression selects.
"""
# NOTE(review): HtmlXPathSelector is the legacy selector class; newer Scrapy
# releases may no longer provide it -- confirm against the installed version.
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse

# Minimal document used as the target of every example below.
html = """<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<ul>
<li class="item-"><a id='i1' href="link.html">first item</a></li>
<li class="item-0"><a id='i2' href="llink.html">first item</a></li>
<li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
</ul>
<div><a href="llink2.html">second item</a></div>
</body>
</html>
"""

# Wrap the markup in a response object so selectors can be built from it
# without making a network request. The URL was 'http:example.com'; the
# missing '//' after the scheme is restored here.
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')

# Legacy selector API.
# hxs = HtmlXPathSelector(response=response)
# print(hxs)

# Every <a> element in the document.
# hxs = Selector(response=response).xpath('//a')
# print(hxs)

# Every <a> that is the first <a> child of its parent.
# hxs = Selector(response).xpath('//a[1]')
# print(hxs)

# <a> elements that have an id attribute.
# hxs = Selector(response).xpath('//a[@id]')
# print(hxs)

# <a> elements whose id equals "i1".
# hxs = Selector(response).xpath('//a[@id="i1"]')
# print(hxs)

# hxs = Selector(response).xpath('//a[@id="i1"]')
# print(hxs)

# Two attribute predicates combined.
# hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
# print(hxs)

# href contains the given substring.
# hxs = Selector(response=response).xpath('//a[contains(@href,"link")]')
# print(hxs)

# href starts with the given prefix.
# hxs = Selector(response=response).xpath('//a[starts-with(@href,"link")]')
# print(hxs)

# Regular-expression match on href (raw string keeps the \d escape intact).
# hxs = Selector(response=response).xpath(r'//a[re:test(@href,"llink\d+")]')
# print(hxs)

# Text of the matching links.
# hxs = Selector(response=response).xpath(r'//a[re:test(@href,"llink\d+")]/text()').extract()
# print(hxs)

# href values of the matching links.
# hxs = Selector(response=response).xpath(r'//a[re:test(@href,"llink\d+")]/@href').extract()
# print(hxs)

# Absolute path, descending one level at a time.
# hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
# print(hxs)

# Take only the first result.
# hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
# print(hxs)

# Relative queries evaluated from each <li> in turn.
# ul_list = Selector(response=response).xpath('//body/ul/li')
# for item in ul_list:
#     v = item.xpath('./a/span')
#     # or
#     # v = item.xpath('a/span')
#     # or
#     # v = item.xpath('*/a/span')
#     # NOTE(review): '*/a/span' looks one level deeper (an <a> inside any
#     # child of the <li>), so it is not equivalent to the two forms above.
#     print(v)