Python基础爬虫教程(xpath实际操作)

发布时间 2023-11-05 00:23:55作者: dd随风

xpath解析实战

# XPath walkthrough against a local HTML file.
dom = etree.parse("./test.html")

# Absolute path from the document root.
result = dom.xpath("/html/head/title")[0]
# Every <div> anywhere in the document.
result = dom.xpath("//div")
# Index-based selection (XPath indices start at 1).
result = dom.xpath("//div[1]")
# Attribute-based selection: //tag_name[@attr_name="value"]
result = dom.xpath('//div[@class="song"]')
# Hierarchical selection: the <a> tags under <li> of the "tang" div's <ul>.
result = dom.xpath('//div[@class="tang"]/ul/li/a')
result = dom.xpath('//div[@class="tang"]//li/a')
print(result)
# Data extraction via text().
result = dom.xpath('//a[@id="feng"]/text()')  # text directly under the <a>
print(result)
result = dom.xpath('//div[@class="song"]/p/text()')
print(result)

爬取美女图片

import requests
from lxml import etree
import os


# 模拟请求头
# Request headers. BUG FIX: the header name must be "User-Agent" (with a
# hyphen) — the original key "UserAgent" is not a real HTTP header and was
# silently ignored by the server.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76",
}

# Output directory for the downloaded images — create it once up front
# instead of re-checking inside the inner loop; exist_ok avoids a race.
save_dir = "./彼岸图库美女"
os.makedirs(save_dir, exist_ok=True)

# Crawl the first 5 listing pages.
for page in range(1, 6):
    # Page 1 has a different URL pattern from pages 2..5.
    if page == 1:
        url = "http://pic.netbian.com/4kmeinv/"
    else:
        url = f"http://pic.netbian.com/4kmeinv/index_{page}.html"
    url_text = requests.get(url=url, headers=headers)
    url_text.encoding = "gbk"  # site is GBK-encoded; fixes mojibake
    tree = etree.HTML(url_text.text)
    li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    for li in li_list:
        # Detail-page href is site-relative; prefix the site root.
        next_url = "https://pic.netbian.com" + li.xpath("./a/@href")[0]
        title = li.xpath("./a/b/text()")[0]
        res = requests.get(url=next_url, headers=headers).text
        next_tree = etree.HTML(res)
        # Full-resolution image URL taken from the detail page.
        image_src = "https://pic.netbian.com" + next_tree.xpath('//*[@id="img"]/img/@src')[0]
        image_bin = requests.get(url=image_src, headers=headers).content  # raw image bytes
        image_path = save_dir + "/" + title + ".jpg"
        with open(image_path, "wb") as fp:
            fp.write(image_bin)
        print(f"{title}:下载成功~")

小说章节及内容爬取

import requests
from lxml import etree

import os

url = "https://bixuejian.5000yan.com/"

# BUG FIX: header name must be "User-Agent" (hyphenated); "UserAgent" is
# not a valid HTTP header and was ignored by the server.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76"}
# BUG FIX: headers must be passed as a keyword argument — the second
# positional parameter of requests.get() is `params`, so the original
# call never actually sent the headers.
response = requests.get(url, headers=headers)
response.encoding = "utf-8"
tree = etree.HTML(response.text)
# Chapter links: each <a> carries the chapter title (text) and URL (href).
result = tree.xpath("/html/body/div[2]/div[1]/main/ul/li/a")

# BUG FIX: the output directory must exist before opening files inside
# it, otherwise open() raises FileNotFoundError on the first chapter.
os.makedirs("./碧血剑章节详情内容", exist_ok=True)

for i in result:
    title = i.xpath("./text()")[0]
    novel_url = i.xpath("./@href")[0]
    novel_response = requests.get(novel_url, headers=headers)
    novel_response.encoding = "utf-8"
    novel_result = etree.HTML(novel_response.text)
    # Collect every text node inside the chapter body and join them.
    content = novel_result.xpath("/html/body/div[2]/div[1]/main/section/div[1]//text()")
    content_res = "".join(content).strip()
    with open(f"./碧血剑章节详情内容/{title}.txt", "w", encoding="utf-8") as f:
        f.write(content_res)
        print(f"{title}写入成功")

懒加载爬取图片


from lxml import etree
import requests


import os

url = "https://sc.chinaz.com/tupian/meinvtupian.html"
# Shared request headers (was duplicated inline for each request).
# BUG FIX: header name must be "User-Agent" (hyphenated); "UserAgent" is
# not a valid HTTP header and was ignored by the server.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76",
    # The image host checks Referer to block hot-linking.
    "Referer": "https://sc.chinaz.com/",
}
res = requests.get(url=url, headers=headers)
res.encoding = "utf-8"
tree = etree.HTML(res.text)
# Lazy loading: the real image URL lives in the data-original attribute,
# not in the usual src/href.
res = tree.xpath('/html/body/div[3]/div[2]/div/img/@data-original')
print(res)
# BUG FIX: the ./download directory must exist before writing into it.
os.makedirs("./download", exist_ok=True)
for i in res:
    image_url = "https:" + i  # protocol-relative URL -> absolute
    image = requests.get(image_url, headers=headers)
    image_name = i.split("/")[-1]  # last path segment as the file name
    with open(f"./download/{image_name}", "wb") as f:
        f.write(image.content)
    print("下载完成")

爬取简历模板

import requests
from lxml import etree
import os



# Request headers. BUG FIX: the header name must be "User-Agent"
# (hyphenated) — "UserAgent" is not a valid HTTP header and was ignored.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76",
}

# Create the output directory once up front instead of re-checking
# inside the inner loop; exist_ok avoids a race with other runs.
os.makedirs("./站长素材简历", exist_ok=True)

# Crawl the first 5 free-template listing pages.
for i in range(1, 6):
    # Page 1 uses a different URL pattern from pages 2..5.
    if i == 1:
        url = "https://sc.chinaz.com/jianli/free.html"
    else:
        url = f"https://sc.chinaz.com/jianli/free_{i}.html"
    # BUG FIX: headers must be passed as a keyword argument — the second
    # positional parameter of requests.get() is `params`, so the original
    # call never actually sent the headers.
    article_res = requests.get(url, headers=headers)
    article_res.encoding = "utf-8"
    tree = etree.HTML(article_res.text)
    div_list = tree.xpath('//*[@id="container"]/div')
    for b in div_list:
        # The leading ./ scopes the XPath to the current <div>.
        curriculum_vitae_name = b.xpath('./p/a/text()')[0]
        curriculum_vitae_url = b.xpath('./p/a/@href')[0]
        print(curriculum_vitae_name, curriculum_vitae_url)
        curriculum_detail = requests.get(url=curriculum_vitae_url, headers=headers)
        detail_tree = etree.HTML(curriculum_detail.text)
        # First download-mirror link on the detail page.
        download_url = detail_tree.xpath('//*[@id="down"]/div[2]/ul/li[1]/a/@href')[0]
        res_content = requests.get(download_url, headers=headers)
        with open(f"./站长素材简历/{curriculum_vitae_name}.rar", "wb") as f:
            f.write(res_content.content)