[Python Crawler] Downloading Images with the Scrapy Framework: ZOL Desktop Wallpapers (Pure Case Study)

Published 2023-12-13 15:47:24 · Author: 小C学安全

Spider code

import scrapy
from scrapy import Request


class BizhizolSpider(scrapy.Spider):
    name = "bizhizol"
    allowed_domains = ["zol.com.cn"]
    start_urls = ["https://desk.zol.com.cn/youxi/"]

    def parse(self, response, **kwargs):
        res_list_li = response.xpath('//*[@class="pic-list2  clearfix"]/li')
        for res_list in res_list_li:
            img_url = res_list.xpath('./a/@href').extract_first()
            # Skip entries whose link points at an .exe download instead of a detail page
            if img_url.endswith(".exe"):
                continue
            # URL joining in plain Python:
            # from urllib.parse import urljoin
            # print(urljoin(response.url, img_url))
            # Scrapy's built-in helper does the same thing (it calls urllib internally)
            child_url = response.urljoin(img_url)

            # The detail page must be requested separately to reach the full-size image
            yield Request(
                url=child_url,
                callback=self.parse_detail,  # GET is the default method
            )

    def parse_detail(self, response, **kwargs):
        img_src = response.xpath("//*[@id='bigImg']/@src").extract_first()
        yield {
            "img_src": img_src
        }
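The links on the list page are relative, so they must be joined against the page URL before a new request can be made. A minimal sketch of the two equivalent approaches mentioned in the comments above (both URLs here are made-up examples for illustration):

from urllib.parse import urljoin

page_url = "https://desk.zol.com.cn/youxi/"   # stands in for response.url
href = "/bizhi/9109_111583_2.html"            # a hypothetical relative link

# Plain-Python join:
print(urljoin(page_url, href))
# -> https://desk.zol.com.cn/bizhi/9109_111583_2.html

# Inside a Scrapy callback, response.urljoin(href) returns the same result;
# it delegates to urllib.parse.urljoin under the hood.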

Pipeline code

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


import scrapy
# ImagesPipeline is the pipeline dedicated to image downloads
from scrapy.pipelines.images import ImagesPipeline


class BizhiPipeline:
    def process_item(self, item, spider):
        return item


class MyTuPipeline(ImagesPipeline):
    # 1. Send the request (downloads the image / file / video / ...)
    def get_media_requests(self, item, info):
        url = item['img_src']
        headers = {
            'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
            'Referer': 'https://desk.zol.com.cn/showpic/1920x1080_100899_144.html',
            'sec-ch-ua-mobile': '?0',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'sec-ch-ua-platform': '"Windows"',
        }
        # Yielding a Request object is all that is needed here; the URL is
        # stashed in meta so that file_path() can read it back later
        yield scrapy.Request(url=url, headers=headers, meta={"sss": url})

    # 2. Storage path for the image
    # Full path = IMAGES_STORE + the return value of file_path()
    # Any missing folders along the way are created automatically
    def file_path(self, request, response=None, info=None, *, item=None):
        # Subfolder under IMAGES_STORE (keep it relative, no leading slash)
        img_path = "youxi"
        # Build the file name.
        # Pitfall: response.url cannot be used reliably here (response may be
        # None), so take the URL from the item or from request.meta instead:
        file_name = item['img_src'].split("/")[-1]      # via the item
        print("item:", file_name)
        file_name = request.meta['sss'].split("/")[-1]  # via meta
        print("meta:", file_name)

        real_path = img_path + "/" + file_name  # join folder and file name
        return real_path  # return the storage path

    # 3. The item may need to be updated here
    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_failure) tuples
        for ok, file_info in results:
            if ok:
                print(file_info['path'])
        return item  # always return the item so it reaches the next pipeline
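
For the pipelines to run at all, they have to be registered in settings.py, and ImagesPipeline needs a storage root. A minimal sketch, assuming the project module is named bizhi (inferred from the class names above); the priority numbers and the IMAGES_STORE path are illustrative choices, not values from the original post:

# settings.py (excerpt)

ITEM_PIPELINES = {
    # Lower number = runs earlier; the values here are illustrative
    "bizhi.pipelines.MyTuPipeline": 300,   # downloads the images
    "bizhi.pipelines.BizhiPipeline": 400,  # plain pass-through pipeline
}

# Root folder for ImagesPipeline; file_path() return values are relative to it
IMAGES_STORE = "./img"  # assumed path, pick any writable directory

# Note: ImagesPipeline additionally requires Pillow (pip install Pillow).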

Results
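
With the pipelines registered, the crawl is started from the project root using the spider's name attribute:

scrapy crawl bizhizol

The downloaded wallpapers then land under IMAGES_STORE in the youxi/ subfolder returned by file_path().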