scrapy当当网练习

发布时间 2023-10-04 16:20:07作者: sgj191024
    def parse(self, response):
        """Parse one dangdang listing page and yield a ScrapyproItem per book.

        The original paste collapsed the loop body into comment lines; this is
        the reconstructed, runnable form.
        """
        print('当当网')
        li = response.xpath('//ul[@id="component_59"]/li')
        # src, name and price share the same parent <li>. Lazy-loaded images
        # keep the real URL in data-original; the first <li> has no
        # data-original, so fall back to src for it.
        for item in li:
            srcFirst = item.xpath('./a/img/@src')
            src = item.xpath('./a/img/@data-original')
            name = item.xpath('./a/img/@alt')
            price = item.xpath(
                './p[@class="price"]/span[@class="search_now_price"]/text()')
            if src.extract_first():
                resSrc = 'http:' + src.extract_first()
            else:
                resSrc = 'http:' + srcFirst.extract_first()
            resName = name.extract_first()
            resPrice = price.extract_first()
            print(resSrc, resName, resPrice)
            book = ScrapyproItem(src=resSrc, name=resName, price=resPrice)
            # hand the item off to the configured pipelines
            yield book

  settings.py

# Register the JSON-writing pipeline (the number is the priority;
# lower runs earlier).
ITEM_PIPELINES = {
    'scrapyPro.pipelines.ScrapyproPipeline': 300,
}

  items.py

class ScrapyproItem(scrapy.Item):
    """Container for one dangdang book: cover URL, title and price."""

    src = scrapy.Field()    # cover image URL ("http:" + data-original/src)
    name = scrapy.Field()   # book title (img alt text)
    price = scrapy.Field()  # price text from search_now_price span

  pipelines.py

class ScrapyproPipeline:
    """Item pipeline that appends each item to book.json, one JSON object
    per line (JSON Lines)."""

    def process_item(self, item, spider):
        """Serialize `item` to book.json and pass it on unchanged.

        Fix: the original wrote ``str(item)`` — a Python repr, not valid
        JSON — into a .json file. Serialize the field dict instead.
        """
        import json

        # NOTE(review): opening the file once per item is slow; scrapy's
        # open_spider/close_spider hooks would be the better place for the
        # open/close — kept minimal here since only process_item is shown.
        with open('book.json', 'a', encoding='utf-8') as fp:
            fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

  新定义一个pipeline用来下载图片:

class DangDownloadPicture:
    """Item pipeline that downloads each book cover to ./books/<name>.jpg.

    Fixes: the original called ``urllib.request.urlretrieve`` without any
    visible import (NameError at runtime), and urlretrieve fails if the
    target directory does not already exist.
    """

    def process_item(self, item, spider):
        """Download the image at item['src'] and pass the item on unchanged."""
        import os
        import urllib.request

        # urlretrieve does not create directories — make sure ./books exists.
        os.makedirs('./books', exist_ok=True)
        url = item.get('src')
        # NOTE(review): assumes item['name'] contains no path separators —
        # a title with '/' would escape ./books; confirm or sanitize upstream.
        name = './books/' + item.get('name') + '.jpg'
        urllib.request.urlretrieve(url=url, filename=name)
        return item

  settings.py  301表示优先级,数字越小优先级越高

# Fix: ITEM_PIPELINES is a single dict — listing only the picture pipeline
# here would deregister the JSON pipeline added earlier. Both entries must
# appear together; the lower number (300) runs first.
ITEM_PIPELINES = {
    'scrapyPro.pipelines.ScrapyproPipeline': 300,
    'scrapyPro.pipelines.DangDownloadPicture': 301,
}

  下载100页的图片和json数据:

class DangSpider(scrapy.Spider):
    """Crawl 100 listing pages of dangdang category cp01.01.02.00.00.00,
    yielding one ScrapyproItem per book plus follow-up page requests.

    Fixes: stray mid-function ``pass``; crash (``'http:' + None``) when a
    <li> has neither data-original nor src; redundant double
    ``extract_first()`` calls.
    """

    name = 'dang'
    allowed_domains = ['category.dangdang.com']
    start_urls = ['http://category.dangdang.com/cp01.01.02.00.00.00.html']
    # pages >= 2 follow the pattern pgN-cp01.01.02.00.00.00.html, e.g.
    # http://category.dangdang.com/pg2-cp01.01.02.00.00.00.html
    base_url = 'http://category.dangdang.com/pg'
    page = 1

    def parse(self, response):
        print('当当网')
        for item in response.xpath('//ul[@id="component_59"]/li'):
            # Lazy-loaded images keep the real URL in data-original; the
            # first <li> only has src, so fall back to it.
            src = (item.xpath('./a/img/@data-original').extract_first()
                   or item.xpath('./a/img/@src').extract_first())
            if src is None:
                # skip malformed entries instead of crashing on 'http:' + None
                continue
            resSrc = 'http:' + src
            resName = item.xpath('./a/img/@alt').extract_first()
            resPrice = item.xpath(
                './p[@class="price"]/span[@class="search_now_price"]/text()'
            ).extract_first()
            print(resSrc, resName, resPrice)
            # hand the item off to the configured pipelines
            yield ScrapyproItem(src=resSrc, name=resName, price=resPrice)

        if self.page < 100:
            self.page += 1
            url = self.base_url + str(self.page) + '-cp01.01.02.00.00.00.html'
            yield scrapy.Request(url=url, callback=self.parse)