06 - Using Scrapy

Published 2024-01-02 11:28:23  Author: Way*yy

Parsing data with Scrapy

# Run the spider from the command line
scrapy crawl cnblogs

##### Or write a main.py in the project directory and run it instead
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'cnblogs', '--nolog'])

#### Key points
1 The response object has a css() method and an xpath() method
	-css() takes a CSS selector       response.css('')
    -xpath() takes an XPath selector  response.xpath('')
2 Key point 1 (see the selector sketch after this list):
	-XPath, take text content
	'.//a[contains(@class,"link-title")]/text()'
    -XPath, take an attribute
    './/a[contains(@class,"link-title")]/@href'
    -CSS, take text content
    'a.link-title::text'
    -CSS, take an attribute
    'img.image-scale::attr(src)'
3 Key point 2:
	.extract_first()  take the first match
    .extract()        take all matches (as a list)
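A minimal, self-contained sketch of these selector APIs, run against a small hypothetical HTML snippet (no network needed; the class names only mirror the examples above, they are not a real cnblogs page):

from scrapy.selector import Selector

html = '''
<div>
    <a class="link-title" href="/post/1">Hello Scrapy</a>
    <img class="image-scale" src="/img/avatar.png">
</div>
'''
sel = Selector(text=html)

# CSS: text and attribute
print(sel.css('a.link-title::text').extract_first())          # Hello Scrapy
print(sel.css('img.image-scale::attr(src)').extract_first())  # /img/avatar.png

# XPath: text and attribute
print(sel.xpath('//a[contains(@class,"link-title")]/text()').extract_first())
print(sel.xpath('//a[contains(@class,"link-title")]/@href').extract_first())

# extract_first() returns the first match; extract() returns a list of all matches
print(sel.css('a::text').extract())

The spider below applies the same selectors to the real cnblogs homepage: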
import scrapy


class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["https://www.cnblogs.com"]

    def parse(self, response):
        article_list = response.css('article.post-item')
        for item in article_list:
            name = item.css("a.post-item-title::text").extract_first()
            article = item.css("a.post-item-author span::text").extract_first()
            article_img = item.css("img.avatar::attr(src)").extract_first()
            time = item.css("span.post-meta-item span::text").extract_first()
            likes = item.css("a.post-meta-item span::text").extract_first()
            comments = item.xpath(".//section/footer/a[3]/span/text()").extract_first()
            num_views = item.xpath(".//section/footer/a[4]/span/text()").extract_first()
            desc = item.xpath('./section/div/p/text()').extract()  # the summary text may sit in the second text node
            desc_content = desc[0].replace('\n', '').replace(' ', '')
            if not desc_content:
                desc_content = desc[1].replace('\n', '').replace(' ', '')
            desc_add = item.css('section > div > a::attr(href)').extract_first()
            print(
                '''
            Article title: %s
            Author: %s
            Summary: %s
            Article URL: %s
            Avatar: %s
            Published: %s
            Likes: %s
            Comments: %s
            Views: %s
            ''' % (name, article, desc_content, desc_add, article_img, time, likes, comments, num_views))

Configuration file (settings.py)

#### Basic settings
# Project name
BOT_NAME = "scrapy_demo"
# Where the spiders live
SPIDER_MODULES = ["scrapy_demo.spiders"]
NEWSPIDER_MODULE = "scrapy_demo.spiders"

# Remember this one: the log level
LOG_LEVEL = 'ERROR'


# USER_AGENT sent with every request
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"

# Whether to obey robots.txt
ROBOTSTXT_OBEY = False



# Default request headers
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Spider middlewares
#SPIDER_MIDDLEWARES = {
#    "scrapy_demo.middlewares.ScrapyDemoSpiderMiddleware": 543,
#}

# Downloader middlewares
#DOWNLOADER_MIDDLEWARES = {
#    "scrapy_demo.middlewares.ScrapyDemoDownloaderMiddleware": 543,
#}



# Persistence (item pipelines)
#ITEM_PIPELINES = {
#    "scrapy_demo.pipelines.ScrapyDemoPipeline": 300,
#}



### Advanced settings (to improve crawl efficiency)
# 1 Increase concurrency (default is 16):
By default Scrapy runs 16 concurrent requests; you can raise this as needed. In settings.py:
CONCURRENT_REQUESTS = 100
This sets the concurrency to 100.

# 2 Raise the log level:
Scrapy produces a lot of log output while running; to reduce CPU usage, set the log level to INFO or ERROR. In settings.py:
LOG_LEVEL = 'INFO'


# 3 Disable cookies:
If you don't actually need cookies, disable them while crawling to reduce CPU usage and improve efficiency. In settings.py:
COOKIES_ENABLED = False

# 4 Disable retries:
Re-requesting failed HTTP requests (retrying) slows the crawl down, so retries can be disabled. In settings.py:
RETRY_ENABLED = False

# 5 Reduce the download timeout:
When crawling very slow links, a smaller download timeout lets stuck requests be abandoned quickly, improving efficiency. In settings.py:
DOWNLOAD_TIMEOUT = 10  # timeout of 10 seconds
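
Put together, a minimal settings.py sketch applying the five tweaks above (the values are the ones used in this section; tune them for your own project):

# settings.py -- efficiency-related options
CONCURRENT_REQUESTS = 100   # raise concurrency from the default 16
LOG_LEVEL = 'ERROR'         # less log output means less CPU spent on it
COOKIES_ENABLED = False     # skip cookie handling if the site does not need it
RETRY_ENABLED = False       # do not re-request failed responses
DOWNLOAD_TIMEOUT = 10       # give up on slow responses after 10 seconds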

Crawling all of cnblogs --> crawling article details --> passing data between requests

# Whole-site crawl:
	Crawl every page
    	-parse out the next-page URL: yield Request(url=next, callback=self.parse)

    Crawl the article detail pages
    	-parse out the detail-page URL: yield Request(url=url, callback=self.detail_parser)

    Pass data between Requests
    	yield Request(url=url, meta={'item': item})
        in the callback, read it back with response.meta.get('item')
from scrapy import Request

import scrapy


class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["https://www.cnblogs.com"]

    def parse(self, response):
        article_list = response.css('article.post-item')
        for item in article_list:
            name = item.css("a.post-item-title::text").extract_first()
            article = item.css("a.post-item-author span::text").extract_first()
            article_img = item.css("img.avatar::attr(src)").extract_first()
            time = item.css("span.post-meta-item span::text").extract_first()
            likes = item.css("a.post-meta-item span::text").extract_first()
            comments = item.xpath(".//section/footer/a[3]/span/text()").extract_first()
            num_views = item.xpath(".//section/footer/a[4]/span/text()").extract_first()
            desc = item.xpath('./section/div/p/text()').extract()  # the summary text may sit in the second text node
            desc_content = desc[0].replace('\n', '').replace(' ', '')
            if not desc_content:
                desc_content = desc[1].replace('\n', '').replace(' ', '')
            desc_add = item.css('section > div > a::attr(href)').extract_first()
            print(
                '''
            Article title: %s
            Author: %s
            Summary: %s
            Article URL: %s
            Avatar: %s
            Published: %s
            Likes: %s
            Comments: %s
            Views: %s
            ''' % (name, article, desc_content, desc_add, article_img, time, likes, comments, num_views))
            item = {"name": name, "url": desc_add, "img": article_img, "text": None}
            yield Request(url=desc_add, callback=self.detail_parser, meta={'item': item})
        n_next = 'https://www.cnblogs.com' + response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first()
        print(n_next)
        yield Request(url=n_next, callback=self.parse)

    def detail_parser(self, response):
        item = response.meta.get("item")
        text = response.css('#cnblogs_post_body').extract_first()
        print(text)
        item['text'] = text  # the item is filled in here; persisting it is covered in the next section

Persistence

# Method 1: (parse must return a value, as a list of dicts ---> then the crawl command can save it to JSON, CSV, ..., as sketched below)
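
A minimal sketch of that command-based export, reusing the main.py approach from the top of this section (the output filename is just an example):

# main.py -- export whatever parse returns/yields to a JSON file
from scrapy.cmdline import execute

# equivalent to running: scrapy crawl cnblogs -o cnblogs.json
execute(['scrapy', 'crawl', 'cnblogs', '-o', 'cnblogs.json'])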

# cnblogs.py
from scrapy import Request
from requests_demo.items import RequestsDemoItem
import scrapy


class CnblogsSpider(scrapy.Spider):
    name = "cnblogs"
    allowed_domains = ["www.cnblogs.com"]
    start_urls = ["https://www.cnblogs.com"]

    def parse(self, response):
        article_list = response.css('article.post-item')
        for item in article_list:
            name = item.css("a.post-item-title::text").extract_first()
            article = item.css("a.post-item-author span::text").extract_first()
            article_img = item.css("img.avatar::attr(src)").extract_first()
            time = item.css("span.post-meta-item span::text").extract_first()
            likes = item.css("a.post-meta-item span::text").extract_first()
            comments = item.xpath(".//section/footer/a[3]/span/text()").extract_first()
            num_views = item.xpath(".//section/footer/a[4]/span/text()").extract_first()
            desc = item.xpath('./section/div/p/text()').extract()  # the summary text may sit in the second text node
            desc_content = desc[0].replace('\n', '').replace(' ', '')
            if not desc_content:
                desc_content = desc[1].replace('\n', '').replace(' ', '')
            desc_add = item.css('section > div > a::attr(href)').extract_first()
            item = RequestsDemoItem(name=name, article=article, desc_content=desc_content, desc_add=desc_add,
                                    article_img=article_img, time=time, likes=likes, comments=comments,
                                    num_views=num_views, text=None)
            yield Request(url=desc_add, callback=self.detail_parser, meta={'item': item})
        n_next = 'https://www.cnblogs.com' + response.xpath('//div[@class="pager"]/a[last()]/@href').extract_first()
        yield Request(url=n_next, callback=self.parse)

    def detail_parser(self, response):
        item = response.meta.get("item")
        text = response.css('#cnblogs_post_body').extract_first()
        item['text'] = text
        print(item)
        yield item

        
# items.py
import scrapy


class RequestsDemoItem(scrapy.Item):
    # name, article, desc_content, desc_add, article_img, time, likes, comments, num_views
    name = scrapy.Field()
    article = scrapy.Field()
    desc_content = scrapy.Field()
    desc_add = scrapy.Field()
    article_img = scrapy.Field()
    time = scrapy.Field()
    likes = scrapy.Field()
    comments = scrapy.Field()
    num_views = scrapy.Field()
    text = scrapy.Field()
    
# pipelines.py  save items to a file

from itemadapter import ItemAdapter


class RequestsDemoPipeline:
    # This part matters: manage the file handle across the spider's lifetime
    def __init__(self):
        self.f = None

    def open_spider(self, spider):
        print('spider opened')
        # open the output file once
        self.f = open('cnblogs.text', 'at', encoding='utf-8')

    def process_item(self, item, spider):
        self.f.write('Article title: %s, Author: %s, Author avatar: %s\n' % (item['name'], item['article'], item['article_img']))

        return item

    def close_spider(self, spider):
        print('spider closed')
        self.f.close()
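
For the pipeline to actually run, it must be registered in settings.py (the ITEM_PIPELINES block shown commented out in the configuration section above); the module path here assumes the project is named requests_demo, matching the imports in this example:

# settings.py -- enable the pipeline; lower numbers run earlier (range 0-1000)
ITEM_PIPELINES = {
    "requests_demo.pipelines.RequestsDemoPipeline": 300,
}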