1. Spider file
import re

import scrapy


class MeiShiSpider(scrapy.Spider):
    name = 'meishi'
    allowed_domains = ['baidu.com']
    start_urls = ['https://tieba.baidu.com/f?kw=美食']

    def parse(self, response):
        # Narrow the page down to the block of HTML that contains the video links
        data = re.findall(r'(<ul id="thread_list".*?)<div class="thread_list_bottom clearfix">',
                          response.text, re.S)[0]
        # Extract every video URL on this page
        video_urls = re.findall(r'data-video="(.*?)"', data)
        for url in video_urls:
            item = {}
            # Derive the file name (with extension) from the video URL
            item['name'] = url.split('?')[0].split('/')[-1]
            item['url'] = url  # video URL
            yield item
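To see what the file-name extraction does, here is a minimal standalone sketch; the URL below is made up purely for illustration:

# Hypothetical video URL, for illustration only
url = 'https://example.com/videos/abc123.mp4?auth_key=xyz'
name = url.split('?')[0].split('/')[-1]  # drop the query string, keep the last path segment
print(name)  # -> abc123.mp4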
2. Pipeline file
import scrapy
from scrapy.pipelines.files import FilesPipeline


# Custom pipeline class that inherits from the built-in FilesPipeline
class VideoDownloadPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # Request each video URL in turn; meta carries the file name along
        yield scrapy.Request(url=item['url'], meta={'name': item['name']})

    def file_path(self, request, response=None, info=None, *, item=None):
        filename = request.meta['name']  # read the file name back out of meta
        return filename  # the downloaded video is saved under this name

    def item_completed(self, results, item, info):
        return item
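FilesPipeline calls item_completed with a list of (success, info) tuples, one per download request. The version above returns every item unchanged; if you would rather discard items whose video failed to download, a minimal variant (not part of the original code) could look like this:

from scrapy.exceptions import DropItem


    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_failure) tuples
        if not any(ok for ok, _ in results):
            raise DropItem(f"video download failed: {item['url']}")
        return item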
3. Settings file
from fake_useragent import UserAgent  # requires the fake-useragent package

USER_AGENT = UserAgent().random  # random User-Agent header
LOG_LEVEL = 'WARNING'            # log level
FILES_STORE = r'G:\视频'          # folder the downloaded videos are saved to
CONCURRENT_REQUESTS = 3          # number of concurrent requests
DOWNLOAD_DELAY = 1               # download delay in seconds
COOKIES_ENABLED = False          # disable cookies
ITEM_PIPELINES = {
    'video.pipelines.VideoDownloadPipeline': 300,  # enable the pipeline
}
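With everything in place, the spider is started with scrapy crawl meishi from the project root. If you prefer launching it from an IDE, a small launcher script works too; the file name run.py below is an assumption, not part of the original project:

# run.py -- hypothetical launcher script placed at the project root
from scrapy import cmdline

# Equivalent to typing `scrapy crawl meishi` on the command line
cmdline.execute('scrapy crawl meishi'.split())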