[Case Study] Crawling a Tieba Forum and Saving Its Images

Published 2023-07-17 11:20:54 · Author: 蕝戀
import os
import random
import re
import sys
import time
import urllib.parse

import requests
from lxml import etree
from lxml.etree import _Element


class TiebaSpider(object):

    BASE_DIR = os.path.dirname(__file__)

    def __init__(self, url, name):
        self.url = url
        self.name = name
        self.header = {
            "Host": "tieba.baidu.com",
            # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/104.0",
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT)"
        }

        # Create the directory the images will be saved in.
        save_dir = os.path.join(self.BASE_DIR, self.name)
        if not os.path.exists(save_dir):
            print(f"Creating directory: {save_dir}")
            os.mkdir(save_dir)

    def get_data(self, pagesize):
        """Fetch the raw response body for one listing page."""
        # The site checks the User-Agent: requesting with and without a UA
        # returns different content. (Same thing I saw back when building
        # POST tools in Easy Language....)
        # pn is a 0-based offset of 50 posts per page, so page 1 is pn=0.
        resp = requests.get(f"{self.url}{urllib.parse.quote(self.name)}&ie=utf-8&pn={(pagesize - 1) * 50}",
                            headers=self.header)
        return resp.content

    def parse_data(self, data):
        """
        1. Post list: //div[@class="threadlist_title pull_left j_th_tit"]
           If an <i> element can be found under an item, it is a pinned post
           (./i[@title="置顶"]) and should be excluded; see the sketch after
           this method.

           Each post's url:   ./a/@href
           Each post's title: ./a

        2. Visit each post's url and download the images in it (only the
           first page of each post; I won't write out more than that...).
        """
        etree_html: _Element = etree.HTML(data.decode())

        # The class attribute sometimes carries a trailing space, so match
        # both spellings of it.
        tiezi_elements = etree_html.xpath(
            '//div[@class="threadlist_title pull_left j_th_tit"] | //div[@class="threadlist_title pull_left j_th_tit "]')

        clist = []
        for element in tiezi_elements:
            title = element.xpath("./a/text()")[0]
            url = element.xpath("./a/@href")[0]
            clist.append({
                "title": title,
                "url": url
            })
        return clist
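
    # The docstring above says pinned posts should be excluded via
    # ./i[@title="置顶"], but parse_data as written keeps them. A minimal
    # sketch of that filter (my addition; it just follows the docstring's
    # xpath). In parse_data's loop you would skip an element when
    # is_pinned(element) is True.
    @staticmethod
    def is_pinned(element):
        """Return True when the list item carries the pinned-post marker."""
        return bool(element.xpath('./i[@title="置顶"]'))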

    def parse_detail(self, data):
        """Visit each post in the list and collect the image urls inside it."""
        for item in data:
            # Turn the relative href into an absolute url.
            full_url = "https://tieba.baidu.com" + item["url"]
            print(f"Fetching the images in post [{item.get('title')}]...")
            # Send the request.
            resp = requests.get(full_url, headers=self.header)
            content = resp.content.decode()
            # Build the element tree.
            etree_html: _Element = etree.HTML(content)
            # Grab the image urls.
            src_list = etree_html.xpath('//img[@class="BDE_Image"]/@src')
            print(src_list)
            print("*" * 150)

            # Create a subdirectory named after the post, then save its images.
            tiezi_dir = os.path.join(self.BASE_DIR, self.name, item["title"])
            try:
                os.mkdir(tiezi_dir)
            except FileExistsError:
                print(tiezi_dir + " already exists, skipping directory creation.")
            self.save_pics(tiezi_dir, src_list)
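
    # Post titles can contain characters that are illegal in file names
    # (/ \ : * ? " < > | on Windows), in which case the os.mkdir above
    # raises OSError. A minimal sanitizer sketch (my addition; the name
    # _safe_dirname is not from the original post):
    @staticmethod
    def _safe_dirname(title):
        # Replace every forbidden character with "_" and make sure the
        # result is non-empty.
        return re.sub(r'[\\/:*?"<>|]', "_", title).strip() or "untitled"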

    def save_pics(self, dirname, pic_list):
        """
        :param dirname: directory path for the post
        :param pic_list: list of image urls found in the post
        :return:
        """
        print(dirname, pic_list)

        for pic_url in pic_list:
            # Recover the original file name from the url with a regex.
            # The sign parameter looks like
            # sign=69f653d42559252da3171d0c049a032c/f0d931adcbef7609fb3928516bdda3cc7dd99ef8.jpg
            # so the "/" is simply replaced away.
            match = re.search(r'sign=(.*)\?tbpicau=', pic_url)
            if not match:
                # The url does not follow the expected pattern; skip it.
                continue
            pic_name = match.group(1).replace("/", "")
            # Images are binary data, so open the file in "b" mode.
            with open(os.path.join(dirname, pic_name), "wb") as f:
                # Downloading tieba images needs its own request headers,
                # otherwise the server answers 403.
                f.write(requests.get(pic_url, headers={
                    "Host": "tiebapic.baidu.com",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/114.0",
                }).content)
                f.flush()
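
    # An alternative way to name the file (a sketch, not from the original
    # post; the helper name pic_name_from_url is my own): take the last
    # path segment of the url and drop the query string entirely, instead
    # of depending on the sign=...?tbpicau= pattern above.
    @staticmethod
    def pic_name_from_url(pic_url):
        # urlsplit keeps the query out of .path, so "?tbpicau=..." can
        # never leak into the file name.
        return os.path.basename(urllib.parse.urlsplit(pic_url).path)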

    def run(self, size):
        """No "next page" xpath here; nothing fancy, just loop a fixed number of times."""
        for i in range(1, size + 1):
            print(f"Reading page {i} of the [{self.name}] tieba....")
            data = self.get_data(i)

            # Collect the post urls on this listing page.
            parse_data = self.parse_data(data)
            # Parse each post, extract its image urls and save them.
            self.parse_detail(parse_data)
            # Sleep for a bit so the anti-crawler checks don't flag us for
            # going too fast...
            time.sleep(random.randint(1, 5))
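
    # If you did want to follow the pager instead of counting pages, a
    # sketch along these lines would work (my addition; the "next" class on
    # the link is an assumption about the current page markup):
    @staticmethod
    def next_page_url(etree_html):
        # Tieba's next-page href is protocol-relative ("//tieba.baidu.com/...");
        # returns None on the last page.
        hrefs = etree_html.xpath('//a[contains(@class, "next")]/@href')
        return "https:" + hrefs[0] if hrefs else None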


if __name__ == '__main__':

    """获取指定贴吧的每个帖子中的图片并保存"""

    if len(sys.argv) >= 3:
        name = sys.argv[1]
        pages = int(sys.argv[2])
    else:
        pages = 1
        name = "海贼王"

    url = "https://tieba.baidu.com/f?kw="

    TiebaSpider(url, name).run(pages)
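
Usage: pass the tieba name and the number of listing pages on the command
line, e.g. (assuming the script is saved as tieba_spider.py):

    python tieba_spider.py 海贼王 2

With no arguments it falls back to one page of the 海贼王 tieba.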