Big Data Analysis and Visualization: A Baidu Tieba Crawler

Published: 2023-12-30 20:35:05 · Author: Ivan丶c


import csv
import datetime
import json
from urllib import request, parse
import time
import random
from fake_useragent import UserAgent
from lxml import etree
import requests


# Define a crawler class
class TiebaSpider(object):
    # Initialize the url attribute (the query string is filled in later)
    def __init__(self):
        self.url = 'http://tieba.baidu.com/f?{}'

    # 1. Request function: fetch the page with a random User-Agent
    def get_html(self, url):
        ua = UserAgent()
        response = requests.get(url=url, headers={'User-Agent': ua.random})
        # The page is UTF-8, so decode it directly. Tieba delivers part of the
        # thread list wrapped in HTML comments, so strip the comment markers
        # to keep that content visible to the parser.
        html = response.content.decode('utf-8').replace("<!--", "").replace("-->", "")
        return html

    # 2. Parse function: extract thread titles and links with lxml + XPath
    def parse_html(self, html):
        eroot = etree.HTML(html)
        # Select the title <a> node of each thread in the thread list
        li_list = eroot.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
        data = []
        for li in li_list:
            item = {}
            item["title"] = li.xpath('./text()')[0]
            item["link"] = 'https://tieba.baidu.com' + li.xpath('./@href')[0]
            data.append(item)
        return data
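
    # Example of one parsed item (the title and thread id below are made-up
    # placeholders; real links have the form https://tieba.baidu.com/p/<thread id>):
    # {'title': 'some thread title', 'link': 'https://tieba.baidu.com/p/1234567890'}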

    # Save function: write the parsed items of one page to a CSV file
    def save_csv(self, data, filename):
        # utf_8_sig adds a BOM so Excel opens the file without garbled characters
        with open(filename, 'a', newline='', encoding='utf_8_sig') as csv_file:
            # CSV column names
            fieldnames = ['title', 'link']
            # Create the CSV writer
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            # Write the header row
            writer.writeheader()
            # Write the data rows
            for row in data:
                writer.writerow(row)
        print(f'Data has been written to {filename}')

    # Earlier version: save each item as one JSON line
    # def save_items(self, items, filename):
    #     for data in items:
    #         # Serialize the dict to JSON, then decode the escaped characters
    #         data_str = json.dumps(data).encode("utf-8")
    #         data_str = data_str.decode("unicode_escape")
    #         self.write(data_str, filename)
    #
    # def write(self, item, filename):
    #     # Open the file for appending
    #     with open(filename, "a", encoding="utf-8") as file:
    #         # Write one item per line
    #         file.write(item)
    #         file.write("\n")

    # 3. Save-file function (earlier version: save the raw HTML)
    # def save_html(self, filename, html):
    #     with open(filename, 'w') as f:
    #         f.write(html)

    # 4. Entry function
    def run(self):
        # current_datetime = datetime.datetime.now()
        # Build a filename accurate to the hour and minute
        # filename = current_datetime.strftime("%Y-%m-%d-%H-%M.csv")
        name = input('Enter the tieba name: ')
        begin = int(input('Enter the start page: '))
        stop = int(input('Enter the end page: '))
        # +1 so the end page itself is included (range excludes its endpoint)
        for page in range(begin, stop + 1):
            # Each listing page holds 50 threads, so pn is the thread offset
            pn = (page - 1) * 50
            params = {
                'kw': name,
                'pn': str(pn)
            }
            # Build the query string and splice it into the URL
            params = parse.urlencode(params)
            url = self.url.format(params)
            # Send the request
            html = self.get_html(url)
            items = self.parse_html(html)
            # self.save_items(items, filename)
            filename = '{}-page{}.csv'.format(name, page)
            self.save_csv(items, filename)
            # Earlier version: save the raw HTML of each page
            # filename = '{}-page{}.html'.format(name, page)
            # self.save_html(filename, html)
            # Progress message
            print('Page %d scraped successfully' % page)
            # Sleep a random 1-2 seconds after each page
            time.sleep(random.randint(1, 2))


# Start the crawler when the file is run as a script
if __name__ == '__main__':
    start = time.time()
    spider = TiebaSpider()  # Instantiate a spider object
    spider.run()  # Call the entry function
    end = time.time()
    # Report how long the program took
    print('Elapsed time: %.2f s' % (end - start))  # crawler run time
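
For reference, the short sketch below prints the URL that each loop iteration of run() ends up requesting. The tieba name 'python' and the page range 1-2 are made-up example inputs; the output simply follows from parse.urlencode and the pn offset used above.

from urllib import parse

base_url = 'http://tieba.baidu.com/f?{}'
for page in range(1, 3):
    pn = (page - 1) * 50  # each listing page holds 50 threads
    query = parse.urlencode({'kw': 'python', 'pn': str(pn)})
    print(base_url.format(query))
# Expected output:
# http://tieba.baidu.com/f?kw=python&pn=0
# http://tieba.baidu.com/f?kw=python&pn=50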