大数据分析与可视化 之 猫眼电影爬虫

发布时间 2023-12-30 20:35:05 作者: Ivan丶c

大数据分析与可视化 之 猫眼电影爬虫

import random
import time
import re
import requests
import csv

class MaoyanSpider(object):
    """Crawler for the Maoyan Top-100 movie board.

    Fetches each paginated board page, extracts (title, stars, release date)
    tuples with a regex, and appends the rows to ``maoyan.csv``.
    """

    def __init__(self):
        # Board URL template; ``{}`` receives the page offset
        # (0, 10, 20, ... — ten movies per page).
        self.url = 'https://www.maoyan.com/board/4?offset={}'

    def get_html(self, url):
        """Download one board page and hand its HTML to ``parse_html``.

        NOTE(review): Cookie and Referer are copied from a real browser
        session and will expire — refresh them when Maoyan starts answering
        with its human-verification page instead of the board.
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            # Fixed: HTTP header names must not contain spaces.
            # 'Accept - Encoding', 'Accept - Language' and 'Cache - Control'
            # were invalid names (and mangled values) that servers ignore.
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': '__mta=142521997.1695026370028.1695105604302.1695106030738.15; uuid_n_v=v1; uuid=E13BF08055FE11EEBD786D8B351D1BEB69DF40D3DDA545AF98A7B7303437B12C; _lxsdk_cuid=18aa771d411c8-0d1e3cae321997-78505774-16e360-18aa771d411c8; _lxsdk=E13BF08055FE11EEBD786D8B351D1BEB69DF40D3DDA545AF98A7B7303437B12C; _csrf=db721765308a1674e7cba19e5867dbf60fb59d96717ba3913e47e499ab4726c0; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1695026370,1695086835; _lx_utm=utm_source%3Dbing%26utm_medium%3Dorganic; __mta=142521997.1695026370028.1695087067256.1695090012297.10; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1695106030; _lxsdk_s=18aac26180a-d75-779-1ac%7C%7C9',
            'Host': 'www.maoyan.com',
            'Referer': 'https://www.maoyan.com/board?requestCode=1d65f67ff588bed531de86b8d57274ac6pskg',
            'Sec-Ch-Ua': '"Microsoft Edge"; v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'Sec-Ch-Ua-Mobile': '?0',
            'Sec-Ch-Ua-Platform': '"Windows"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31'
        }
        # Timeout added so a stalled connection cannot hang the whole crawl.
        res = requests.get(url=url, headers=headers, timeout=10)
        self.parse_html(res.text)

    def parse_html(self, html):
        """Extract (title, stars, release-date) tuples from page HTML and save them."""
        # re.S lets '.' match newlines, because each movie entry in the
        # board markup spans multiple lines.
        re_bds = '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>'
        pattern = re.compile(re_bds, re.S)
        r_list = pattern.findall(html)
        self.save_html(r_list)

    def save_html(self, r_list):
        """Append the extracted rows to ``maoyan.csv`` (one row per movie)."""
        with open('maoyan.csv', 'a', newline='', encoding="utf-8") as f:
            writer = csv.writer(f)
            for r in r_list:
                name = r[0].strip()
                # Drop the 3-character "主演:" label prefix.
                star = r[1].strip()[3:]
                # Drop the 5-character "上映时间:" prefix and keep the
                # 10-character date, e.g. "2018-07-05".
                # Renamed from ``time`` — the old name shadowed the time module.
                release_date = r[2].strip()[5:15]
                writer.writerow([name, star, release_date])
                print(name, release_date, star)

    def run(self):
        """Crawl the board pages (offsets 0..60 — seven pages of ten movies)."""
        for offset in range(0, 61, 10):
            url = self.url.format(offset)
            self.get_html(url)
            # Random 1-2 second pause between pages to look less bot-like.
            time.sleep(random.uniform(1, 2))

 # 以脚本方式启动
if __name__ == '__main__':
    # 捕捉异常错误
    try:
        spider = MaoyanSpider()
        spider.run()
    except Exception as e:
         print("错误:", e)