Scraping the Maoyan Top 100 Chart with Python

Published 2023-04-14 00:26:32 · Author: 18900714155

I tried quite a bit of the code floating around online, but none of it actually worked, so I reworked it myself.

I'm posting it here as a record for myself; if it gives someone else a useful starting point, even better.

import pandas as pd
from urllib.parse import urljoin
from requests.exceptions import RequestException
import requests
import re
import os


# Fetch the HTML source of a page; returns None on a non-200 status or a request error
def get_html(url, headers):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
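
# A slightly hardened variant (an optional sketch, not part of the original script):
# adds a timeout and a few retries so one stalled request cannot hang the whole crawl.
# The 10-second timeout and 3 attempts are arbitrary choices.
def get_html_retry(url, headers, timeout=10, attempts=3):
    for _ in range(attempts):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response.text
        except RequestException:
            pass
    return None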


path = r"C:\Users\xiaochao\movie"   # directory where the output file is saved
filename = 'movies.xlsx'     # name of the output file
base_url = 'https://www.maoyan.com/'  # base address, used to build absolute movie links
all_data = []   # empty list that will hold one record per movie

url_lists = ['https://www.maoyan.com/board/4?offset={}'.format(i*10) for i in range(0, 10)]  # offsets 0, 10, ..., 90 (pages 1-10)
# Set the request headers. The Cookie is essential: without it the scraped response is an
# anti-scraping verification page containing JavaScript instead of the actual chart HTML.
headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
            'Cookie':'__mta=252622390.1681305679407.1681308687092.1681309131766.17; uuid_n_v=v1; uuid=E8D36BD0D93411EDB5DB33031B0079F30A3B137EF97F43D78B7CA70E897828F5; _csrf=15331524ca63b785430f8ebfba1c6bbba94e381c2e1c6c4305fa6e8fb23f3e92; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=18775a0b463ad-0837c20e7a4c2-26031b51-144000-18775a0b464c8; _lxsdk=E8D36BD0D93411EDB5DB33031B0079F30A3B137EF97F43D78B7CA70E897828F5; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1681305679; __mta=252622390.1681305679407.1681305679407.1681305682326.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1681309132; _lxsdk_s=18775a0b464-c50-1f0-84c%7C%7C36',
            'Host': 'www.maoyan.com',
            'Referer':'https://www.maoyan.com/board/4'
}
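
# Optional sanity check: if the Cookie has expired, Maoyan serves the verification page
# instead of the chart, and the "board-index" markers that the regexes below rely on
# will be absent. Uncomment to fail fast (a sketch, not part of the original script):
# probe = get_html('https://www.maoyan.com/board/4', headers)
# assert probe and 'board-index' in probe, 'Cookie may have expired or access is blocked'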

num_pattern = re.compile(r'<i class="board-index board-index-\d+">(?P<num>\d+)</i>')    # regex for the ranking number
name_pattern = re.compile(r'title="(?P<name>[^"]+)" class=')    # regex for the movie title
url_pattern = re.compile(r'"name"><a href="(?P<href>/films/\d+)"')  # regex for the movie detail link
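
# For reference, these patterns target markup of roughly this shape (reconstructed
# from the regexes themselves, so illustrative rather than an exact copy of the page;
# the film id 1234 and the title are placeholders):
#   <i class="board-index board-index-1">1</i>
#   ... title="SomeTitle" class= ...
#   ... class="name"><a href="/films/1234" ...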

for url in url_lists:
    print(url)
    html = get_html(url, headers)
    if html is None:  # skip a page whose request failed instead of crashing on re.findall(None)
        continue
    num_matches = re.findall(num_pattern, html)
    name_matches = re.findall(name_pattern, html)
    url_matches = re.findall(url_pattern, html)

    for i in range(len(num_matches)):  # walk the three match lists in parallel, one record per movie
        data = {
            'Rank': num_matches[i],
            'Title': name_matches[i],
            'Link': urljoin(base_url, url_matches[i])  # join the relative path into an absolute URL
        }
        all_data.append(data)  # collect the record in the all_data list
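
# An equivalent, slightly more compact form of the per-page loop above, using zip
# (shown as a sketch, commented out so it does not run twice):
# for num, name, href in zip(num_matches, name_matches, url_matches):
#     all_data.append({'Rank': num, 'Title': name, 'Link': urljoin(base_url, href)})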


# Build a DataFrame from the collected records
df = pd.DataFrame(all_data)

# Create the output directory if it does not exist
if not os.path.exists(path):
    os.makedirs(path)
# Write the data to an Excel file
df.to_excel(os.path.join(path, filename), index=False)
print("Data successfully written to the Excel file")