网上看了不少代码,但都没能运行成功,于是自己动手改造了一份。
发出来给自己做个记录,如果能给其他人带来启发,那就更好了。
import os
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from requests.exceptions import RequestException


def get_html(url, headers, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Absolute URL of the page to download.
        headers: Mapping of HTTP request headers sent with the request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status or any ``requests`` exception).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None


path = r"C:\Users\xiaochao\movie"  # directory the Excel file is saved in
filename = 'movies.xlsx'  # name of the Excel file
base_url = 'https://www.maoyan.com/'  # base address used to build absolute links

# The ten board pages: offset=0, 10, ..., 90.
url_lists = ['https://www.maoyan.com/board/4?offset={}'.format(i * 10) for i in range(0, 10)]

# Request headers. The Cookie in particular is required; without it the
# downloaded HTML still contains JavaScript code instead of the board markup.
# NOTE(review): the Cookie value looks session-specific and will presumably
# expire -- replace it with a fresh one from your own browser before running.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    'Cookie': '__mta=252622390.1681305679407.1681308687092.1681309131766.17; uuid_n_v=v1; uuid=E8D36BD0D93411EDB5DB33031B0079F30A3B137EF97F43D78B7CA70E897828F5; _csrf=15331524ca63b785430f8ebfba1c6bbba94e381c2e1c6c4305fa6e8fb23f3e92; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=18775a0b463ad-0837c20e7a4c2-26031b51-144000-18775a0b464c8; _lxsdk=E8D36BD0D93411EDB5DB33031B0079F30A3B137EF97F43D78B7CA70E897828F5; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1681305679; __mta=252622390.1681305679407.1681305679407.1681305682326.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1681309132; _lxsdk_s=18775a0b464-c50-1f0-84c%7C%7C36',
    'Host': 'www.maoyan.com',
    'Referer': 'https://www.maoyan.com/board/4',
}

# Regular expression capturing the ranking number.
num_pattern = re.compile(r'<i class="board-index board-index-\d+">(?P<num>\d+)</i>')
# Regular expression capturing the movie title.
name_pattern = re.compile(r'title="(?P<name>[^"]+)" class=')
# Regular expression capturing the relative link of the movie page.
url_pattern = re.compile(r'"name"><a href="(?P<href>/films/\d+)"')


def parse_board_page(html):
    """Extract the movie records from one board page.

    Args:
        html: HTML text of a board page; ``None`` or an empty string is
            treated as a page without any movies.

    Returns:
        A list of dicts with the keys ``'排名'`` (rank), ``'电影名称'`` (title)
        and ``'电影链接'`` (absolute link), in page order.
    """
    if not html:
        return []

    ranks = num_pattern.findall(html)
    names = name_pattern.findall(html)
    links = url_pattern.findall(html)

    # The three lists are paired positionally. If the page layout changed and
    # the counts differ, keep only the complete records instead of raising
    # IndexError half-way through the page.
    if not len(ranks) == len(names) == len(links):
        print('Warning: match counts differ (rank={}, name={}, link={}); '
              'keeping only complete records'.format(len(ranks), len(names), len(links)))

    return [
        {
            '排名': rank,
            '电影名称': name,
            # Turn the relative link into an absolute one.
            '电影链接': urljoin(base_url, link),
        }
        for rank, name, link in zip(ranks, names, links)
    ]


def main():
    """Download every board page, collect the movies and write them to Excel."""
    all_data = []  # movie records gathered from all pages
    for url in url_lists:
        print(url)
        html = get_html(url, headers)
        if html is None:
            # get_html signals failure with None; skip the page rather than
            # passing None to the regex functions.
            print('Skipping {}: request failed or returned a non-200 status'.format(url))
            continue
        all_data.extend(parse_board_page(html))

    df = pd.DataFrame(all_data)

    # Create the target directory; exist_ok avoids the check-then-create race.
    os.makedirs(path, exist_ok=True)

    # Write the collected data to the Excel file.
    df.to_excel(os.path.join(path, filename), index=False)
    print("数据已成功写入指定Excel文件")


if __name__ == "__main__":
    main()