import requests
from fake_useragent import UserAgent
from lxml import etree
import time
import random
import pymysql
import hashlib
def set_hash(sr):
    """Return the hexadecimal MD5 digest of *sr* (encoded as UTF-8)."""
    return hashlib.md5(sr.encode('utf-8')).hexdigest()
import os

# Reuse one UserAgent instance and one HTTP session for the whole run:
# constructing fake_useragent.UserAgent() is expensive (it loads browser
# data on first use), and a Session reuses TCP connections across requests.
_ua = UserAgent()
_session = requests.Session()

# Ensure the image output directory exists before writing into it,
# otherwise open(..., 'wb') raises FileNotFoundError on a fresh checkout.
os.makedirs('static/img', exist_ok=True)

# Crawl pages 1..10 of the Douyu mixed directory listing.
for page in range(1, 11):
    # This URL was captured in browser dev tools: scroll to the bottom,
    # clear all captured requests, then click "next page".
    url = f'https://www.douyu.com/gapi/rkc/directory/mixList/2_1/{page}'
    res = _session.get(url, headers={'User-Agent': _ua.random}, timeout=10)
    ls = res.json()['data']['rl']  # per the dev-tools Preview tab, a JSON list of rooms

    for dt in ls:
        title = dt['rn']  # room title
        if not title:
            # Skip entries without a title.
            continue
        # NOTE(review): author/level/hot_num are extracted but never used here;
        # pymysql is imported at the top of the file, so these were presumably
        # intended for a DB insert that is not implemented — confirm before removing.
        author = dt['nn']    # streamer name
        level = dt['od']
        hot_num = dt['ol']   # popularity / viewer count
        banner_url = dt['rs16']  # room cover image URL

        # Download the cover image; name it by the MD5 of its URL so
        # duplicate URLs map to the same file.
        img_res = _session.get(banner_url, headers={'User-Agent': _ua.random}, timeout=10)
        img_name = f'{set_hash(banner_url)}.jpg'
        with open(f'static/img/{img_name}', 'wb') as f:
            f.write(img_res.content)
        print(f'{img_name}下载成功')

    print(f'第{page}页爬取成功')
    # Throttle between pages to be polite to the server.
    time.sleep(1)
# 抓包爬取的一些措施 (Some techniques for scraping via packet capture)
# 发布时间 2023-07-28 14:49:00  作者: hacker_dvd