Some techniques for scraping via packet capture

Published 2023-07-28 14:49:00, author: hacker_dvd
import hashlib
import os
import time

import requests
from fake_useragent import UserAgent

def set_hash(sr):
    # Map a string (here: an image URL) to its MD5 hex digest,
    # used below as a filesystem-safe filename.
    md5 = hashlib.md5()
    md5.update(sr.encode('utf-8'))
    return md5.hexdigest()
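Hashing the URL gives a stable filename: the same URL always maps to the same 32-character hex digest, so re-running the script overwrites an image instead of duplicating it. A quick check (the URL here is just an example):

print(set_hash('https://example.com/banner.jpg'))  # same digest on every run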

os.makedirs('static/img', exist_ok=True)  # the download path must exist before writing

for page in range(1, 11):
    # This URL was found via packet capture: scroll the room list to the bottom,
    # clear all captured requests in the browser's Network panel, then click
    # "next page" so the pagination request stands out on its own.
    url = f'https://www.douyu.com/gapi/rkc/directory/mixList/2_1/{page}'
    res = requests.get(url, headers={'User-Agent': UserAgent().random})
    ls = res.json()['data']['rl']  # the Preview tab shows the response is JSON
    for dt in ls:
        title = dt['rn']  # room title
        if not title:
            continue
        author = dt['nn']  # streamer name
        level = dt['od']
        hot_num = dt['ol']  # popularity count
        banner_url = dt['rs16']  # banner image URL
        img_res = requests.get(banner_url, headers={'User-Agent': UserAgent().random})
        img_name = f'{set_hash(banner_url)}.jpg'
        with open(f'static/img/{img_name}', 'wb') as f:
            f.write(img_res.content)
        print(f'{img_name} downloaded')
        
    print(f'page {page} scraped')
    time.sleep(1)
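The script sleeps a fixed second between pages. A common refinement when hitting a paginated API like this is to randomize the delay and retry failed requests; the helper below is a minimal sketch (fetch_json, the retry count, and the timeout are my own naming and defaults, not part of the original script):

import random
import time

import requests
from fake_useragent import UserAgent

def fetch_json(url, retries=3):
    # Try a few times, pausing a random 1-3 s between attempts so the
    # request pattern is less mechanical than a fixed sleep.
    for _ in range(retries):
        try:
            res = requests.get(url, headers={'User-Agent': UserAgent().random}, timeout=10)
            res.raise_for_status()  # treat non-2xx responses as failures
            return res.json()
        except requests.RequestException:
            time.sleep(random.uniform(1, 3))
    raise RuntimeError(f'failed to fetch {url} after {retries} attempts')

The main loop could then call fetch_json(url)['data']['rl'] in place of the bare requests.get.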
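The fields extracted above (title, author, level, hot_num) are collected but never stored. If the goal is to persist them in MySQL, pymysql is one common choice; an insert inside the inner loop might look like the sketch below, where the connection parameters, database name, and rooms table are all hypothetical:

import pymysql

# Hypothetical connection; adjust host/user/password/database to your setup.
conn = pymysql.connect(host='localhost', user='root', password='secret',
                       database='douyu', charset='utf8mb4')
with conn.cursor() as cur:
    cur.execute(
        'INSERT INTO rooms (title, author, level, hot_num, img_name) '
        'VALUES (%s, %s, %s, %s, %s)',
        (title, author, level, hot_num, img_name),
    )
conn.commit()
conn.close()

In practice the connection would be opened once before the page loop and committed once per page, rather than reconnecting for every row.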