python 爬取原力文档日语学习资料

发布时间 2023-06-18 21:10:51 作者: 裏表異体

参考

https://blog.csdn.net/weixin_46184311/article/details/115291441

代码

import requests, json, re, time, urllib.request
import time
import wget


def getParameter(url):
    """Fetch a document page and extract the preview parameters embedded in it.

    The request uses the module-level ``headers`` dict, which must be
    defined before this function is called.

    Args:
        url: Address of the document page to download.

    Returns:
        A tuple ``(actual_page, aid, view_token)``: the page count as an
        ``int`` followed by the two captured strings.

    Raises:
        ValueError: If one of the three fields is not present in the page text.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    text_response = requests.get(url=url, headers=headers, timeout=30).text

    # Raw strings so that ``\d`` is a regex token rather than an invalid
    # string escape. The Chinese text is part of the page markup being matched.
    patterns = (
        ('actual_page', r'actual_page: (\d+), //真实页数'),
        ('aid', r'aid: (\d+), //解密后的id'),
        ('view_token', r"view_token: '(.*?)'"),
    )
    found = {}
    for field, pattern in patterns:
        match = re.search(pattern, text_response)
        if match is None:
            raise ValueError(f'{field} not found in page: {url}')
        found[field] = match.group(1)

    actual_page = int(found['actual_page'])
    aid = found['aid']
    view_token = found['view_token']
    print('actual_page:', actual_page, '\naid:', aid, '\nview_token:', view_token)
    return actual_page, aid, view_token


def requests_data(parameter, page):
    """Request preview data for ``page`` and return the decoded ``data`` field.

    The request uses the module-level ``headers`` dict, which must be
    defined before this function is called.

    Args:
        parameter: Sequence as returned by ``getParameter``; index 1 is sent
            as ``aid`` and index 2 as ``view_token``.
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The value stored under the ``'data'`` key of the JSON payload.

    Raises:
        ValueError: If the response does not contain a ``jsonpReturn(...);``
            wrapper.
    """
    url = 'https://openapi.book118.com/getPreview.html'
    params = {
        'project_id': '1',
        'aid': parameter[1],
        'view_token': parameter[2],
        'page': page,
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(url=url, headers=headers, params=params, timeout=30).text

    # The endpoint answers with JSONP; strip the callback wrapper to get at
    # the JSON text. Raw string so the escaped parentheses are regex tokens.
    match = re.search(r'jsonpReturn\((.*?)\);', response)
    if match is None:
        raise ValueError(f'no jsonpReturn payload in response for page {page}')
    return json.loads(match.group(1))['data']



if __name__ == '__main__':
    # Script entry point: read the document parameters, then download every
    # preview image into the local ``imgs`` directory.
    import os

    # ``headers`` must stay a module-level name: getParameter() and
    # requests_data() read it as a global.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    text_url = 'https://max.book118.com/html/2023/0208/6230222112005044.shtm'
    parameter = getParameter(text_url)
    print(parameter)

    # wget does not create missing directories, so make sure the target exists.
    os.makedirs('imgs', exist_ok=True)

    # NOTE(review): the step of 6 presumably matches the number of pages each
    # getPreview response covers -- confirm against the API.
    for page in range(1, parameter[0]+1, 6):
        print(page)
        result = requests_data(parameter, page)
        print(result)
        for page_id, url in result.items():
            if not url:
                # An empty entry would turn into the bare string 'https:',
                # which cannot be downloaded.
                print(f'skipping: {page_id}, no url returned')
                continue
            url = 'https:'+url
            print(f'downloading: {page_id}, {url} ')
            wget.download(url=url, out=f'imgs/{page_id}.png')
        # Pause between batches to avoid hammering the server.
        time.sleep(1)