Self-Taught Python Web Scraping Notes (Day 7)

Published 2023-04-15 16:43:42 by Peom`

Environment: Python 3.9 or later; IDE: PyCharm

 

Advanced usage of requests:

Case 1: Simulating user login - handling cookies:

# Log in -> get a cookie
# Take that cookie along to request the bookshelf URL -> the contents of the bookshelf

# The two steps above must be chained together
# We can do this with a session -> think of a session as a series of requests in which the cookies are never lost
import requests

# Session: keeps cookies across requests automatically
session = requests.session()
data = {
    "loginName": "18975575097",
    "password": "hy.1211"
}

# 1. Log in; the cookie from the response is stored on the session
url = "https://passport.17k.com/ck/user/login"
resp = session.post(url, data=data)
# print(resp.text)
# print(resp.cookies)       # inspect the cookies

# 2. Fetch the bookshelf data
# The session we just used already carries the login cookie
resp = session.get('https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919')

print(resp.json())
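After the login post, the cookies the server set can be inspected on the session itself. A quick sanity check, assuming the login above succeeded:

# The session's cookie jar now holds whatever the login response set
for cookie in session.cookies:
    print(cookie.name, "=", cookie.value)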


# Brute-force approach: copy the whole Cookie header from a logged-in browser session
resp = requests.get("https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919", headers={
    "Cookie": "GUID=69519599-f002-43d3-a75d-629c3a6a99d9; sajssdk_2015_cross_new_user=1; __bid_n=187839078a08ad067b4207; Hm_lvt_9793f42b498361373512340937deb2a0=1681539496; FPTOKEN=qhSCogWrXfGuSVgVyrv5ugmyHMCk09hR09J4vwLe0ciErADRvtr5jcAk+9ZVtxemgBMe0YtpS/FNlGnyQjBBy86M33gU8fqmjxDEgV51pvABJY4f3EmMoOFVL1jVracFtg91N7m6HmRPjTrLYkYm40zgHeQF4inRorhmupWootM/w3CRy6ccjOjXZWnLmMIDuo7CFnjR8ooJNXNNLQmgNLl/Ft8AoPAjLi+uyKrFphfCrrcCw6WWZKp9AqXPkd/nE5z9VUSuebEcGOC8d8ZV0+JwhAOq5OupVvsqmC0AGKEVhfEO4rxrC7s2OkffXD3ZA472EdyMm1LDtupi42vNCu0t9zEOVz/6hbY3+CHJuf94fatzLwcqC9IrTL+UgoTLpvt5G/ozsAljI9p8EiC3NQ==|Q2G++K0W9yBUzbXcPpwV+HwjFl0HOP4N1PLWtwuBdOU=|10|f8d67ccba53f0bb3f3fdc404f8182129; c_channel=0; c_csc=web; BAIDU_SSP_lcr=https://api.weibo.com/; accessToken=avatarUrl%3Dhttps%253A%252F%252Fcdn.static.17k.com%252Fuser%252Favatar%252F08%252F28%252F67%252F100156728.jpg-88x88%253Fv%253D1681540416000%26id%3D100156728%26nickname%3D%25E4%25B9%25A6%25E5%258F%258B6K7GZ6La3%26e%3D1697092533%26s%3D62ff98a0444d1c29; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22100156728%22%2C%22%24device_id%22%3A%22187839078371431-02353e314bf148-26031b51-2073600-1878390783892a%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%A4%BE%E4%BA%A4%E7%BD%91%E7%AB%99%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fapi.weibo.com%2F%22%2C%22%24latest_referrer_host%22%3A%22api.weibo.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%2C%22first_id%22%3A%2269519599-f002-43d3-a75d-629c3a6a99d9%22%7D; Hm_lpvt_9793f42b498361373512340937deb2a0=1681540866"
})
print(resp.text)
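Instead of pasting the raw Cookie header, requests can also take the cookies as a dict via the cookies= parameter. A minimal sketch; the values below are placeholders, not real credentials:

import requests

# Hypothetical values; substitute the ones from your own logged-in browser session
cookies = {
    "GUID": "your-guid-here",
    "accessToken": "your-access-token-here",
}
resp = requests.get(
    "https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919",
    cookies=cookies,  # requests builds the Cookie header from this dict
)
print(resp.json())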

Case 2: Handling anti-hotlinking - scraping Pear Video:

# 1. Get the contId
# 2. Get the JSON returned by videoStatus -> srcUrl
# 3. Fix up the contents of srcUrl
# 4. Download the video
import requests

# Page URL of the video to fetch
url = "https://pearvideo.com/video_1721911"
contId = url.split('_')[1]  # the part after "video_" is the contId

videoStatus = f"https://pearvideo.com/videoStatus.jsp?contId={contId}&mrd=0.12818526288984744"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    # Anti-hotlink check: Referer traces the source, i.e. which page this request came from
    "Referer": url
}

resp = requests.get(videoStatus, headers=headers)
dic = resp.json()
srcUrl = dic['videoInfo']['videos']['srcUrl']
systemTime = dic['systemTime']
# The returned srcUrl contains a timestamp segment; swap it for "cont-<contId>" to get the real video URL
srcUrl = srcUrl.replace(systemTime, f'cont-{contId}')
# print(srcUrl)

# Download the video
with open("a.mp4", mode="wb") as f:
    f.write(requests.get(srcUrl).content)
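For a large video, .content buffers the whole file in memory first. A streamed download is one alternative; a sketch, reusing the srcUrl and headers from above:

# Stream the response so the video is written in chunks instead of held in memory
with requests.get(srcUrl, headers=headers, stream=True) as video_resp:
    with open("a.mp4", mode="wb") as f:
        for chunk in video_resp.iter_content(chunk_size=64 * 1024):
            f.write(chunk)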

Case 3: Proxies:

# A proxy routes your request through a third-party machine
# Drawbacks of proxies:
#   1. They are slow
#   2. Good proxy IPs are hard to find
import requests

# https://www.kuaidaili.com/free/intr/

url = "https://www.baidu.com"

# Proxy settings (a free proxy from the list above; it may well be dead by now)
proxy = {
    # For a plain HTTP proxy, the proxy URL scheme stays http:// even for https targets
    "http": "http://114.233.70.231:9000",
    "https": "http://114.233.70.231:9000"
}

# The proxies= parameter routes the request through the proxy
resp = requests.get(url, proxies=proxy)
resp.encoding = "utf-8"
print(resp.text)
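To confirm that traffic really goes through the proxy, one option is to hit an IP-echo service. A minimal sketch, using httpbin.org (any echo service works):

import requests

proxy = {
    "http": "http://114.233.70.231:9000",
    "https": "http://114.233.70.231:9000",
}
# httpbin.org/ip echoes back the IP address the request arrived from
resp = requests.get("http://httpbin.org/ip", proxies=proxy, timeout=10)
print(resp.json())  # should show the proxy's IP, not yours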

Case 4: Using a third-party proxy service:

import requests

# This program still needs work: the proxy IPs may run out, and I don't know what happens then!!
def get_ip():
    url = "……"  # the proxy provider's API URL (elided)
    resp = requests.get(url)
    ips = resp.json()
    for ip in ips['data']['proxy_list']:  # take each IP
        yield ip  # hand the proxy IPs back one at a time


def spider():
    url = "https://www.baidu.com"
    while True:
        try:
            proxy_ip = next(gen)  # take the next proxy IP (raises StopIteration once they run out)
            proxy = {
                "http": "http://" + proxy_ip,
                "https": "http://" + proxy_ip,
            }
            resp = requests.get(url, proxies=proxy)
            resp.encoding = 'utf-8'
            return resp.text
        except requests.RequestException:
            # Only catch request errors; a bare except would also swallow StopIteration and loop forever
            print("Request failed, trying the next proxy.")


if __name__ == "__main__":
    gen = get_ip()  # gen is a generator that yields proxy IPs
    for i in range(10):
        spider()
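The worry above about the proxy IPs running out can be sidestepped by cycling over the batch instead of consuming it once. A sketch, assuming the same provider response shape ('data' -> 'proxy_list'):

import itertools
import requests

def get_ip_cycled():
    url = "……"  # the proxy provider's API URL (elided, as above)
    resp = requests.get(url)
    ips = resp.json()
    # itertools.cycle repeats the batch endlessly, so next() never raises StopIteration
    return itertools.cycle(ips['data']['proxy_list'])

This only reuses the same batch, though; a real rotator would re-fetch fresh IPs once the batch goes stale.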

 

OK, that wraps up the basics. They still need plenty of consolidating, though; the brain-melting stuff comes next!