Scraping phone wallpapers with Python

Posted: 2023-09-27 15:22:48  Author: Airgity

Just messing around out of boredom. A fully working crawl is still a long way off, and the code is messy and could use some cleanup, but I'm writing it down here and will revise it when I get the chance.

import requests
import os
from bs4 import BeautifulSoup
from requests.packages import urllib3
import random
import threading
import time

# Suppress the InsecureRequestWarning triggered by verify=False below
urllib3.disable_warnings()


# Page range to crawl (inclusive)
start_page = 1
end_page = 1

if not os.path.exists("gq_sjbz"):
    os.makedirs("gq_sjbz")

base_url = "https://www.3gbizhi.com/sjbz/index_{}.html"


# Image URLs whose download timed out or failed; they are retried at the end
time_out_urls = []

def crawl_page(page):
    # Crawl one listing page and download the preview image of every wallpaper it links to
    url = base_url.format(page)
    try:
        user_agent = random.choice(
            ['Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
             'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36',
             'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36'])
        # If you want to use an IP/proxy pool, you'll have to buy one yourself
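        # (Untested sketch, not wired in: requests can route through a proxy via its
        #  `proxies` argument, e.g.
        #      proxies = {"http": "http://host:port", "https": "http://host:port"}
        #      requests.get(url, headers=headers, proxies=proxies, ...)
        #  "host:port" is a placeholder here, not a working proxy.)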
        headers = {'User-Agent': user_agent}
        resp = requests.get(url, headers=headers, verify=False, timeout=20)
    except requests.exceptions.RequestException as e:
        # catch timeouts/connection errors; requests only raises HTTPError via raise_for_status()
        print(f"Request failed: {e}")
        return

    soup = BeautifulSoup(resp.text, 'html.parser')
    ul_element = soup.select("div.contlistw ul.cl")

    for ul in ul_element:
        a_href_s = ul.find_all('a', href=True)
        for a_href in a_href_s:
            href = a_href['href']
            try:
                resp2 = requests.get(href, headers=headers, verify=False, timeout=20)
            except requests.exceptions.RequestException:
                continue  # skip this detail page if the request fails
            soup2 = BeautifulSoup(resp2.text, 'html.parser')

            # TODO: at the moment this only downloads the preview image shown on the detail
            #       page, not the full-resolution original; downloading the original is the next step
            # TODO: the hard part is that the original has to be fetched through an API, e.g.
            #  {
            #       "file": "/api/user/imageDownload?downconfig=e03UPLG76erry5Fo6ZT7Zw%3D%3D3gbizhiComgV1S3%2BO8DlxWKbNOuZ7BLw%3D%3D&op=file&picnum=1&captcha=267",
            #        "zip": "/api/user/imageDownload?downconfig=e03UPLG76erry5Fo6ZT7Zw%3D%3D3gbizhiComgV1S3%2BO8DlxWKbNOuZ7BLw%3D%3D&op=zip&picnum=1&captcha=267"
            #  }
            #  and the CAPTCHA check has to be bypassed
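            #  A rough, untested sketch of that step, assuming the JSON above has already
            #  been obtained and its captcha value is accepted (neither is handled here):
            #      download_cfg = {...}  # the JSON shown above
            #      file_url = "https://www.3gbizhi.com" + download_cfg["file"]
            #      original = requests.get(file_url, headers=headers, verify=False, timeout=20)
            #      # original.content would then (presumably) be the full-resolution image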

            ul_element2 = soup2.select("div.img-table img#contpic")
            for ul2 in ul_element2:
                img_url = ul2['src']
                img_name = os.path.basename(img_url)
                try:
                    img_resp = requests.get(img_url, verify=False, timeout=3)
                    if img_resp.status_code == 200:
                        with open("gq_sjbz/" + img_name, "wb") as img_file:
                            img_file.write(img_resp.content)
                        print(f"Downloaded image: {img_url}    timed out so far: {len(time_out_urls)}")
                    else:
                        time_out_urls.append(img_url)
                except requests.exceptions.RequestException:
                    time_out_urls.append(img_url)
    print(f"Finished crawling page {page}")


# Crawl a contiguous range of pages; each batch runs in its own thread
def crawl_batch(start, end):
    for page in range(start, end + 1):
        crawl_page(page)


# Number of pages each batch (i.e. each thread) crawls
batch_size = 20


# Split the page range into batches and crawl them in parallel
def run_threads():
    threads = []
    for batch_start in range(start_page, end_page + 1, batch_size):
        batch_end = min(batch_start + batch_size - 1, end_page)
        thread = threading.Thread(target=crawl_batch, args=(batch_start, batch_end))
        threads.append(thread)
        thread.start()

    # Wait for all threads to finish
    for thread in threads:
        thread.join()


# Start the multithreaded crawl
run_threads()
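# (Side note, untested sketch: the manual batching above could also be written with
#  concurrent.futures, which manages the worker threads itself:
#      from concurrent.futures import ThreadPoolExecutor
#      with ThreadPoolExecutor(max_workers=8) as pool:
#          pool.map(crawl_page, range(start_page, end_page + 1))
#  max_workers=8 is an arbitrary choice, not taken from the script above.)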


def time_out_urls_download(max_download_time):
    # Retry the downloads that timed out or failed, for at most max_download_time seconds
    start_time = time.time()  # record the start time
    while len(time_out_urls) > 0:
        img_url = time_out_urls.pop(0)  # take the first URL off the queue
        img_name = os.path.basename(img_url)
        try:
            img_resp = requests.get(img_url, verify=False, timeout=3)
            if img_resp.status_code == 200:
                with open("gq_sjbz/" + img_name, "wb") as img_file:
                    img_file.write(img_resp.content)
                print(f"Downloaded image: {img_url}    remaining: {len(time_out_urls)}")
            else:
                time_out_urls.append(img_url)
        except requests.exceptions.RequestException:
            time_out_urls.append(img_url)

        elapsed_time = time.time() - start_time  # how long we have been retrying
        if elapsed_time > max_download_time:
            print("Still not downloaded:", time_out_urls)
            break  # stop retrying once the time limit is exceeded

# Maximum time to spend retrying, in seconds
max_download_time = 1200  # 1200 seconds = 20 minutes
# Retry the failed downloads
time_out_urls_download(max_download_time)

print("Crawling finished ------------------------------------------------------------------------------")
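A possible cleanup for later: the manual time_out_urls retry list could probably be replaced by requests' built-in retry support (urllib3's Retry mounted on a Session via HTTPAdapter). Below is a minimal, untested sketch; download_with_retries, the retry count and the backoff factor are my own placeholder choices, not part of the script above.

import os
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def download_with_retries(img_urls, out_dir="gq_sjbz"):
    os.makedirs(out_dir, exist_ok=True)
    session = requests.Session()
    # Retry each request a few times with exponential backoff on common transient
    # server errors, instead of re-queueing the URL by hand.
    retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount("https://", HTTPAdapter(max_retries=retry))
    session.mount("http://", HTTPAdapter(max_retries=retry))
    for img_url in img_urls:
        img_name = os.path.basename(img_url)
        try:
            resp = session.get(img_url, verify=False, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Giving up on {img_url}: {e}")
            continue
        if resp.status_code == 200:
            with open(os.path.join(out_dir, img_name), "wb") as img_file:
                img_file.write(resp.content)
            print(f"Downloaded image: {img_url}")

It could then be called as download_with_retries(time_out_urls) in place of time_out_urls_download.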