Python爬虫爬取wallhaven.cc图片

发布时间 2023-11-28 11:41:48作者: 酷酷的方

话不多说,直接上代码!

import os
import random
import re
import threading
import time
import uuid
from concurrent.futures import ThreadPoolExecutor
from queue import Empty, Queue

import requests
from lxml import etree
from tqdm import tqdm
12 
# Shared work queue of [image_url, title] pairs; bounded at 300 so
# producers block (back-pressure) when the consumers fall behind.
q = Queue(maxsize=300)
# Thread pool that parses the search-result pages concurrently.
pool = ThreadPoolExecutor(max_workers=10)

# Desktop-Chrome User-Agent so the site serves its normal HTML pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 '
                  'Safari/537.36 '
}
22 
23 
# Thread-pool worker: parse one search-result page, follow every preview
# link and push [image URL, title] pairs onto the shared queue.
def utl_image(url, cursor):
    """Scrape one wallhaven search page and enqueue its wallpapers.

    Args:
        url: search-result page URL to fetch.
        cursor: worker index, used only in log output.
    """
    # 1. Fetch the listing page. A timeout keeps a dead connection from
    # hanging this pool worker forever; raise_for_status surfaces HTTP errors.
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding
    # 2. Extract the links to the individual wallpaper pages.
    tree = etree.HTML(resp.text)
    hrefs = tree.xpath('//a[@class="preview"]/@href')
    print(f'解析到图片{len(hrefs)}张!')
    for a_href in tqdm(hrefs, desc='hrefs,插入队列'):
        # Randomized delay so we don't hammer the server.
        time.sleep(random.randint(1, 5))
        try:
            resp_ah = requests.get(str(a_href), headers=headers, timeout=30)
            resp_ah.raise_for_status()
        except requests.RequestException as e:
            # One broken link should not kill the whole page's worker.
            print(f'线程{cursor},a_href:{a_href},请求失败:{e}')
            continue
        resp_ah.encoding = resp_ah.apparent_encoding
        tre = etree.HTML(resp_ah.text)
        img_list = tre.xpath('//img[@id="wallpaper"]')
        print(f'线程{cursor},a_href:{a_href},img_list:{img_list}')
        for img in img_list:
            # Guard the xpath results instead of indexing [0] blindly,
            # which raised IndexError when src/alt was absent.
            src = img.xpath('./@src')
            alt = img.xpath('./@alt')
            if src:
                q.put([src[0], alt[0] if alt else ''])
49 
50 
# Consumer thread: pop [image URL, title] pairs off the shared queue and
# save each image into the local "wallhaven" directory.
def image_save():
    """Download queued images until the queue stays empty for 3 minutes."""
    thread_name = uuid.uuid1()
    count = 1
    path = "wallhaven"
    # Create the output directory once, up front; exist_ok avoids the old
    # bare try/except around makedirs.
    os.makedirs(path, exist_ok=True)
    while True:
        print(f'队列【消费】,当前队列消息总数:{q.qsize()}')
        print(f'线程{thread_name},开始消费第{count}条消息!')
        try:
            # Wait up to 180s for producers; previously the Empty exception
            # escaped and killed the thread with a traceback.
            img_src, img_alt = q.get(timeout=180)
        except Empty:
            print(f'线程{thread_name},队列超时为空,退出!')
            return
        # Fetch the image bytes; timeout so one stuck download can't hang us.
        r = requests.get(url=str(img_src), headers=headers, timeout=60)
        if r.status_code == 200:
            print(f"{img_alt}下载[start]")
            # Titles often contain characters illegal in file names
            # (/:*?"<>| etc.) — replace them, then cap the length.
            safe_name = re.sub(r'[\\/:*?"<>|]', '_', img_alt)[:40]
            # "with" guarantees the file handle is closed even on failure.
            with open(f"{path}/{safe_name}.jpg", 'wb') as f:
                f.write(r.content)
            print(f"{img_alt}下载[ok]")
            time.sleep(random.randint(1, 3))
        count += 1
75 
76 
if __name__ == '__main__':
    # 1. Build the list of search pages to scrape. range() is half-open,
    # so range(1, 2) means page 1 only.
    url = 'https://wallhaven.cc/search?categories=001&purity=100&ratios=landscape&topRange=1y&sorting=toplist&order=desc&ai_art_filter=1&page='
    url_list = [f'{url}{i}' for i in range(1, 2)]
    # Hand each listing page to the thread pool; enumerate replaces the
    # old manual i = 1 / i = i + 1 counter.
    for i, ul in enumerate(tqdm(url_list, desc='线程池'), start=1):
        print(f'ul:{ul}')
        pool.submit(utl_image, ul, i)
        time.sleep(1)
    # Consumer threads: the range count is the number of download threads.
    for _ in range(1):
        t = threading.Thread(target=image_save)
        t.start()

By the way:
4K-8k资源分享:https://www.cnblogs.com/kukuDF/p/15989961.html