爬虫记录~(多线程爬取图片)

发布时间 2023-09-28 23:56:05作者: yyiiing

使用 Requests + Re 库以多线程方式爬取亚马逊商城商品图片:以关键词“书包”进行搜索,抓取第 0–2 页搜索结果中的商品图片。

关键词:多线程爬虫程序、商城网站的遍历,链接的查找和访问。巩固搜索接口和翻页处理。

import re
import time
from multiprocessing.dummy import Pool as ThreadPool
from pathlib import Path

import requests
from fake_useragent import UserAgent

#获取图片数据
def get_pagelist(pagenum):
    """Fetch one search-result page and extract all product image URLs.

    Args:
        pagenum: Zero-based page index, sent as the ``page`` field of the
            POST body.

    Returns:
        list[str]: The ``src`` URLs of every ``<img class="s-image">`` tag
        found in the response HTML (empty list if none matched).

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On timeout / connection failure.
    """
    # Pre-built Amazon.cn search URL for the keyword "书包" (backpack).
    url="https://www.amazon.cn/s?k=%E4%B9%A6%E5%8C%85&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&crid=1RAID9NTPCARM&sprefix=%E4%B9%A6%E5%8C%85%2Caps%2C154&ref=nb_sb_noss_1"
    # Random UA per request to reduce the chance of being blocked.
    headers={'User-Agent':UserAgent().random}
    # NOTE(review): this payload looks like a WordPress AJAX pagination body
    # (block_last_random / action=list_publications); it is unlikely that
    # Amazon's search endpoint honors these fields for paging — verify that
    # different `pagenum` values actually return different pages.
    data={
        'block_last_random': 'custom',
        'block_custom': '54',
        'action': 'list_publications',
        'page': pagenum,
    }
    response=requests.post(url=url,data=data,headers=headers,timeout=8)
    # Fail loudly on an HTTP error instead of silently regex-scanning an
    # error/CAPTCHA page and returning an empty list.
    response.raise_for_status()
    html=response.content.decode('utf-8')
    # Non-greedy capture of each product thumbnail's src attribute.
    datamedias=re.findall(r'<img class="s-image" src="(.*?)"',html,re.S)
    return datamedias

#下载图片数据
def dowm(imgurl):
    """Download a single image URL and save it under ``D:\\DATASET\\img``.

    The file name is taken from the last path segment of the URL.

    Args:
        imgurl: Direct URL of the image to download.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On timeout / connection failure.
    """
    # Raw-string path + pathlib: the original non-raw f-string relied on
    # '\D' and '\i' not being escape sequences, which is fragile
    # (DeprecationWarning, SyntaxError in future Python versions).
    save_dir = Path(r'D:\DATASET\img')
    # Create the target directory up front so open() cannot fail on a
    # missing folder.
    save_dir.mkdir(parents=True, exist_ok=True)
    imgname = imgurl.split("/")[-1]
    headers = {'User-Agent': UserAgent().random}
    r = requests.get(url=imgurl, headers=headers, timeout=8)
    # Don't write an HTML error page to disk as if it were image data.
    r.raise_for_status()
    (save_dir / imgname).write_bytes(r.content)
    print(f'{imgname} 图片下载成功了!')

#多线程下载图片数据
def thread_down(imgs):
    """Download all image URLs in *imgs* concurrently via a thread pool.

    Args:
        imgs: Iterable of image URLs; each is passed to ``dowm``.
    """
    # Context manager guarantees the pool is cleaned up even if a download
    # raises; the original leaked worker threads on error. Pool size
    # defaults to os.cpu_count(), which is fine for this I/O-bound job.
    # map() blocks until every download finishes (return value is all
    # None, so it is intentionally discarded).
    with ThreadPool() as pool:
        pool.map(dowm, imgs)
    print("采集所有图片完成!")

#主程序
def main():
    """Crawl result pages 0–2, downloading every image found, and report
    the total elapsed time."""
    started = time.time()
    for page in range(3):
        print(f'正在爬取采集第 {page} 页图片数据..')
        thread_down(get_pagelist(page))
    elapsed = time.time() - started
    print("time taken:", elapsed, "seconds")

if __name__=='__main__':
    main()

代码参考~ https://cloud.tencent.com/developer/article/1826531