批量爬取多分页多张图片

发布时间: 2023-10-01 17:07:26 作者: sgj191024
import os
import urllib.request

from lxml import etree

# Page 1 of the gallery lives at https://sc.chinaz.com/tupian/siwameinvtupian.html;
# later pages add a "_<n>" suffix, as in the page-2 URL below.
# NOTE(review): getTenGirlPhote() assigns its own local `url`, so it never
# reads this module-level value -- confirm nothing else depends on it.
url = 'https://sc.chinaz.com/tupian/siwameinvtupian_2.html'

def getTenGirlPhote(page):
    """Download every listed image from one page of the chinaz gallery.

    The listing page for ``page`` is fetched, the ``<img>`` elements inside
    the ``tupian-list com-img-txt-list`` container are located, and each one
    is saved as ``./imgs/<alt text>.jpg``. The number of images found and
    each image URL are printed as progress output.

    Args:
        page: Page number to fetch. Page 1 has no numeric suffix in its
            URL; every other page uses ``siwameinvtupian_<page>.html``.

    Raises:
        urllib.error.URLError: If the listing page or an image cannot be
            fetched.
        OSError: If the output directory or an image file cannot be
            written.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62',
    }
    if page == 1:
        url = 'https://sc.chinaz.com/tupian/siwameinvtupian.html'
    else:
        url = 'https://sc.chinaz.com/tupian/siwameinvtupian_' + str(page) + '.html'

    request = urllib.request.Request(url=url, headers=headers)
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(request) as res:
        content = res.read().decode('utf-8')

    tree = etree.HTML(content)
    images = tree.xpath('//div[@class="tupian-list com-img-txt-list"]/div/img')

    # Read both attributes from the same <img> element so a URL can never be
    # paired with another image's title; skip elements missing either one.
    candidates = ((img.get('data-original'), img.get('alt')) for img in images)
    downloads = [
        (src, name)
        for src, name in candidates
        if src is not None and name is not None
    ]
    print(len(downloads))

    # urlretrieve does not create missing directories, so make sure the
    # target folder exists before the first download.
    out_dir = './imgs/'
    os.makedirs(out_dir, exist_ok=True)

    for src, name in downloads:
        # The page stores protocol-relative URLs ("//host/path").
        img_url = 'https:' + src
        print(img_url)
        urllib.request.urlretrieve(img_url, out_dir + name + '.jpg')

# Walk gallery pages 1 through 10, downloading each page's images in turn.
for page_number in range(1, 11):
    getTenGirlPhote(page_number)