PYTHON 简单的网页图片爬虫

发布时间: 2023-09-07 09:48:29  作者: seven1314pp

直接上代码:

'''
简单的网页图片爬虫   
要先安装requests,BeautifulSoup的库  
pip install requests
pip install bs4  (BeautifulSoup: 一个可以从HTML或XML文件中提取数据的Python库)
pip install lxml
'''
from urllib.parse import urljoin

import requests  #导入requests库
from bs4 import BeautifulSoup


def get_htmls(pages=None):
    """Download the listing pages to scrape and return their HTML text.

    Args:
        pages: Iterable of page numbers substituted into the
            ``index_{page}.html`` listing URL. When omitted (or ``None``),
            pages 2, 3 and 4 are fetched.

    Returns:
        A list of HTML strings, one per requested page, in request order.

    Raises:
        requests.HTTPError: If a listing page answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    # A None sentinel replaces the former list default so that no mutable
    # object is created once at definition time and shared between calls.
    if pages is None:
        pages = range(2, 5)

    pages_list = []
    for page in pages:
        url = f"https://pic.netbian.com/4kfengjing/index_{page}.html"
        # requests waits indefinitely unless a timeout is given; bound the
        # wait so one stalled connection cannot hang the whole run.
        response = requests.get(url, timeout=10)
        # Fail loudly rather than handing an error page to the parser as if
        # it were a real listing.
        response.raise_for_status()
        # Force GBK decoding of the body instead of the charset requests guesses.
        response.encoding = 'gbk'
        pages_list.append(response.text)
    return pages_list


def get_picturs(htmls):
    """Download every image referenced in the given listing pages.

    Each HTML document is parsed and the ``<img>`` tags are collected from
    the first ``ul.clearfix`` found inside the first ``div.slist`` inside the
    first ``div#main``. Every image is saved in the current directory as
    ``partice05<alt>.jpg``, where ``<alt>`` is the tag's ``alt`` text with
    spaces replaced by underscores.

    Args:
        htmls: Iterable of HTML strings, e.g. the result of ``get_htmls``.

    Returns:
        None. Images are written to disk and progress is printed.

    Raises:
        requests.RequestException: On connection failure or timeout.
        KeyError: If an ``<img>`` tag lacks an ``alt`` or ``src`` attribute.
    """
    for html in htmls:
        soup = BeautifulSoup(html, 'html.parser')

        # Descend one level at a time so that a page without the expected
        # layout is skipped with a message instead of raising an opaque
        # AttributeError on None.
        pic_li = soup.find('div', id='main')
        if pic_li is not None:
            pic_li = pic_li.find('div', class_='slist')
        if pic_li is not None:
            pic_li = pic_li.find('ul', class_='clearfix')
        if pic_li is None:
            print("picture list not found in page, skipped")
            continue

        for file in pic_li.find_all('img'):
            # NOTE(review): the prefix is concatenated without a path
            # separator, so files are named "partice05<alt>.jpg" in the
            # current directory - confirm a "partice05/" folder was not meant.
            pic_name = './partice05' + file['alt'].replace(" ", '_') + '.jpg'
            # urljoin handles both "/uploads/..." and "uploads/..." without
            # producing a doubled slash, and leaves absolute URLs untouched.
            src = urljoin("https://pic.netbian.com/", file['src'])

            # Bounded wait: requests never times out on its own.
            response = requests.get(src, timeout=10)
            if not response.ok:
                # Do not save an error body under a .jpg name.
                print("picturs download failed ({}):{}".format(
                    response.status_code, src))
                continue

            with open(pic_name, 'wb') as f:
                f.write(response.content)
            print("picturs dowmload in:{}".format(pic_name))


# Fetch listing page 2 only; the result is a list with one HTML string per page.
htmls = get_htmls(pages=[2])
# Parse the fetched pages and download every image they reference.
get_picturs(htmls)