# python爬虫示例-2 (Python web-scraper example, part 2)
#
# Published 2024-01-08 20:41:27; author: 右眼与明天
import time
import os
import requests as re
from tqdm import tqdm
from bs4 import BeautifulSoup

download_src = "https://m.tuiimg.com/"  # base URL of the gallery site to scrape
now_file =os.getcwd()   # current working directory; used as the download root

def create_file(path):
    """Ensure *path* exists as a directory and return its absolute path.

    Args:
        path: Directory to create if it does not already exist.

    Returns:
        The absolute form of *path*.
    """
    if not os.path.exists(path):
        # Bug fix: the original called os.makdir, which does not exist
        # (AttributeError at runtime). os.makedirs also creates any missing
        # intermediate directories.
        os.makedirs(path)
    else:
        print('文件已存在')
    return os.path.abspath(path)

def download(path, download_src):
    """Scrape the gallery index page and download every image set into *path*.

    For each gallery listed on the index page, a subfolder named after the
    gallery id is created under *path* and its images are saved as 1.jpg,
    2.jpg, ... Galleries whose folder already exists are skipped.

    Args:
        path: Root directory for the downloaded gallery folders.
        download_src: URL of the gallery index page.
    """
    response = re.get(download_src, timeout=5)
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'html.parser')
    # All gallery entries live in <li> tags under <ul id="main" class="main">.
    items = soup.find('ul', {'id': 'main', 'class': 'main'}).find_all('li')

    src = []  # real image URLs, one per gallery
    for item in items:
        img_tag = item.find('img')
        # `src` is a lazy-load placeholder; the real URL is in `realsrc`.
        # Use .get() so a missing attribute does not raise KeyError
        # (the original indexed attrs['src'] directly and compared != None).
        if img_tag is not None and img_tag.attrs.get('src') is not None:
            src.append(img_tag.attrs['realsrc'])

    progress = tqdm(src, ncols=100, desc='文件下载进度', colour='#96b97d')
    for x in progress:
        # x[-11:-6] is the 5-character gallery id embedded in the image URL.
        gallery_id = x[-11:-6]
        page = re.get('https://m.tuiimg.com/meinv/' + gallery_id)
        page.encoding = page.apparent_encoding
        page_soup = BeautifulSoup(page.text, 'html.parser')
        # The "show all" button text carries the gallery's total image count,
        # e.g. <i id="allbtn">全部(30)</i> -> str(...)[-7:-5] == "30".
        all_btn = page_soup.find('span', {'class': 'all'}).find('i', {'id': 'allbtn'})
        img_num = str(all_btn)[-7:-5]
        folder_path = os.path.join(path, gallery_id)
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
            # Bug fix: the original used range(1, int(img_num)), which always
            # skipped the gallery's last image; +1 makes the range inclusive.
            for i in range(1, int(img_num) + 1):
                img_url = x[:-6] + str(i) + '.jpg'
                try:
                    img_resp = re.get(img_url, timeout=5)
                except Exception:
                    # Best-effort download: skip images that fail to fetch
                    # (the original used a bare except; keep the behavior but
                    # stop swallowing SystemExit/KeyboardInterrupt).
                    continue
                with open(os.path.join(folder_path, f'{i}.jpg'), 'wb') as fp:
                    fp.write(img_resp.content)
        time.sleep(0.1)  # be polite to the server between galleries


if __name__ == "__main__":    
    file_path = create_file(now_file)
    print(f'当前文件路径{file_path}')

    download(file_path,download_src)