# [Crawler] Multi-threaded file download
#
# Published: 2023-11-20 22:56:25 · Author: GGBomb
import json
import os
import re
from concurrent.futures import ThreadPoolExecutor  # thread pool used for the concurrent downloads

import requests
from lxml import etree

def mians(num):
    """Download every file linked from one page of the article listing.

    Fetches the listing page for ``offset=num``, pulls link targets and link
    texts out of the ``<th><a>`` elements, and submits one ``dow`` call per
    (url, name) pair to a thread pool.

    Args:
        num: Value placed in the ``offset`` query parameter of the listing URL.
    """
    url = f"http://www.1o1o.xyz/ctfarticle.asp?offset={num}"
    domain = "http://www.1o1o.xyz/"
    # A timeout keeps one stalled connection from hanging the whole crawl.
    res = requests.get(url, timeout=30)
    res.encoding = "gb2312"
    print(res.text)
    et = etree.HTML(res.text)
    # The trailing matches are dropped exactly as before (one href, two texts).
    pdf_url = et.xpath("//th/a/@href")[0:-1]
    pdf_name = et.xpath("//th/a/text()")[0:-2]
    print(pdf_url)
    print(pdf_name)
    # Pair hrefs with names instead of indexing a fixed range(30): a page
    # with fewer than 30 entries no longer raises IndexError.
    jobs = list(zip(pdf_url, pdf_name))
    # The pool starts its workers and joins them when the block exits.
    with ThreadPoolExecutor(30) as t:
        futures = [t.submit(dow, domain + href, name) for href, name in jobs]
    # All futures are finished here; report failures rather than losing them.
    for future, (href, _name) in zip(futures, jobs):
        err = future.exception()
        if err is not None:
            print(f"download failed: {domain + href}: {err!r}")
    print("下载完成")


# Earlier sequential (single-threaded) version, kept for reference:
# for i in range(30):
#     data=requests.get(domain+pdf_url[i])
#     pdf_content=data.content
#     name=pdf_name[i]
#     with open(f"E:\WP\{name}",'wb') as f:
#         f.write(pdf_content)


def dow(imaurl, name, save_dir=r"E:\WP"):
    """Download one file and write it into ``save_dir``.

    Args:
        imaurl: Absolute URL of the file to fetch.
        name: File name to save under. Characters that Windows forbids in
            file names are replaced with ``_`` and surrounding whitespace
            is stripped.
        save_dir: Destination directory. Defaults to the folder that was
            previously hard-coded; it is created if it does not exist.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    data = requests.get(imaurl, timeout=60)
    # Fail instead of saving an HTML error page under a document's name.
    data.raise_for_status()
    # Link text can contain characters that are illegal in Windows file names.
    safe_name = re.sub(r'[\\/:*?"<>|\r\n\t]', "_", name).strip()
    os.makedirs(save_dir, exist_ok=True)
    # os.path.join replaces the f-string that relied on the invalid escape
    # sequences "\W" and "\{".
    with open(os.path.join(save_dir, safe_name), "wb") as f:
        f.write(data.content)


if __name__ == '__main__':
    # Walk the listing from offset 0 up to (but not including) 12451,
    # advancing 30 entries per request, and process each page in turn.
    offset = 0
    while offset < 12451:
        print(offset)
        mians(offset)
        offset += 30