How to Scrape a Novel with Multiple Threads

Published 2023-09-21 16:12:30  Author: XYDYX
import os
from multiprocessing.dummy import Pool
import time

import requests
import re

def get_url(url):
    html = requests.get(url)
    # The site's pages are GBK-encoded; 'ANSI' is a Windows-only alias,
    # so decode as 'gbk' explicitly for portability
    return html.content.decode('gbk')
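If the hard-coded codec ever turns out to be wrong, requests can detect the charset itself. A minimal variant under that assumption (get_url_safe is a hypothetical name, not part of the original script):

def get_url_safe(url):
    resp = requests.get(url, timeout=10)    # avoid hanging forever on a dead connection
    resp.raise_for_status()                 # surface HTTP errors instead of parsing an error page
    resp.encoding = resp.apparent_encoding  # let requests sniff the charset from the body
    return resp.text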

# Fetch the source of the table-of-contents page
origin = 'https://www.tianyabook.com/shu/4335.html'
html = get_url(origin)
#print(html)

# Pull the chapter paths out of the page source
result = re.findall('<dd class="col-md-3"><a href="(.*?)" title="', html, re.S)
#print(result)

# Prefix each path with https://www.tianyabook.com to build the full chapter URLs
url_list = ['https://www.tianyabook.com' + str(i) for i in result]
#print(url_list)  # chapter URLs
mulu_list = re.search('<dd class="col-md-3">(.*?)</dl>', html, re.S).group(0)
title1 = re.findall('title="(.*?)</dd>', mulu_list, re.S)
title = ''.join(title1)  # merge the list into a single string
title_list = re.findall('>(.*?)</a>', title, re.S)
#print(title_list)  # chapter titles
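The chapter URLs and titles come from two independent regex passes, so it is worth confirming that they actually pair up before files get named after the titles. A purely illustrative sanity check:

# Each URL must have a matching title, since files are named by title below
assert len(url_list) == len(title_list), \
    'URL/title count mismatch: %d vs %d' % (len(url_list), len(title_list))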

'''
# At this point a single chapter page can be scraped:
html_con = requests.get(url_list[1]).content.decode('gbk')  # fetch one chapter page
#print(html_con)
text_con = re.findall('</b>即可找到本书最新章节.</p>(.*?)<p class="booktag">', html_con, re.S)
text = text_con[0].replace('&nbsp;&nbsp;&nbsp;&nbsp;', '').replace('<br />', '')  # strip leftover entities and <br /> tags
print(text)
'''
# Fetch each chapter's text with a thread pool
def getContent(url):
    html_con = requests.get(url).content.decode('gbk')
    text_con = re.findall('</b>即可找到本书最新章节.</p>(.*?)<p class="booktag">', html_con, re.S)
    # Strip the indent entities and <br /> tags left over from the HTML
    text = text_con[0].replace('&nbsp;&nbsp;&nbsp;&nbsp;', '').replace('<br />', '')
    return text  # the original returned text_con, which discarded the cleanup above
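The script imports time but never uses it; presumably some delay between requests was intended. A hedged sketch of a throttled wrapper (getContent_slow and the 0.5 s delay are assumptions, not part of the original):

def getContent_slow(url):
    time.sleep(0.5)  # assumed per-request delay to be gentler on the server
    return getContent(url)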
pool = Pool(5)  # scrape with 5 worker threads
# Keep the results: map() preserves input order, so chapter_texts[i] matches title_list[i]
chapter_texts = pool.map(getContent, url_list)
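multiprocessing.dummy.Pool is a thread pool behind the multiprocessing API, which suits this I/O-bound job. The same fan-out can be written with the standard-library concurrent.futures module; a minimal equivalent sketch:

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=5) as executor:
    # executor.map() also preserves input order
    chapter_texts = list(executor.map(getContent, url_list))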

# Write each chapter to its own file, reusing the texts the pool already fetched
# (the original re-downloaded every chapter serially here, defeating the thread pool)
def writeToTxt(chapter_texts, title_list):
    path = './动物农场'  # output folder, named after the novel
    os.makedirs(path, exist_ok=True)  # replaces the original's commented-out createPath helper
    for title, content in zip(title_list, chapter_texts):
        # Spell the codec as utf-8, and let the with-block close the file
        with open(path + '/' + title + '.txt', 'w', encoding='utf-8') as f:
            f.write(content)

writeToTxt(chapter_texts, title_list)
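One remaining caveat: chapter titles go straight into file names, and characters such as / or ? are illegal there on most filesystems. A hedged cleanup step (safe_name is an illustrative helper, not from the original):

def safe_name(title):
    # Replace characters that Windows and POSIX filesystems reject in file names
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

Calling safe_name(title) inside writeToTxt keeps one malformed title from aborting the whole run.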