不生产小说,只做网站的搬运工,太牛逼了~(附源码)
源码
import re
from urllib.parse import urljoin

import requests
from lxml import etree
# Base URL of the novel's table-of-contents page on biduo.cc.
# Chapter links scraped from this page are resolved against it.
url = "https://www.biduo.cc/biquge/40_40847/"
# Browser-like User-Agent so the site does not reject the requests
# as coming from an automated client.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
}
class Spider(object):
    """Scrape a novel from biduo.cc.

    ``detail_request`` fetches the table-of-contents page and then calls
    ``content_request`` once per chapter to download and save its text.
    Relies on the module-level ``url`` (index page) and ``headers``.
    """

    def detail_request(self):
        """Fetch the index page, extract chapter titles/links, download all chapters."""
        # 1. Request the table of contents and pull out every chapter's
        #    title and link.
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
        # The site serves GBK-encoded pages; errors='replace' keeps a single
        # bad byte from aborting the whole crawl.
        html = etree.HTML(response.content.decode('gbk', errors='replace'))
        tit_list = html.xpath('//*[@id="list"]/dl/dd/a/text()')
        url_list = html.xpath('//*[@id="list"]/dl/dd/a/@href')
        print(tit_list, url_list)
        for tit, src in zip(tit_list, url_list):
            self.content_request(tit, src)

    def content_request(self, tit, src):
        """Fetch one chapter page and save its text to "<title>.txt".

        tit -- chapter title, used to derive the output file name
        src -- chapter href as found on the index page (relative or absolute)
        """
        # 2. Request the chapter page, extract the body text, and save it.
        # urljoin handles both relative names ("123.html") and absolute
        # paths ("/biquge/..."), unlike plain string concatenation.
        response = requests.get(urljoin(url, src), headers=headers)
        response.raise_for_status()
        html = etree.HTML(response.content.decode('gbk', errors='replace'))
        content = "\n".join(html.xpath('//*[@id="content"]/text()'))
        # Strip characters that are illegal in file names on Windows/Unix.
        file_name = re.sub(r'[\\/:*?"<>|]', '_', tit) + ".txt"
        print("正在保存文件:{}".format(file_name))
        # "w" (not "a") so re-running the spider does not duplicate chapters.
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(content)
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    spider = Spider()
    spider.detail_request()
效果展示
END
原文公众号:Python顾木子吖