# Object-oriented programming, part 1: a DrugBank spider
import requests
import re
import pandas as pd
from time import sleep
import os
os.environ['NO_PROXY'] = 'www.baidu.com'
class Spider():
    """Scrape disease/ingredient pairs from DrugBank search result pages.

    Pipeline (see ``start``): print a banner, fetch ``pages`` result pages,
    extract indication names and matched drug ingredients with regexes,
    collect them as dicts, and dump everything to ``DrugBank_test.xlsx``.
    """

    def __init__(self, pages):
        # Number of result pages to fetch (1..pages inclusive).
        self.__pages = pages
        # Running count of records collected.
        self.__count = 0
        # Accumulated rows, each {"疾病": ..., "成分": ...}.
        self.__total_list = []
        # One shared session keeps cookies/connection alive across pages.
        self.__session = requests.session()

    def start(self):
        """Run the full pipeline: banner -> scrape -> save -> summary."""
        self.__face()
        self.__get_info()
        self.__save_excel()
        self.__end()

    def __get_info(self):
        """Fetch each result page and extract (disease, ingredients) pairs.

        Stops early (returns) as soon as a page yields no indication matches,
        treating that as the last available page.
        """
        for page in range(1, self.__pages + 1):
            # Throttle requests so we don't hammer the server.
            sleep(1.5)
            print("正在获取%d页" % page)
            headers = {
                'authority': 'go.drugbank.com',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
                'cache-control': 'no-cache',
                'cookie': 'cf_clearance=z8jXLY4NjL4.KVOUbgZNPWj6NPBlT_u.x4xmS19uZZE-1682037487-0-250; _ga=GA1.1.1772772602.1682037506; _clck=qmfr9z|1|fay|0; _gcl_au=1.1.154779136.1682037508; ln_or=eyIyNDI4NDg0IjoiZCJ9; cf_chl_rc_i=3; __hstc=49600953.15b5c265b1847afab42a7def948ef734.1682037569169.1682037569169.1682037569169.1; hubspotutk=15b5c265b1847afab42a7def948ef734; __hssrc=1; __hssc=49600953.1.1682037569170; _omx_drug_bank_session=h5rNQehFjtC7SdQpb0HXfv7YA4pMfk2aMTWYUSzMaI7nF0SbDXVfqfqjVPMDQTs2hgXfMvwxMUByy%2BKdrPb40gDq3sAu%2BlSnJlbkcvWaBL7%2FIXcG6c2pOsEpbjbnGW8MV%2FkzX9dBeVriL%2F%2Fu%2FqTFEy3yXHyFPH38kRTSlMsmGlhgEh4CixxpIMBRiVUNeT7y6DeBVsL%2BNsjwWPFsiDVkWI7a3jOWuyeBxRDUZ9wUC8Opi%2FAiuvt2uVIehjQ2v4MNPGh%2FeyHirey%2FXQB34RHcTcb1fSRyNfDm4HEGYVzHI%2FkI%2BwIx8yHe24IlAFsEVVMW08bZrMA4b7Xf8G%2FLYLW2uRIr9qtlwELL7f07%2BoGtFMGnwylLzS97w0dEETEjdhyCogDlU65RZNcfbQMykTItXTWUBUskVMiNqDqDL%2BWU--uvjh%2BEo9E6nxj%2FdW--jEPCVVPyedUcF9oOBkYjpA%3D%3D; _ga_DDLJ7EEV9M=GS1.1.1682037506.1.1.1682037593.0.0.0; _clsk=1mtfaxl|1682037594214|5|1|u.clarity.ms/collect',
                'referer': 'https://go.drugbank.com/unearth/q?query=*&button=&searcher=drugs',
                'sec-ch-ua': '"Chromium";v="112", "Microsoft Edge";v="112", "Not:A-Brand";v="99"',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
            }
            params = {
                'page': str(page),
                'query': '*',
                'searcher': 'indications'
            }
            # NOTE(review): 'https://xx' looks like a redacted placeholder URL —
            # presumably the real endpoint is a go.drugbank.com search URL; confirm.
            response = self.__session.get('https://xx',
                                          headers=headers, params=params).text
            try:
                # Column 1: indication (disease) names.
                info1 = re.findall(r'href="/indications/.*?">(.*?)</a', response)
                if not info1:
                    # Empty page: we have run past the last page of results.
                    print(f"最大限度{page}页")
                    return
            except Exception as e:
                print(f"最大限度{page}页,error={e}")
                return
            # Column 2: raw "db-matches" anchor fragments for each indication.
            info2 = re.findall(r'<div class="db-matches"><a (.*?)</a></div>', response)
            info2_new = []
            for fragment in info2:
                # Strip anchor markup, leaving "DBxxxxx:name" pairs; ' / ' inside
                # names is escaped as '【/】' so it can't be confused with a separator.
                cleaned = (fragment.replace('href="/drugs/', '')
                           .replace('">', ':')
                           .replace('</a>', '')
                           .replace('<a', '')
                           .replace(' / ', '【/】'))
                info2_new.append(cleaned)
            for disease, ingredients in zip(info1, info2_new):
                dic = {
                    "疾病": disease,
                    "成分": ingredients
                }
                self.__total_list.append(dic)
                self.__count += 1
                print(dic)

    def __save_excel(self):
        """Write all collected rows to DrugBank_test.xlsx."""
        pf = pd.DataFrame(self.__total_list)
        # Fix: `ExcelWriter.save()` and the `encoding=` kwarg of `to_excel()`
        # were removed in pandas 2.0 — use the writer as a context manager,
        # which saves and closes the file on exit.
        with pd.ExcelWriter('DrugBank_test.xlsx') as writer:
            pf.to_excel(writer, index=False)

    def __face(self):
        """Print a decorative startup banner."""
        title = 'DrugBank采集开始启动'
        width = 140
        print('#' * width)
        print('##', " " * (width - 6), '##')
        print('##', " " * (width - 6), '##')
        print('##', title.center(width - len(title) * 2 + 17), '##')
        print('##', " " * (width - 6), '##')
        print('##', " " * (width - 6), '##')
        print('#' * width)

    def __end(self):
        """Print the completion message and the total record count."""
        print("爬取完毕")
        print(self.__count)
if __name__ == '__main__':
    # Robustness fix: the original int(input()) crashed with ValueError on
    # non-numeric input — keep prompting until a valid integer is entered.
    while True:
        try:
            pages = int(input("输入页数:"))
            break
        except ValueError:
            print("请输入整数")
    spider = Spider(pages)
    spider.start()