Object-Oriented Programming 1: Drugs


Object-Oriented Programming, Part 1: a small scraper that collects indication/drug pairs from DrugBank search pages, wrapped in a Spider class and saved to an Excel file.

import requests
import re
import pandas as pd
from time import sleep
import os

os.environ['NO_PROXY'] = 'go.drugbank.com'  # bypass any system proxy when talking to the scrape target


class Spider:

    def __init__(self, pages):
        self.__pages = pages  # number of search result pages to fetch
        self.__count = 0  # rows collected so far
        self.__total_list = []  # accumulated rows (one dict per indication)
        self.__session = requests.session()  # one session so cookies and connections are reused

    def start(self):
        """
        Run the full pipeline: banner, scrape, save, summary.
        """
        self.__face()
        self.__get_info()
        self.__save_excel()
        self.__end()

    def __get_info(self):
        """
        Fetch the search result pages and extract indication/drug pairs.
        """
        for page in range(1, self.__pages + 1):
            sleep(1.5)  # throttle requests a little
            print("Fetching page %d" % page)

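            # Browser-style headers captured from a real session; the cookie below is a personal,
            # short-lived session token and will need to be refreshed or removed for reuse.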
            headers = {
                'authority': 'go.drugbank.com',
                'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
                'cache-control': 'no-cache',
                'cookie': 'cf_clearance=z8jXLY4NjL4.KVOUbgZNPWj6NPBlT_u.x4xmS19uZZE-1682037487-0-250; _ga=GA1.1.1772772602.1682037506; _clck=qmfr9z|1|fay|0; _gcl_au=1.1.154779136.1682037508; ln_or=eyIyNDI4NDg0IjoiZCJ9; cf_chl_rc_i=3; __hstc=49600953.15b5c265b1847afab42a7def948ef734.1682037569169.1682037569169.1682037569169.1; hubspotutk=15b5c265b1847afab42a7def948ef734; __hssrc=1; __hssc=49600953.1.1682037569170; _omx_drug_bank_session=h5rNQehFjtC7SdQpb0HXfv7YA4pMfk2aMTWYUSzMaI7nF0SbDXVfqfqjVPMDQTs2hgXfMvwxMUByy%2BKdrPb40gDq3sAu%2BlSnJlbkcvWaBL7%2FIXcG6c2pOsEpbjbnGW8MV%2FkzX9dBeVriL%2F%2Fu%2FqTFEy3yXHyFPH38kRTSlMsmGlhgEh4CixxpIMBRiVUNeT7y6DeBVsL%2BNsjwWPFsiDVkWI7a3jOWuyeBxRDUZ9wUC8Opi%2FAiuvt2uVIehjQ2v4MNPGh%2FeyHirey%2FXQB34RHcTcb1fSRyNfDm4HEGYVzHI%2FkI%2BwIx8yHe24IlAFsEVVMW08bZrMA4b7Xf8G%2FLYLW2uRIr9qtlwELL7f07%2BoGtFMGnwylLzS97w0dEETEjdhyCogDlU65RZNcfbQMykTItXTWUBUskVMiNqDqDL%2BWU--uvjh%2BEo9E6nxj%2FdW--jEPCVVPyedUcF9oOBkYjpA%3D%3D; _ga_DDLJ7EEV9M=GS1.1.1682037506.1.1.1682037593.0.0.0; _clsk=1mtfaxl|1682037594214|5|1|u.clarity.ms/collect',
                'referer': 'https://go.drugbank.com/unearth/q?query=*&button=&searcher=drugs',
                'sec-ch-ua': '"Chromium";v="112", "Microsoft Edge";v="112", "Not:A-Brand";v="99"',
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
            }

            params = {
                'page': str(page),  # result page number
                'query': '*',  # wildcard query: match everything
                'searcher': 'indications'  # search the indications index
            }

            response = self.__session.get('https://xx',  # search endpoint (placeholder URL)
                                          headers=headers, params=params).text
            try:
                info1 = re.findall(r'href="/indications/.*?">(.*?)</a', response)  # first column: indication names
                if not info1:
                    print(f"No more results; stopping at page {page}")
                    return
            except Exception as e:
                print(f"Stopping at page {page}, error={e}")
                return

            info2 = re.findall(r'<div class="db-matches"><a (.*?)</a></div>', response)
            info2_new = []  # second column: the matched drugs for each indication
            for i in info2:
                # strip the anchor markup so each drug reads "DBxxxxx:Name",
                # and turn the " / " separators into the distinct marker 【/】
                i = i.replace('href="/drugs/', '').replace('">', ':').replace('</a>', '').replace('<a', '').replace(
                    ' / ',
                    '【/】')
                info2_new.append(i)

            # pair each indication with its matched drugs
            for yaoming, chenfen in zip(info1, info2_new):
                dic = {
                    "Disease": yaoming,
                    "Ingredients": chenfen
                }
                self.__total_list.append(dic)
                self.__count += 1
                print(dic)

    def __save_excel(self):
        """
        Save the collected rows to an Excel file.
        """
        pf = pd.DataFrame(self.__total_list)  # list of dicts -> DataFrame
        with pd.ExcelWriter('DrugBank_test.xlsx') as writer:  # the context manager saves and closes the file
            pf.to_excel(writer, index=False)

    def __face(self):
        """
        Print a start-up banner.
        """
        title = 'DrugBank scraping started'
        width = 140
        print('#' * width)
        print('##', " " * (width - 6), '##')
        print('##', " " * (width - 6), '##')
        print('##', title.center(width - 6), '##')
        print('##', " " * (width - 6), '##')
        print('##', " " * (width - 6), '##')
        print('#' * width)

    def __end(self):
        print("Scraping finished")
        print("Rows collected:", self.__count)


if __name__ == '__main__':
    pages = int(input("Number of pages to fetch: "))
    spider = Spider(pages)
    spider.start()
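
To make the string cleanup in __get_info easier to follow, here is a minimal standalone sketch of the same regex-and-replace steps, run against an invented db-matches fragment (the HTML below is illustrative only; real DrugBank markup may differ):

import re

# Hypothetical fragment shaped like the markup the regex above expects (invented for illustration).
html = ('<div class="db-matches"><a href="/drugs/DB00316">Acetaminophen</a> / '
        '<a href="/drugs/DB00945">Aspirin</a></div>')

match = re.findall(r'<div class="db-matches"><a (.*?)</a></div>', html)[0]
cleaned = (match.replace('href="/drugs/', '')
                .replace('">', ':')
                .replace('</a>', '')
                .replace('<a', '')
                .replace(' / ', '【/】'))
print(cleaned)  # -> DB00316:Acetaminophen【/】 DB00945:Aspirin

In the scraper itself, each cleaned string like this ends up in the Ingredients column next to its indication.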