Selenium 爬虫

发布时间:2023-12-25 18:31:21 作者:次林梦叶

image

from pixiv

相关资料

官网

官网使用介绍

可能会遇到的问题:selenium.common.exceptions.WebDriverException: Message: Can not connect to the Service chromedriver.exe while opening chrome browser

免责声明

本人承诺在使用爬虫程序时,仅出于学习和个人技术提升的目的,以了解互联网数据抓取和分析的工作原理。

本人郑重声明:

  1. 非商业用途:本人所使用的爬虫程序纯属个人学习之需,绝不用于商业目的或从事任何违法违规活动。

  2. 尊重网站规则:在使用爬虫程序时,本人将遵守所爬取网站的robots.txt协议或网站的访问规则,并尊重网站所有者对于爬虫程序的限制。

  3. 数据保护和隐私:本人承诺在爬取数据时不会获取、储存或利用任何个人隐私信息,且不会对所访问网站造成不必要的负担或干扰。

  4. 技术分享与合作:本人愿意遵守技术共享的原则,不会滥用所学知识,也愿意在合适的情况下与相关人士分享学习心得。

  5. 法律遵守:本人承诺遵守当地法律法规,并对于使用爬虫程序可能带来的风险和责任自行承担。

使用目的

这次使用 Selenium 是为了爬取一个网站的数据。这个网站的反爬做得比较好,用普通的爬虫只能得到静态的网页,于是我找到了可以模拟浏览器点击操作的自动化工具 Selenium。

它可以通过代码自动打开浏览器,进行点击、填写、跳转等操作,并可以通过内置方法获取 HTML 元素。

代码

import traceback
from threading import Thread
import requests
from bs4 import BeautifulSoup
import time
import random
from helium import *
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd

# Local filesystem path kept as a module constant.
# NOTE(review): despite the name, this points at chrome.exe (the browser
# binary), not a chromedriver executable, and nothing in the visible code
# reads it -- confirm whether it is still needed.
chrome_driver_path = r'C:\Users\86150\Documents\chrome-win64\chrome-win64\chrome.exe'
# Start page handed to getData(); 'url' is a placeholder value in this listing.
starturl = 'url'


def findQuestion(html01):
    """Return the question text located under ``html01``.

    Walks ``com-mnks-question-detail`` -> ``timu`` -> ``timu-text`` by class
    name and returns the ``text`` of the innermost element.

    Args:
        html01: Object exposing Selenium's ``find_element(by, value)``.

    Returns:
        The question text as ``str``, or the sentinel string ``"NaN"`` when
        any step of the lookup raises.
    """
    try:
        detail = html01.find_element(By.CLASS_NAME, "com-mnks-question-detail")
        title_box = detail.find_element(By.CLASS_NAME, 'timu')
        title_text = title_box.find_element(By.CLASS_NAME, 'timu-text')
        return str(title_text.text)
    except Exception as ex:
        # Best-effort lookup: report the failure and fall back to a sentinel
        # instead of raising to the caller.
        print(f"出现异常Question:{ex}")
        return "NaN"


def findSelect(html01):
    """Return the text of every answer option, one option per line.

    Locates ``com-mnks-question-detail`` -> ``answer-w`` -> ``options-w`` by
    class name, collects all ``<p>`` elements beneath it and joins their
    ``text`` values with newlines.

    Args:
        html01: Object exposing Selenium's ``find_element(by, value)``.

    Returns:
        The newline-joined option texts (an empty string when no ``<p>`` is
        found), or the sentinel string ``"NaN"`` when the lookup raises.
    """
    try:
        options_box = (
            html01.find_element(By.CLASS_NAME, "com-mnks-question-detail")
            .find_element(By.CLASS_NAME, 'answer-w')
            .find_element(By.CLASS_NAME, 'options-w')
        )
        paragraphs = options_box.find_elements(By.TAG_NAME, 'p')
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as ex:
        # Best-effort lookup: report the failure and fall back to a sentinel
        # instead of raising to the caller.
        print(f"出现异常Select:{ex}")
        return "NaN"


def findImg(html01):
    """Return the ``src`` of the question's image, if there is one.

    Looks up ``com-mnks-question-detail`` -> ``answer-w`` -> ``media-w`` by
    class name and then the first ``<img>`` tag beneath it.

    Args:
        html01: Object exposing Selenium's ``find_element(by, value)``.

    Returns:
        ``str()`` of the image's ``src`` attribute, or the sentinel string
        ``"NaN"`` when any step of the lookup raises.
    """
    try:
        detail = html01.find_element(By.CLASS_NAME, "com-mnks-question-detail")
        answer_area = detail.find_element(By.CLASS_NAME, 'answer-w')
        media_area = answer_area.find_element(By.CLASS_NAME, 'media-w')
        image = media_area.find_element(By.TAG_NAME, 'img')
        source = image.get_attribute('src')
        return str(source)
    except Exception:
        # Any lookup failure is reported as "no image" rather than an error.
        print("No Img")
        return "NaN"


def findAns(html01, wait):
    """Return the inner HTML of the answer-explanation element.

    Locates ``com-shiti-xiangjie`` -> ``xiangjie`` -> ``content`` by class
    name, waits until that element reports ``is_displayed()``, then reads its
    ``innerHTML`` attribute.

    Args:
        html01: Object exposing Selenium's ``find_element(by, value)``.
        wait: Object exposing ``until(callable)``; the callable is polled
            until the element is displayed.

    Returns:
        The ``innerHTML`` as ``str``; ``"Void"`` when that string is empty;
        or the sentinel string ``"NaN"`` when the lookup or the wait raises.
    """
    try:
        explanation = (
            html01.find_element(By.CLASS_NAME, "com-shiti-xiangjie")
            .find_element(By.CLASS_NAME, "xiangjie")
            .find_element(By.CLASS_NAME, "content")
        )
        wait.until(lambda d: explanation.is_displayed())
        inner_html = str(explanation.get_attribute("innerHTML"))
        # An empty explanation is recorded as "Void" to tell it apart from a
        # failed lookup ("NaN").
        return inner_html or "Void"
    except Exception as ex:
        # Best-effort lookup: report the failure and fall back to a sentinel
        # instead of raising to the caller.
        print(f"出现异常Ans:{ex}")
        return "NaN"


def getAns(html01):
    """Click the ``button.right.pt[ref="xiangqing"]`` button in the tool bar.

    The button is looked up under ``tool-bar`` -> ``btn-bar`` (class names).
    Presumably this is the control that reveals the answer explanation --
    confirm against the target page.

    Args:
        html01: Object exposing Selenium's ``find_element(by, value)``.

    Returns:
        None. Any exception is printed and swallowed.
    """
    try:
        tool_bar = html01.find_element(By.CLASS_NAME, "tool-bar")
        button_bar = tool_bar.find_element(By.CLASS_NAME, "btn-bar")
        detail_button = button_bar.find_element(
            By.CSS_SELECTOR, 'button.right.pt[ref="xiangqing"]'
        )
        detail_button.click()
    except Exception as ex:
        print(f'出现异常getAns:{ex}')


def nextOne(html01):
    """Click the "下一题" (next question) button inside the tool bar.

    The button is looked up under ``tool-bar`` -> ``btn-bar`` (class names)
    by its exact text.

    Args:
        html01: Object exposing Selenium's ``find_element(by, value)``.

    Returns:
        True when the button was found and clicked, False when the lookup or
        the click raised (the exception is printed).
    """
    try:
        button_bar = (
            html01.find_element(By.CLASS_NAME, "tool-bar")
            .find_element(By.CLASS_NAME, "btn-bar")
        )
        # The leading "." keeps the XPath relative to button_bar; a bare "//"
        # would search the whole document and ignore the scoping above.
        button_bar.find_element(By.XPATH, ".//button[text()='下一题']").click()
        return True
    except Exception as ex:
        print(f"出现异常nextOne:{ex}")
        return False


def debug(df_train):
    """Print a short progress report for the frame collected so far.

    Prints ``df_train.shape``, then ``df_train.tail(1)``, then a separator
    line of asterisks followed by a blank line.

    Args:
        df_train: Object exposing ``shape`` and ``tail(n)`` (a DataFrame).

    Returns:
        None. Any exception is printed and swallowed.
    """
    try:
        dimensions = df_train.shape
        print(dimensions)
        last_row = df_train.tail(1)
        print(last_row)
        # Separator plus one empty line, emitted with a single call.
        print("***************************", end="\n\n")
    except Exception as ex:
        print(f"出现异常DEBUG:{ex}")


def _findLeftPanel(driver):
    """Return the ``layout-article`` -> ``news-page`` -> ``left`` element."""
    return driver.find_element(By.CLASS_NAME, 'layout-article') \
        .find_element(By.CLASS_NAME, 'news-page') \
        .find_element(By.CLASS_NAME, 'left')


def getData(url, max_questions=1424):
    """Scrape questions one page at a time and return them as a DataFrame.

    Opens ``url`` in a new Chrome session, then for each question: clicks the
    detail button (``getAns``), reads question text, options, image URL and
    answer explanation, prints a progress report (``debug``) and advances with
    ``nextOne``. The loop ends after ``max_questions`` iterations or as soon
    as ``nextOne`` returns False.

    Args:
        url: Address of the first question page.
        max_questions: Upper bound on loop iterations. Defaults to 1424, the
            limit that was previously hard-coded.

    Returns:
        A DataFrame with columns ``question``, ``selectOption``, ``imgUrl``
        and ``answer``, one row per scraped question, indexed 0..n-1.
    """
    columns = ['question', 'selectOption', 'imgUrl', 'answer']
    records = []
    df_train = pd.DataFrame({name: [] for name in columns})

    # Open the start page.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        wait = WebDriverWait(driver, timeout=10)
        # Fixed pause after the initial navigation, before the first lookup.
        time.sleep(10)
        for _ in range(max_questions):
            try:
                html01 = _findLeftPanel(driver)
                wait.until(lambda d: html01.is_displayed())

                getAns(html01)

                # Look the panel up again after the click before reading
                # any fields from it.
                html01 = _findLeftPanel(driver)
                wait.until(lambda d: html01.is_displayed())

                records.append({
                    'question': findQuestion(html01),
                    'selectOption': findSelect(html01),
                    'imgUrl': findImg(html01),
                    'answer': findAns(html01, wait),
                })
                # Rebuild from the collected rows so the index runs 0..n-1
                # instead of repeating 0 for every concatenated row.
                df_train = pd.DataFrame(records, columns=columns)

                debug(df_train)

                if not nextOne(html01):
                    break
            except Exception as ex:
                # Keep going: a failed iteration is logged and the loop
                # moves on to its next attempt.
                print("出现如下异常getData:%s" % ex)
                traceback.print_exc()
    finally:
        # Always release the browser, even if navigation or the loop raised.
        driver.quit()
    return df_train


# Script entry: scrape starting from `starturl` and keep the resulting frame.
# NOTE(review): this runs at import time (no `if __name__ == "__main__"` guard).
Data_MONI = getData(starturl)
# Write the scraped rows as CSV. The target path has no file extension, and
# the frame's index is written as the first column (pandas default).
Data_MONI.to_csv("D:\\moni_four-data")