Selenium-[实例]猫眼电影爬取

发布时间: 2023-07-17 11:35:08 作者: 蕝戀
import random
import time

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By

# Absolute path of the chromedriver executable handed to the Chrome service.
DIRVER_PATH = r"C:\Users\Administrator\Desktop\chromedriver.exe"
# Absolute path of the stealth.min.js script used to get past Selenium detection.
STEALTH_JS = r"C:\Users\Administrator\Desktop\stealth.min.js"


def _build_driver():
    """Create a Chrome driver with the automation switches turned off."""
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)

    # To keep the browser open after the script ends, enable:
    # options.add_experimental_option('detach', True)

    # Reduce the chance of the site detecting Selenium.
    options.add_argument("--disable-blink-features")
    options.add_argument("--disable-blink-features=AutomationControlled")

    service = ChromeService(executable_path=DIRVER_PATH)
    return webdriver.Chrome(service=service, options=options)


def _visit_film(driver, element):
    """Open one film item in its own tab, report on the score element, close the tab.

    Args:
        driver: The active WebDriver, currently focused on the listing window.
        element: The film item element on the listing page to click.
    """
    # Remember the listing window explicitly instead of relying on the
    # position of its handle inside ``window_handles``.
    listing_handle = driver.current_window_handle

    print("点击电影item元素")
    ActionChains(driver).move_to_element(element).click().perform()

    new_handles = [h for h in driver.window_handles if h != listing_handle]
    if not new_handles:
        # The click opened no tab; closing the current window here would
        # close the listing page itself, so skip this item instead.
        print("未打开新标签页,跳过")
        return

    print("切换到最后的标签页")
    driver.switch_to.window(new_handles[-1])

    print("查找评分的元素")
    # The score is rendered with an obfuscated font, so the idea is to take a
    # screenshot of this element and hand the image to OCR.
    # find_elements returns an empty list when nothing matches, so a missing
    # score needs no exception handling and real errors are not swallowed.
    rate_elements = driver.find_elements(
        By.XPATH, "//span[@class='index-left info-num ']/span"
    )
    if rate_elements:
        # TODO: hand rate_elements[0] to OCR.
        print("可以获取到评分元素")
    else:
        print("暂无评分")

    # Close the detail tab and return to the listing page.
    driver.close()
    driver.switch_to.window(listing_handle)


def main():
    """Walk the Maoyan film listing page by page and open every film's detail tab.

    The stealth script at ``STEALTH_JS`` is registered through CDP so that it
    is evaluated on every new document. Each film item on the listing is then
    clicked, its detail tab is checked for a score element, and the tab is
    closed again. Paging continues until no "下一页" (next page) link is
    found. The browser is always shut down on exit, including on errors.

    Raises:
        OSError: If the stealth script cannot be read.
    """
    # Read the script before launching the browser so a missing file fails fast.
    # NOTE(review): assumes stealth.min.js is UTF-8 encoded; the locale default
    # (e.g. GBK on a Chinese Windows install) may not decode it correctly.
    with open(STEALTH_JS, encoding="utf-8") as f:
        stealth_source = f.read()

    driver = _build_driver()
    try:
        driver.execute_cdp_cmd(
            cmd="Page.addScriptToEvaluateOnNewDocument",
            cmd_args={"source": stealth_source},
        )

        # For checking the detection status, open https://bot.sannysoft.com/ instead.
        driver.get("https://www.maoyan.com/films?showType=3")
        driver.maximize_window()

        # Implicit wait: element lookups wait up to 5 seconds.
        driver.implicitly_wait(5)

        # find_elements yields an empty (falsy) list when there is no next-page
        # link, which ends the loop; find_element would raise instead.
        while next_page_links := driver.find_elements(By.LINK_TEXT, "下一页"):
            elements = driver.find_elements(
                By.XPATH,
                '//div[@class="movies-list"]/dl/dd/div[@class="movie-item film-channel"]',
            )

            for element in elements:
                _visit_film(driver, element)
                # Pause between items.
                time.sleep(random.randint(1, 3))

            print("点击下一页")
            next_page_links[0].click()

        time.sleep(10)
    finally:
        # Always shut the browser down, even when scraping fails midway.
        driver.quit()


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()