【爬虫案例小结】

发布时间 2023-08-22 09:24:01 作者: Chimengmeng

【案例】登录博客园

【1】思路分析

  • 打开cnblogs
    • 点进登录页面
    • 输入用户名密码
    • 点登录(可能会出现验证码)----手动操作跳过验证码
  • 登录成功后
    • 拿到cookie
    • 保存到本地
    • 关闭浏览器
  • 开启selenium,打开浏览器
    • 把本地的cookie写入到当前浏览器中
    • 当前浏览器就是登录状态

【2】代码实现

from selenium import webdriver
from selenium.webdriver.common.by import By
import json

# Step 1: log in interactively and capture the session cookies.
# NOTE(review): the credentials below are hard-coded in the source; consider
# loading them from the environment or a local, untracked config file.
browser = webdriver.Chrome()
try:
    # The login form is rendered client-side, so let element lookups poll
    # for a while instead of failing on the first miss.
    browser.implicitly_wait(10)

    # Open cnblogs.
    browser.get('https://www.cnblogs.com/')

    # Click the login link in the navbar (located by XPath).
    btn_login = browser.find_element(By.XPATH, '//*[@id="navbar_login_status"]/a[6]')
    btn_login.click()

    # Fill in the username and password fields.
    login_username = browser.find_element(By.XPATH, '//*[@id="mat-input-0"]')
    login_password = browser.find_element(By.XPATH, '//*[@id="mat-input-1"]')

    login_username.send_keys('206**46849@qq.com')
    login_password.send_keys('zhaochunze521.')

    # Click the submit button of the sign-in form.
    login_btn = browser.find_element(By.XPATH,
                                     "/html/body/app-root/app-sign-in-layout/div/div/app-sign-in/app-content-container/div/div/div/form/div/button/span[1]")
    login_btn.click()

    # A captcha may appear after submitting and has to be solved by hand.
    # Block here until the operator confirms the login finished; reading the
    # cookies immediately after the click would capture a logged-out session.
    input('请在浏览器中完成登录(如出现验证码请手动处理),完成后按回车继续...')

    # Grab the cookies of the now logged-in session.
    cookies = browser.get_cookies()

    # Persist the cookies to a local file.
    with open('cnblogs_cookie.json', 'w', encoding='utf-8') as f:
        json.dump(cookies, f)
finally:
    # quit() ends the whole WebDriver session (browser and driver process),
    # even when one of the steps above raised.
    browser.quit()

# Step 2: start a fresh browser and restore the saved session.
browser = webdriver.Chrome()
try:
    # Cookies can only be added for the domain that is currently loaded,
    # so open cnblogs first.
    browser.get('https://www.cnblogs.com/')

    # Read the cookies saved in step 1.
    with open('cnblogs_cookie.json', 'r', encoding='utf-8') as f:
        cookies = json.load(f)

    # Inject every cookie into the new browser session.
    for cookie in cookies:
        browser.add_cookie(cookie)

    # Reload the page so the injected cookies take effect; the browser should
    # now be in the logged-in state.
    browser.refresh()

    # Continue with other operations on the logged-in session here...
    # ...
finally:
    # Shut the second session down completely.
    browser.quit()

【案例】抽屉网半自动点赞

# -*-coding: Utf-8 -*-
# @File : 03chouti .py
# author: Chimengmeng
# blog_url : https://www.cnblogs.com/dream-ze/
# Time:2023/8/19
import json
import threading

import requests
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from lxml import etree


class ChouTi:
    """Semi-automatic up-voter for https://dig.chouti.com/.

    Link ids are scraped from the front page with ``requests``; the login
    cookies are obtained through a Selenium-driven Edge browser and then
    reused for the vote requests.
    """

    def __init__(self):
        # A random User-Agent is sent with every ``requests`` call.
        self.headers = {
            'User-Agent': UserAgent().random,
        }
        # Browser used only for the login / cookie capture step.
        self.browser = webdriver.Edge()

    def get_link_id(self):
        """Return the ``data-id`` values of the entries on the front page.

        Entries without a ``data-id`` attribute are skipped instead of
        raising ``IndexError``.
        """
        response = requests.get('https://dig.chouti.com/', headers=self.headers)
        tree = etree.HTML(response.text)
        div_list = tree.xpath('/html/body/main/div/div/div[1]/div/div[2]/div[1]/div')

        link_id_list = []
        for div in div_list:
            data_ids = div.xpath("@data-id")
            if data_ids:
                link_id_list.append(data_ids[0])
        return link_id_list

    def get_cookies(self, browser):
        """Submit the login dialog in *browser* and dump its cookies to disk.

        The cookies are written to ``chouti_cookies.json`` and the browser
        session is shut down afterwards.

        NOTE(review): the username/password sent below are empty
        placeholders, and ``implicitly_wait`` only configures the element
        lookup timeout (it does not pause), so the cookies are read right
        after the login click. Fill in real credentials / add an explicit
        wait before relying on the saved cookies.
        """
        # Element lookups poll for up to 3 seconds before giving up.
        browser.implicitly_wait(3)

        # Open the login dialog. The click goes through JavaScript rather
        # than a native WebElement.click().
        btn_login = browser.find_element(By.XPATH, '//*[@id="login_btn"]')
        browser.execute_script("arguments[0].click()", btn_login)

        username_input = browser.find_element(By.XPATH, '/html/body/div[4]/div/div[3]/div[1]/div[2]/input')
        password_input = browser.find_element(By.XPATH, '/html/body/div[4]/div/div[4]/div[1]/div/input[1]')
        username_input.send_keys('')
        password_input.send_keys('')

        login_btn = browser.find_element(By.XPATH, '/html/body/div[4]/div/div[4]/div[4]/button')
        browser.execute_script("arguments[0].click()", login_btn)

        cookies = browser.get_cookies()

        with open('chouti_cookies.json', 'w', encoding='utf-8') as fp:
            json.dump(cookies, fp)

        # End the whole WebDriver session, not just the current window.
        browser.quit()

    def up_blog(self, real_cookie, link_id):
        """Send one up-vote request for *link_id* using *real_cookie*.

        *real_cookie* is a ``{name: value}`` mapping passed straight to
        ``requests`` as the cookie jar.
        """
        data = {
            'linkId': link_id
        }
        response = requests.post('https://dig.chouti.com/link/vote', headers=self.headers, data=data,
                                 cookies=real_cookie)
        if response.status_code == 200:
            # Report which link was voted for (the original f-string had no
            # placeholder, so every success line looked identical).
            print(f'link_id:{link_id}>>>点赞成功')

    def get_news(self):
        """Placeholder; not implemented."""
        ...

    def main_up(self):
        """Collect link ids, log in, then up-vote every link in its own thread."""
        link_id_list = self.get_link_id()
        self.browser.get('https://dig.chouti.com/')
        self.browser.implicitly_wait(3)
        self.get_cookies(self.browser)

        with open('chouti_cookies.json', 'r', encoding='utf-8') as fp:
            cookies = json.load(fp)

        # Selenium stores cookies as a list of dicts; ``requests`` wants a
        # flat name -> value mapping.
        real_cookie = {item['name']: item['value'] for item in cookies}

        task_list = []
        for link_id in link_id_list:
            # Each thread votes for exactly one link id (previously the whole
            # list was passed to every thread).
            task = threading.Thread(target=self.up_blog, args=(real_cookie, link_id))
            task.start()
            task_list.append(task)
        for task in task_list:
            task.join()

    def main(self):
        """Placeholder; not implemented."""
        ...


if __name__ == "__main__":
    # Script entry point: build the bot, then run the up-vote flow.
    chouti_bot = ChouTi()
    chouti_bot.main_up()

【案例】滑动验证

import time
from selenium import webdriver
from selenium.webdriver.common.by import By  # 按照什么方式查找,By.ID,By.CSS_SELECTOR
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait  # 等待页面加载某些元素
import cv2

from urllib import request
from selenium.webdriver.common.action_chains import ActionChains


def get_distance():
    """Estimate how far the slider has to move, from the two captcha images.

    Reads ``background.png`` and ``gap.png`` (imread flag 0), runs
    normalized template matching of the gap piece against the background and
    takes the x coordinate of the reported minimum location. The raw x value
    is printed and the scaled value ``x * 278 / 360`` is returned.

    NOTE(review): with ``TM_CCOEFF_NORMED`` the best match is normally the
    *maximum* location; this code uses the minimum location — confirm that is
    intended. The 278/360 factor is presumably the displayed-width /
    image-width ratio — verify against the page.
    """
    background_img = cv2.imread("background.png", 0)
    piece_img = cv2.imread("gap.png", 0)

    match_map = cv2.matchTemplate(background_img, piece_img, cv2.TM_CCOEFF_NORMED)
    _min_val, _max_val, min_loc, _max_loc = cv2.minMaxLoc(match_map)
    value = min_loc[0]
    print(value)
    return value * 278 / 360


def main():
    """Submit the JD login form and drag the slider captcha.

    Flow: open the login page, fill in the account form, download the two
    captcha images to ``background.png`` / ``gap.png``, compute the drag
    distance with ``get_distance`` and drag the slider button by that offset.

    NOTE(review): the browser is never closed here — confirm that leaving it
    open after the drag is intentional.
    """
    driver = webdriver.Chrome()
    driver.implicitly_wait(5)

    driver.get('https://passport.jd.com/new/login.aspx?')

    # Switch login mode (presumably to the account/password tab — confirm).
    driver.find_element(By.CLASS_NAME, 'login-tab-r').click()

    # Fill in the account name and password, then submit.
    driver.find_element(By.ID, 'loginname').send_keys("123@qq.com")
    driver.find_element(By.ID, 'nloginpwd').send_keys("987654321")
    driver.find_element(By.CLASS_NAME, 'login-btn').click()

    # Resolve both captcha image URLs first, then download them.
    background_url = driver.find_element(By.XPATH, '//*[@class="JDJRV-bigimg"]/img').get_attribute("src")
    piece_url = driver.find_element(By.XPATH, '//*[@class="JDJRV-smallimg"]/img').get_attribute("src")
    request.urlretrieve(background_url, "background.png")
    request.urlretrieve(piece_url, "gap.png")

    distance = int(get_distance())
    print("distance:", distance)

    print('第一步,点击滑动按钮')
    slider_btn = driver.find_element(By.CLASS_NAME, 'JDJRV-slide-btn')
    # Press and hold the left mouse button on the slider.
    ActionChains(driver).click_and_hold(on_element=slider_btn).perform()
    # Drag horizontally by the computed distance, then let go.
    ActionChains(driver).move_by_offset(xoffset=distance, yoffset=0).perform()
    ActionChains(driver).release(on_element=slider_btn).perform()

# Run the slider demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()

【案例】登录12306

import time
from selenium.webdriver import ActionChains
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# Build the Chrome options object.
options = Options()

# Turn off the Blink "AutomationControlled" feature so the page is less
# likely to detect that the browser is driven by automation.
options.add_argument("--disable-blink-features=AutomationControlled")

# Start Chrome with those options.
bro = webdriver.Chrome(options=options)

try:
    # Open the 12306 login page.
    bro.get('https://kyfw.12306.cn/otn/resources/login.html')

    # Implicit wait: element lookups poll for up to 5 seconds.
    bro.implicitly_wait(5)

    # Maximize the window.
    bro.maximize_window()

    # Click the login-mode tab matched by the selector below.
    # NOTE(review): the original comment called this "QR-code login", but the
    # script then types into the account form — confirm which tab this is.
    user_login = bro.find_element(By.CSS_SELECTOR,
                                  '#toolbar_Div > div.login-panel > div.login-box > ul > li.login-hd-code.active > a')
    user_login.click()
    time.sleep(1)

    # Locate the username/password inputs and the submit button.
    username = bro.find_element(By.ID, 'J-userName')
    password = bro.find_element(By.ID, 'J-password')
    submit_btn = bro.find_element(By.ID, 'J-login')

    # Replace the values below with real credentials (the password is an
    # empty placeholder).
    username.send_keys('1**53675221')
    password.send_keys('')
    time.sleep(3)
    submit_btn.click()

    time.sleep(5)

    # Find the slider handle and drag it 300px to the right.
    span = bro.find_element(By.ID, 'nc_1_n1z')
    ActionChains(bro).click_and_hold(span).perform()
    ActionChains(bro).move_by_offset(xoffset=300, yoffset=0).perform()
    ActionChains(bro).release().perform()
    time.sleep(5)
finally:
    # Always end the WebDriver session (browser and driver process), even
    # when a lookup or click above failed.
    bro.quit()

【案例】登录超级鹰

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from chaojiying import ChaojiyingClient
from PIL import Image

# Start the browser.
bro = webdriver.Chrome()

# Open the login page.
bro.get('http://www.chaojiying.com/apiuser/login/')
bro.implicitly_wait(10)
bro.maximize_window()

try:
    # Locate the form fields and fill in username and password.
    # NOTE(review): credentials are hard-coded in the source.
    username = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input')
    password = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input')
    code = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input')
    btn = bro.find_element(by=By.XPATH, value='/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input')
    username.send_keys('306334678')
    password.send_keys('lqz123')

    # Capture the captcha image:
    # save a screenshot of the whole page as main.png ...
    bro.save_screenshot('main.png')
    # ... locate the captcha <img> element ...
    captcha_element = bro.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/div/img')
    # ... and read its position and size on the page.
    location = captcha_element.location
    size = captcha_element.size

    # Crop box (left, upper, right, lower) of the captcha inside the screenshot.
    # NOTE(review): assumes screenshot pixels map 1:1 to page coordinates
    # (no display scaling) — confirm on the target machine.
    img_tu = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height']))
    # Open the page screenshot, crop the captcha out with Pillow and save it;
    # the context manager closes the screenshot file afterwards.
    with Image.open('./main.png') as page_shot:
        fram = page_shot.crop(img_tu)
        fram.save('code.png')

    # Recognize the captcha with Chaojiying.
    # Fill in the Chaojiying account details here (left blank in this sample).
    chaojiying = ChaojiyingClient('', '', '')
    # Read the captcha image bytes, closing the file handle right away.
    with open('code.png', 'rb') as captcha_file:
        im = captcha_file.read()
    # Submit the image for recognition; 1902 is the captcha type code passed
    # to the service.
    res_code = chaojiying.PostPic(im, 1902)['pic_str']
    # Type the recognized text into the captcha input.
    code.send_keys(res_code)

    time.sleep(5)
    btn.click()
    time.sleep(10)
except Exception as e:
    print(e)
finally:
    # End the whole WebDriver session, not just the current window.
    bro.quit()

【案例】抓京东商品信息

import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # 键盘按键操作
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.edge.options import Options


def get_goods(bro):
    """Print the goods on the current result page, then keep paging forward.

    For every ``gl-item`` element the name, price, comment text, image URL
    and link are printed; a failure on one item is printed and skipped.
    After each page the "next page" link is clicked and the next page is
    processed.

    Pages are walked in a loop rather than by recursion, so a long result
    list cannot exhaust the interpreter's recursion limit. The loop ends the
    same way the recursion did: when the next-page link cannot be found or
    clicked, the Selenium exception propagates to the caller.

    :param bro: a Selenium WebDriver already showing a search result page.
    """
    while True:
        # Scroll down so the lazily loaded items get rendered.
        bro.execute_script('scrollTo(0,5000)')
        goods = bro.find_elements(By.CLASS_NAME, 'gl-item')
        print(len(goods))
        for good in goods:
            try:
                price = good.find_element(By.CSS_SELECTOR, 'div.p-price i').text
                url = good.find_element(By.CSS_SELECTOR, 'div.p-img>a').get_attribute('href')
                commit = good.find_element(By.CSS_SELECTOR, 'div.p-commit a').text
                name = good.find_element(By.CSS_SELECTOR, 'div.p-name em').text
                img = good.find_element(By.CSS_SELECTOR, 'div.p-img img').get_attribute('src')
                if not img:
                    # Not loaded yet: fall back to the lazy-load attribute,
                    # which is prefixed with the scheme here.
                    img = 'https:' + good.find_element(By.CSS_SELECTOR, 'div.p-img img').get_attribute('data-lazy-img')
                # Same text as the original triple-quoted literal, spelled
                # out so the leading spaces do not depend on code indentation.
                print((
                    '\n'
                    '            商品名字:%s\n'
                    '            商品价格:%s\n'
                    '            商品评论:%s\n'
                    '            商品图片:%s\n'
                    '            商品链接:%s\n'
                    '            '
                ) % (name, price, commit, img, url))
            except Exception as e:
                print(e)
                continue

        # Find the "next page" link and click it; the loop then handles the
        # newly loaded page. (Named next_page to avoid shadowing builtin next.)
        next_page = bro.find_element(By.PARTIAL_LINK_TEXT, '下一页')
        next_page.click()


# Browser options; the Blink "AutomationControlled" feature is disabled so
# the page is less likely to detect automation.
# NOTE(review): `Options` is imported from both the chrome and the edge
# package above; the later (edge) import is the one in effect, which matches
# the Edge driver used here.
options = Options()
options.add_argument("--disable-blink-features=AutomationControlled")
# Recent Selenium versions take the options object via the `options` keyword.
bro = webdriver.Edge(options=options)
bro.get('https://www.jd.com/')
bro.maximize_window()
bro.implicitly_wait(10)
try:
    # Type the query into the search box and press Enter.
    search_input = bro.find_element(By.ID, 'key')
    search_input.send_keys('mac pro')
    search_input.send_keys(Keys.ENTER)
    # Walk the result pages; this returns control only by raising (e.g. when
    # no next-page link is found), which is reported below.
    get_goods(bro)
except Exception as e:
    print(e)
finally:
    # End the whole WebDriver session (browser and driver process), not just
    # the current window.
    bro.quit()