如何用python做一个exe程序快速爬取文章？-526互联

我用了99藏书网作为例子

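注：本程序主要用于快速复制99藏书网中的小说。有些参数我要在开头先解释清楚：章节页面的网址形如 https://www.99csw.com/book/书本号/章节号.htm，“书本号”和“章节号”就是网址中对应位置的数字；程序会从“前言的章节号”开始，逐章爬取到“最后一个章节的章节号”为止。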

一、导入库

import tkinter as tk
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

二、做一个背景函数

这个背景函数主要是用于GUI可视化背景的设置

def set_background_image(window, image_path):
    """Use the image file at ``image_path`` as the backdrop of ``window``.

    Args:
        window: The Tk window (or Toplevel) that receives the background.
        image_path: Path of the image file handed to ``tk.PhotoImage``.
    """
    photo = tk.PhotoImage(file=image_path)

    # Store the image on the window itself so that a reference survives
    # after this function returns and it is not garbage collected.
    window.background = photo

    # A label stretched over the full window area displays the image.
    tk.Label(window, image=photo).place(x=0, y=0, relwidth=1, relheight=1)

三、做GUI主页面（注意：这段代码用到了第四节才定义的 main 函数，实际运行时要放在所有函数定义之后，顺序见第六节的完整代码）

# Root window of the application.
app = tk.Tk()
app.title("九九藏书网")
app.geometry('1920x1080')
set_background_image(app, "哆唻A梦.png")

# Text widget that displays the scraped chapter content.
text = tk.Text(
    app,
    width=50,
    height=40,
    font=('Arial', 12),
    wrap=tk.WORD,
)
text.place(x=10, y=10)

# Welcome banner.
lb = tk.Label(
    app,
    text='欢迎来到“九九藏书网”书籍采集获取系统',
    width=40,
    height=1,
    fg='black',
    font=('Arial', 18),
)
lb.pack()

# Start button: opens the input window.
# NOTE: `main` must already be defined when this statement runs.
bt = tk.Button(
    app,
    text="开始",
    width=10,
    command=main,
    activebackground='red',
)
bt.pack()

# Hand control to the Tk event loop.
app.mainloop()

四、做GUI次级界面

def main():
    """Open the secondary window that collects the scraping parameters.

    The window copies the geometry of the main window ``app``, gets the same
    background image, and shows three prompt/entry pairs (book number,
    preface chapter number, last chapter number). The confirm button passes
    the three entry values to ``process_input1``.
    """
    input_window = tk.Toplevel(app)
    input_window.geometry(app.geometry())
    set_background_image(input_window, "哆唻A梦.png")

    # One row per parameter: (prompt text, label y, entry y).
    rows = (
        ('请输入书本号', 1, 220),
        ('请输入前言的章节号', 250, 470),
        ('请输入最后一个章节的章节号', 500, 720),
    )
    fields = []
    for prompt, label_y, entry_y in rows:
        caption = tk.Label(input_window, text=prompt, width=40, height=10, fg='black', font=8)
        caption.place(x=1, y=label_y)
        field = tk.Entry(input_window)
        field.place(x=130, y=entry_y)
        fields.append(field)

    def submit():
        # Values are read at click time, in book / first / last order.
        book_field, first_field, last_field = fields
        process_input1(book_field.get(), first_field.get(), last_field.get())

    confirm_button = tk.Button(input_window, text="确认", command=submit)
    confirm_button.place(x=180, y=750)

五、写一个爬虫函数

先找到页面中存储正文的元素（id 为 content 的 div），取出其中的文本，再逐章保存为 txt 文件

def process_input1(a, b, c):
    """Scrape a range of chapters of one book from 99csw.com and save them.

    For every chapter number ``i`` from ``b`` to ``c`` (inclusive) the page
    ``https://www.99csw.com/book/{a}/{i}.htm`` is loaded, the text of its
    ``<div id="content">`` element is printed, appended to the module-level
    ``text`` widget, and written to a UTF-8 file in the current directory.
    The first chapter of the range is saved as ``活着前言.txt``; every later
    chapter is saved as ``活着第{i}章.txt``.

    Args:
        a: Book number, inserted into the URL as-is.
        b: Chapter number of the preface (first chapter to fetch); converted
            with ``int``.
        c: Chapter number of the last chapter to fetch; converted with
            ``int``.

    Raises:
        ValueError: If ``b`` or ``c`` cannot be converted to ``int``.
    """
    start_chapter = int(b)
    end_chapter = int(c)
    if start_chapter > end_chapter:
        # Empty range: nothing to fetch, so do not launch a browser at all.
        return

    # A single browser serves the whole run. Starting a new Chrome for every
    # chapter without ever quitting it leaves one browser process behind per
    # chapter.
    driver = webdriver.Chrome()
    try:
        for i in range(start_chapter, end_chapter + 1):
            driver.get(f'https://www.99csw.com/book/{a}/{i}.htm')

            content = driver.find_element(By.XPATH, '//div[@id="content"]').text
            print(content)

            text.insert(tk.END, content + '\n')

            # NOTE: the title "活着" in the file names is hard-coded and does
            # not depend on the book number `a`.
            if i == start_chapter:
                filename = './活着前言.txt'
            else:
                filename = f"./活着第{i}章.txt"
            with open(filename, "w", encoding="utf-8") as f:
                f.write(content)
    finally:
        # Close the browser even when a page fails to load or parse.
        driver.quit()

六、完整代码

import tkinter as tk
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

# 一个背景函数
def set_background_image(window, image_path):
    """Use the image file at ``image_path`` as the backdrop of ``window``.

    Args:
        window: The Tk window (or Toplevel) that receives the background.
        image_path: Path of the image file handed to ``tk.PhotoImage``.
    """
    photo = tk.PhotoImage(file=image_path)

    # Store the image on the window itself so that a reference survives
    # after this function returns and it is not garbage collected.
    window.background = photo

    # A label stretched over the full window area displays the image.
    tk.Label(window, image=photo).place(x=0, y=0, relwidth=1, relheight=1)


def main():
    """Open the secondary window that collects the scraping parameters.

    The window copies the geometry of the main window ``app``, gets the same
    background image, and shows three prompt/entry pairs (book number,
    preface chapter number, last chapter number). The confirm button passes
    the three entry values to ``process_input1``.
    """
    input_window = tk.Toplevel(app)
    input_window.geometry(app.geometry())
    set_background_image(input_window, "哆唻A梦.png")

    # One row per parameter: (prompt text, label y, entry y).
    rows = (
        ('请输入书本号', 1, 220),
        ('请输入前言的章节号', 250, 470),
        ('请输入最后一个章节的章节号', 500, 720),
    )
    fields = []
    for prompt, label_y, entry_y in rows:
        caption = tk.Label(input_window, text=prompt, width=40, height=10, fg='black', font=8)
        caption.place(x=1, y=label_y)
        field = tk.Entry(input_window)
        field.place(x=130, y=entry_y)
        fields.append(field)

    def submit():
        # Values are read at click time, in book / first / last order.
        book_field, first_field, last_field = fields
        process_input1(book_field.get(), first_field.get(), last_field.get())

    confirm_button = tk.Button(input_window, text="确认", command=submit)
    confirm_button.place(x=180, y=750)

def process_input1(a, b, c):
    """Scrape a range of chapters of one book from 99csw.com and save them.

    For every chapter number ``i`` from ``b`` to ``c`` (inclusive) the page
    ``https://www.99csw.com/book/{a}/{i}.htm`` is loaded, the text of its
    ``<div id="content">`` element is printed, appended to the module-level
    ``text`` widget, and written to a UTF-8 file in the current directory.
    The first chapter of the range is saved as ``活着前言.txt``; every later
    chapter is saved as ``活着第{i}章.txt``.

    Args:
        a: Book number, inserted into the URL as-is.
        b: Chapter number of the preface (first chapter to fetch); converted
            with ``int``.
        c: Chapter number of the last chapter to fetch; converted with
            ``int``.

    Raises:
        ValueError: If ``b`` or ``c`` cannot be converted to ``int``.
    """
    start_chapter = int(b)
    end_chapter = int(c)
    if start_chapter > end_chapter:
        # Empty range: nothing to fetch, so do not launch a browser at all.
        return

    # A single browser serves the whole run. Starting a new Chrome for every
    # chapter without ever quitting it leaves one browser process behind per
    # chapter.
    driver = webdriver.Chrome()
    try:
        for i in range(start_chapter, end_chapter + 1):
            driver.get(f'https://www.99csw.com/book/{a}/{i}.htm')

            content = driver.find_element(By.XPATH, '//div[@id="content"]').text
            print(content)

            text.insert(tk.END, content + '\n')

            # NOTE: the title "活着" in the file names is hard-coded and does
            # not depend on the book number `a`.
            if i == start_chapter:
                filename = './活着前言.txt'
            else:
                filename = f"./活着第{i}章.txt"
            with open(filename, "w", encoding="utf-8") as f:
                f.write(content)
    finally:
        # Close the browser even when a page fails to load or parse.
        driver.quit()


# Root window of the application.
app = tk.Tk()
app.title("九九藏书网")
app.geometry('1920x1080')
set_background_image(app, "哆唻A梦.png")

# Text widget that displays the scraped chapter content.
text = tk.Text(
    app,
    width=50,
    height=40,
    font=('Arial', 12),
    wrap=tk.WORD,
)
text.place(x=10, y=10)

# Welcome banner.
lb = tk.Label(
    app,
    text='欢迎来到“九九藏书网”书籍采集获取系统',
    width=40,
    height=1,
    fg='black',
    font=('Arial', 18),
)
lb.pack()

# Start button: opens the input window built by `main`.
bt = tk.Button(
    app,
    text="开始",
    width=10,
    command=main,
    activebackground='red',
)
bt.pack()

# Hand control to the Tk event loop.
app.mainloop()