2023 Data Collection and Fusion Technology Practice 4

Published: 2023-11-15 23:02:43  Author: chencanming

I. Assignment Content

Gitee link

ccm4 · 陈灿铭/2023级数据采集与融合技术 - 码云 - 开源中国 (gitee.com)

  • Assignment ①:
  • Requirements:
    • Become proficient with Selenium: locating HTML elements, scraping Ajax-loaded page data, and waiting for HTML elements to appear.
    • Use the Selenium framework + MySQL database storage to crawl stock data from three boards: "沪深A股" (SSE & SZSE A-shares), "上证A股" (SSE A-shares), and "深证A股" (SZSE A-shares).
  • Candidate site: East Money (东方财富网): http://quote.eastmoney.com/center/gridlist.html#hs_a_board
  • Output: store and print from a MySQL database in the format below; table headers should use English names (e.g. id for the serial number, bStockNo for the stock code, and so on), designed by each student:

 

 

| No. | Stock code | Stock name | Latest price | Change % | Change amount | Volume | Turnover | Amplitude | High | Low | Open | Prev. close |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | 688093 | N世华 | 28.47 | 62.22% | 10.92 | 26.13万 | 7.6亿 | 22.34 | 32.0 | 28.08 | 30.2 | 17.55 |
| 2 | ...... | | | | | | | | | | | |

Code

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import sqlite3
import time

class MySpider:
    def __init__(self):
        self.count = 0  # running row index, shared across all pages

    def start(self, url):
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Correct argument format
        chrome_options.add_argument("--disable-gpu")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.num = 0  # number of pages crawled so far

        try:
            self.con = sqlite3.connect("stock.db")
            self.cursor = self.con.cursor()
            self.cursor.execute("DROP TABLE IF EXISTS stock")  # Use DROP TABLE IF EXISTS
            # Use triple-quoted strings for better readability
            self.cursor.execute("""
                CREATE TABLE IF NOT EXISTS stock (
                    count VARCHAR(256),
                    num VARCHAR(256),
                    stockname VARCHAR(256),
                    lastest_price VARCHAR(64),
                    ddf VARCHAR(64),
                    dde VARCHAR(64),
                    cjl VARCHAR(64),
                    cje VARCHAR(32),
                    zhenfu VARCHAR(32),
                    top VARCHAR(32),
                    low VARCHAR(32),
                    today VARCHAR(32),
                    yestd VARCHAR(32)
                )
            """)
        except Exception as err:
            print(err)

        self.driver.get(url)

    def close_up(self):  # commit pending inserts and release the browser and database
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def insert_db(self, num, stockname, lastest_price, ddf, dde, cjl, cje, zhenfu, top, low, today, yestd):
        try:
            sql = "INSERT INTO stock (count, num, stockname, lastest_price, ddf, dde, cjl, cje, zhenfu, top, low, today, yestd) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"
            self.cursor.execute(sql, (self.count, num, stockname, lastest_price, ddf, dde, cjl, cje, zhenfu, top, low, today, yestd))
        except Exception as err:
            print(err)

    def show_db(self):
        try:
            con = sqlite3.connect("stock.db")
            cursor = con.cursor()
            print("count", "num", "stockname", "lastest_price", "ddf", "dde", "cjl", "cje", "zhenfu", "top", "low", "today", "yestd")
            cursor.execute("SELECT count, num, stockname, lastest_price, ddf, dde, cjl, cje, zhenfu, top, low, today, yestd FROM stock ORDER BY count")
            rows = cursor.fetchall()
            for row in rows:
                print(row)
            con.close()
        except Exception as err:
            print(err)

    def execute(self, url):
        print("Starting......")
        self.start(url)
        print("Processing......")
        self.process()
        print("Closing......")
        self.close_up()
        print("Completed......")

    def process(self):
        time.sleep(1)
        try:
            lis = self.driver.find_elements(By.XPATH, "//div[@class='listview full']/table[@id='table_wrapper-table']/tbody/tr")
            time.sleep(1)
            for li in lis:
                time.sleep(1)
                num = li.find_element(By.XPATH, ".//td[position()=2]/a[@href]").text
                stockname = li.find_element(By.XPATH, ".//td[@class='mywidth']/a[@href]").text
                lastest_price = li.find_element(By.XPATH, ".//td[position()=5]/span").text
                ddf = li.find_element(By.XPATH, ".//td[position()=6]/span").text
                dde = li.find_element(By.XPATH, ".//td[position()=7]/span").text
                cjl = li.find_element(By.XPATH, ".//td[position()=8]").text
                time.sleep(1)
                cje = li.find_element(By.XPATH, ".//td[position()=9]").text
                zhenfu = li.find_element(By.XPATH, ".//td[position()=10]").text
                top = li.find_element(By.XPATH, ".//td[position()=11]/span").text
                low = li.find_element(By.XPATH, ".//td[position()=12]/span").text
                today = li.find_element(By.XPATH, ".//td[position()=13]/span").text
                yestd = li.find_element(By.XPATH, ".//td[position()=14]").text
                time.sleep(1)
                self.count += 1  # advance the global row counter
                self.insert_db(num, stockname, lastest_price, ddf, dde, cjl, cje, zhenfu, top, low, today, yestd)
            next_page = self.driver.find_element(By.XPATH, "//div[@class='dataTables_wrapper']//div[@class='dataTables_paginate paging_input']//a[@class='next paginate_button']")
            time.sleep(10)
            self.num += 1
            if self.num < 4:  # crawl the first four pages only
                next_page.click()
                self.process()  # recurse to scrape the next page
        except Exception as err:
            print(err)

url = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"
spider = MySpider()
spider.execute(url)
spider.show_db()
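The assignment specifies MySQL storage, while the script above uses SQLite for local convenience. Below is a minimal sketch of what the storage layer could look like with MySQL via pymysql; the host, user, password, and database name are placeholder assumptions, not values from the original:

import pymysql

# Placeholder connection parameters -- substitute your own MySQL setup.
con = pymysql.connect(host="localhost", user="root", password="your_password",
                      database="stocks", charset="utf8mb4")
cursor = con.cursor()
cursor.execute("DROP TABLE IF EXISTS stock")
cursor.execute("""
    CREATE TABLE stock (
        count VARCHAR(256), num VARCHAR(256), stockname VARCHAR(256),
        lastest_price VARCHAR(64), ddf VARCHAR(64), dde VARCHAR(64),
        cjl VARCHAR(64), cje VARCHAR(32), zhenfu VARCHAR(32),
        top VARCHAR(32), low VARCHAR(32), today VARCHAR(32), yestd VARCHAR(32)
    )
""")
# pymysql uses %s placeholders where sqlite3 uses ?; the row below is sample data.
cursor.execute(
    "INSERT INTO stock VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
    ("1", "688093", "N世华", "28.47", "62.22%", "10.92",
     "26.13万", "7.6亿", "22.34", "32.0", "28.08", "30.2", "17.55"))
con.commit()  # pymysql does not autocommit by default
con.close()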

Run results

Reflections

I learned how to use Selenium for web scraping and browser automation, and how to introduce delays with the time module so that dynamically loaded content has time to render.
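The requirement also mentions waiting for HTML elements; fixed time.sleep calls work but are slow and brittle. A minimal sketch of an explicit wait that could replace them inside process() (the XPath mirrors the one already used above):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 10 seconds until the table rows exist, then return them
# immediately, instead of always sleeping a fixed amount of time.
lis = WebDriverWait(self.driver, 10).until(
    EC.presence_of_all_elements_located(
        (By.XPATH, "//table[@id='table_wrapper-table']/tbody/tr")))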

  • Assignment ②:
  • Requirements:
    • Become proficient with Selenium: locating HTML elements, simulating a user login, scraping Ajax-loaded page data, and waiting for HTML elements.
    • Use the Selenium framework + MySQL to crawl course information from China's MOOC site (course id, course name, school, lead teacher, team members, enrollment count, course schedule, course description).
  • Candidate site: icourse163 (中国mooc网): https://www.icourse163.org
  • Output: MySQL database storage and output format

 

 

| Id | cCourse | cCollege | cTeacher | cTeam | cCount | cProcess | cBrief |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | Python数据分析与展示 | 北京理工大学 | 嵩天 | 嵩天 | 470 | 2020年11月17日 ~ 2020年12月29日 | "我们正步入一个数据或许比软件更重要的新时代。——Tim O'Reilly" …… |
| 2 | ...... | | | | | | |

 

 

Code

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from scrapy.selector import Selector
import sqlite3

def login(driver, username, password):
    driver.get("https://www.icourse163.org/")
    driver.maximize_window()
    # Entry button for the login dialog; the class name and the absolute XPath
    # below are position-based and may break if the page layout changes.
    button = driver.find_element(By.XPATH,'//div[@class="_1Y4Ni"]/div')
    button.click()
    frame = driver.find_element(By.XPATH,
                                '/html/body/div[13]/div[2]/div/div/div/div/div/div[1]/div/div[1]/div[2]/div[2]/div[1]/div/iframe')
    driver.switch_to.frame(frame)  # the login form lives inside this iframe

    account = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div/div[2]/div[2]/input')
    account.send_keys(username)
    code = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div/div[4]/div[2]/input[2]')
    code.send_keys(password)
    login_button = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div/div[6]/a')
    login_button.click()
    driver.switch_to.default_content()
    time.sleep(10)

def search_courses(driver, keyword):
    select_course = driver.find_element(By.XPATH, '/html/body/div[4]/div[1]/div/div/div/div/div[7]/div[1]/div/div/div[1]/div/div/div/div/div/div/input')
    select_course.send_keys(keyword)
    search_button = driver.find_element(By.XPATH, '/html/body/div[4]/div[1]/div/div/div/div/div[7]/div[1]/div/div/div[2]/span')
    search_button.click()
    time.sleep(5)
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".th-bk-main-gh")))  # wait for the results page to render
    html = driver.page_source
    selector = Selector(text=html)
    datas = selector.xpath("//div[@class='m-course-list']/div/div")

    for i, data in enumerate(datas, start=1):
        name = data.xpath(".//span[@class=' u-course-name f-thide']//text()").extract()
        name = "".join(name)

        schoolname = data.xpath(".//a[@class='t21 f-fc9']/text()").extract_first()

        teacher = data.xpath(".//a[@class='f-fc9']//text()").extract_first()

        team = data.xpath(".//a[@class='f-fc9']//text()").extract()
        team = ",".join(team)

        number = data.xpath(".//span[@class='hot']/text()").extract_first()

        process = data.xpath(".//span[@class='txt']/text()").extract_first()

        production = data.xpath(".//span[@class='p5 brief f-ib f-f0 f-cb']//text()").extract()
        production = ",".join(production)
        store_course_info(i, name, schoolname, teacher, team, number, process, production)

def store_course_info(i, name, school, teacher, team, number, process, production):
    conn = sqlite3.connect('course.db')
    cursor = conn.cursor()

    cursor.execute('''CREATE TABLE IF NOT EXISTS mooc
                    (Id INT, cCourse TEXT, cCollege TEXT, cTeacher TEXT, cTeam TEXT, cCount INT, cProcess TEXT, cBrief TEXT)''')

    cursor.execute("INSERT INTO mooc (Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief) VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                   (i, name, school, teacher, team, number, process, production))

    conn.commit()
    conn.close()

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=chrome_options)

# Substitute your own icourse163 account credentials here.
login(driver, 'your_phone_number', 'your_password')

search_courses(driver, '大数据')


driver.quit()  # end the WebDriver session and close the browser

 

Run results

Reflections

I used account.send_keys to fill in the form and simulate a user login; the scraping afterwards is similar to assignment ①.
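The absolute XPath to the login iframe and the fixed sleeps are the most fragile parts of the login flow. A sketch of a more robust entry into the iframe using an explicit wait (the id pattern of the NetEase login iframe is an assumption):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Wait until the login iframe is present, then switch into it in one step.
WebDriverWait(driver, 10).until(
    EC.frame_to_be_available_and_switch_to_it(
        (By.XPATH, "//iframe[starts-with(@id, 'x-URS-iframe')]")))  # assumed id pattern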

  • Assignment ③:
  • Requirements:
  • Get familiar with big-data-related cloud services and with using Xshell.
  • Complete the tasks in the document 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx, namely the tasks listed below; see the document for the detailed steps.
  • Real-time analytics development in practice:

Task 1: Generate test data with a Python script
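The manual's own script is not reproduced here; below is a minimal sketch of a generator of this kind, assuming it appends timestamped fake click records to a local file for Flume to tail (the path and record format are placeholders, not the manual's exact values):

import random
import time

def generate(path="/tmp/flume_test/access.log", n=100):
    # Append one fake log record per second; the target directory must exist.
    with open(path, "a") as f:
        for _ in range(n):
            record = "{},user{},{}\n".format(
                time.strftime("%Y-%m-%d %H:%M:%S"),
                random.randint(1, 1000),
                random.choice(["index", "detail", "cart", "pay"]))
            f.write(record)
            f.flush()
            time.sleep(1)

if __name__ == "__main__":
    generate()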

Task 2: Configure Kafka

Task 3: Install the Flume client

 

Task 4: Configure Flume to collect data
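The exact properties come from the lab manual; below is a minimal sketch of the kind of Flume agent configuration this task sets up, tailing the test-data file and forwarding records to a Kafka topic (the file path, topic name, and broker address are placeholder assumptions):

# Flume agent: exec source -> memory channel -> Kafka sink
client.sources = s1
client.channels = c1
client.sinks = k1

# Tail the file produced by the Task 1 generator.
client.sources.s1.type = exec
client.sources.s1.command = tail -F /tmp/flume_test/access.log
client.sources.s1.channels = c1

client.channels.c1.type = memory
client.channels.c1.capacity = 10000

# Forward each line as a Kafka message.
client.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
client.sinks.k1.kafka.topic = flume_test
client.sinks.k1.kafka.bootstrap.servers = <kafka-broker>:9092
client.sinks.k1.channel = c1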

 

Reflections

Just follow the manual step by step; this one is relatively simple.