2023数据采集与融合技术实践作业四

发布时间 2023-11-06 23:50:25作者: 白炎273592

实验内容

• 作业①:

要求:

▪ 熟练掌握 Selenium 查找 HTML 元素、爬取 Ajax 网页数据、等待 HTML 元素等内
容。
▪ 使用 Selenium 框架+ MySQL 数据库存储技术路线爬取“沪深 A 股”、“上证 A 股”、
“深证 A 股”3 个板块的股票数据信息。
o 候选网站:东方财富网:
http://quote.eastmoney.com/center/gridlist.html#hs_a_board
o 输出信息:MYSQL 数据库存储和输出格式如下,表头应是英文命名例如:序号
id,股票代码:bStockNo……,由同学们自行定义设计表头:
• Gitee 文件夹链接

序号 股票代码 股票名称 最新报价 涨跌幅 涨跌额 成交量 成交额 振幅 最高 最低 今开 昨收
1 688093 N世华 28.47 62.22% 10.92 26.13万 7.6亿 22.34 32.0 28.08 30.2 17.55
2 .....

代码:

import re
import sqlite3
import requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import pymysql
import json
from bs4 import UnicodeDammit
import urllib.request

from selenium.webdriver.common.by import By


# Stock database: thin SQLite persistence layer for scraped A-share rows.
class stockDB:
    """Stores scraped stock rows in a local SQLite file.

    Every column is kept as text because the scraper hands over the raw
    cell strings from the page (e.g. "7.6亿") without numeric conversion.
    """

    def openDB(self, db_path="stocks.db"):
        """Open (or create) the database and ensure the ``stocks`` table exists.

        If the table already exists, its rows are cleared so a re-run starts
        from an empty table.  ``db_path`` defaults to the original hard-coded
        file name, so existing callers are unaffected.
        """
        self.con = sqlite3.connect(db_path)
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("create table stocks (Num varchar(16),"
                                " Code varchar(16),names varchar(16),"
                                "Price varchar(16),"
                                "Quote_change varchar(16),"
                                "Updownnumber varchar(16),"
                                "Volume varchar(16),"
                                "Turnover varchar(16),"
                                "Swing varchar(16),"
                                "Highest varchar(16),"
                                "Lowest varchar(16),"
                                "Today varchar(16),"
                                "Yesday varchar(16))")
        except sqlite3.OperationalError:
            # CREATE TABLE failed because the table already exists:
            # empty it instead of recreating.  (Was a bare ``except:``,
            # which also hid genuine SQL/connection errors.)
            self.cursor.execute("delete from stocks")

    def closeDB(self):
        """Commit pending writes and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, Num, Code, names, Price, Quote_change, Updownnumber,
               Volume, Turnover, Swing, Highest, Lowest, Today, Yesday):
        """Insert one stock row; failures are printed, not raised."""
        try:
            self.cursor.execute("insert into stocks(Num,Code,names,Price,Quote_change,Updownnumber,Volume,Turnover,Swing,Highest,Lowest,Today,Yesday)"
                                " values (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                                (Num, Code, names, Price, Quote_change, Updownnumber,
                                 Volume, Turnover, Swing, Highest, Lowest, Today, Yesday))
        except Exception as err:
            print(err)

    def show(self):
        """Print every stored row in tab-separated columns."""
        self.cursor.execute("select * from stocks")
        rows = self.cursor.fetchall()
        for row in rows:
            print("{:4}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:16}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}".format(
                row[0], row[1], row[2], row[3], row[4], row[5], row[6],
                row[7], row[8], row[9], row[10], row[11], row[12]))


def get_stock(url):
    """Scrape one East Money board page and persist every stock row.

    Starts a Chrome driver, reads all ``<tr>`` rows of the quote table,
    stores the selected 13 columns through the module-level ``insertDB``
    (a ``stockDB`` created in ``main``), and returns the list of 13-tuples.

    Fixes over the original:
    - the WebDriver is now always released via ``finally`` (it was never
      quit, leaking one browser process per call);
    - the database is opened ONCE per batch.  ``openDB`` deletes all rows
      when the table already exists, so the original open-per-row loop
      wiped every previously inserted row and kept only the last one.
    """
    service = Service(
        executable_path="C:/Users/白炎/AppData/Local/Programs/Python/Python310/Scripts/chromedriver.exe")
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(url)
        trs = driver.find_elements(By.XPATH, '//tbody/tr')
        stockInfo = []
        for tr in trs:
            cells = [td.text for td in tr.find_elements(By.XPATH, './td')]
            # Cell index 3 is intentionally skipped, matching the original
            # column selection (presumably a non-data cell — confirm on page).
            stockInfo.append((cells[0], cells[1], cells[2], cells[4], cells[5],
                              cells[6], cells[7], cells[8], cells[9], cells[10],
                              cells[11], cells[12], cells[13]))
    finally:
        # Always release the browser, even if the page layout changed.
        driver.quit()

    insertDB.openDB()
    for st in stockInfo:
        insertDB.insert(*st)
    insertDB.show()
    insertDB.closeDB()
    return stockInfo


def main():
    """Crawl three A-share boards, store rows in SQLite and export CSVs.

    Creates the module-level ``insertDB`` used by ``get_stock`` and writes
    one CSV file per board.
    """
    global insertDB
    insertDB = stockDB()
    stockInfo = []
    print("开爬")
    # Header row matching the 13 columns persisted per stock.
    print("{:4}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:16}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}".format(
        '序号', '代码', '名称', '最新价', '涨跌幅(%)', '跌涨额(¥)',
        '成交量(手)', '成交额(¥)', '振幅(%)', '最高', '最低', '今开', '昨收'))

    # BUG FIX: the first anchor was "#hz_a_board", which is not a valid
    # board; the 沪深 A 股 board anchor given in the assignment (and matching
    # the CSV name below) is "#hs_a_board".
    urls = [
        "http://quote.eastmoney.com/center/gridlist.html#hs_a_board",
        "http://quote.eastmoney.com/center/gridlist.html#sh_a_board",
        "http://quote.eastmoney.com/center/gridlist.html#sz_a_board",
    ]
    # CSV file per board, keyed by 1-based position in ``urls``
    # (replaces the original if/elif chain).
    csv_names = {1: '沪深 A 股.csv', 2: '上证 A 股.csv', 3: '深证 A 股.csv'}
    count = 0
    for url in urls:
        stockInfo = get_stock(url)
        count += 1
        print("第{}个".format(count))

        # Export this board's rows with Chinese column headers.
        df = pd.DataFrame(stockInfo)
        columns = {0: '序号', 1: '代码', 2: '名称', 3: '最新价', 4: '涨跌幅(%)',
                   5: '跌涨额(¥)', 6: '成交量(手)', 7: '成交额(¥)', 8: '振幅(%)',
                   9: '最高', 10: '最低', 11: '今开', 12: '昨收'}
        df.rename(columns=columns, inplace=True)
        df.to_csv(csv_names[count], encoding='utf-8-sig', index=False)

    # Prints the rows of the LAST board only (original behavior).
    print(stockInfo)


if __name__ == '__main__':
    main()



结果






• 作业②:

o 要求:

▪ 熟练掌握 Selenium 查找 HTML 元素、实现用户模拟登录、爬取 Ajax 网页数据、
等待 HTML 元素等内容。
▪ 使用 Selenium 框架+MySQL 爬取中国 mooc 网课程资源信息(课程号、课程名
称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介)
o 候选网站:中国 mooc 网:https://www.icourse163.org
o 输出信息:MYSQL 数据库存储和输出格式
• Gitee 文件夹链接
o
o

Id cCourse cCollege cTeacher cTeam cCount cProcess cBrief
1 Python数据分析与展示 北京理工大学 嵩天 嵩天 470 2020年11月17日~2020年12月29日 “我们正步入一个数据或许比软件更重要的新时代。——TimO'Reilly” ……
2......

代码

import sqlite3
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
# 导入鼠标操作的包
from selenium.webdriver.common.action_chains import ActionChains
# 导入显示元素等待的包
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time



class stockDB:
    """SQLite persistence layer for scraped MOOC course rows.

    All columns are stored as short text, matching the raw strings taken
    from the course-list page.
    """

    def openDB(self, db_path="mooc.db"):
        """Open (or create) the database and ensure the ``mooc`` table exists.

        If the table already exists its rows are cleared, so every call
        starts from an empty table.  ``db_path`` defaults to the original
        hard-coded file name, keeping existing callers unchanged.
        """
        self.con = sqlite3.connect(db_path)
        self.cursor = self.con.cursor()
        try:
            # Columns: Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief
            self.cursor.execute("create table mooc ("
                                "Id  CHAR(20),"
                                "cCourse  CHAR(20),"
                                "cCollege  CHAR(20),"
                                "cTeacher  CHAR(20),"
                                "cTeam  CHAR(20),"
                                "cCount  CHAR(20),"
                                "cProcess  CHAR(20),"
                                "cBrief  CHAR(20))")
        except sqlite3.OperationalError:
            # CREATE TABLE failed because the table already exists:
            # empty it instead of recreating.  (Was a bare ``except:``,
            # which also hid genuine SQL/connection errors.)
            self.cursor.execute("delete from mooc")

    def closeDB(self):
        """Commit pending writes and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief):
        """Insert one course row; failures are printed, not raised."""
        try:
            self.cursor.execute(
                'insert into mooc( Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)values (?,?,?,?,?,?,?,?)',
                (Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief))
        except Exception as err:
            print(err)

    def show(self):
        """Print every stored course row in tab-separated columns."""
        self.cursor.execute("select * from mooc")
        rows = self.cursor.fetchall()
        for row in rows:
            print("{:4}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}\t{:8}".format(
                row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7]))

# --- MOOC scraping script (flat, runs at import time) ---
# SECURITY NOTE(review): the login phone number and password below are
# hard-coded in source; move them to environment variables or a config file.
global insertDB  # no-op at module level; kept as written
Id=0
insertDB = stockDB()
# NOTE(review): EdgeOptions instance is passed to a *Chrome* driver —
# confirm which browser is actually intended.
edge_options = webdriver.EdgeOptions()
browser = webdriver.Chrome(options=edge_options)
url = 'https://www.icourse163.org/'
browser.get(url)  # load the MOOC home page
time.sleep(1)  # wait for the page to load
    # locate the login button
loginbutton = browser.find_element(By.XPATH, '//div[@class="_1Y4Ni"]/div')
    # click it to open the login dialog
loginbutton.click()
time.sleep(3)
    # switch into the login iframe (absolute XPath — fragile if the page changes)
frame = browser.find_element(By.XPATH,'/html/body/div[13]/div[2]/div/div/div/div/div/div[1]/div/div[1]/div[2]/div[2]/div[1]/div/iframe')
browser.switch_to.frame(frame)
    # type the account number (hard-coded credential — see security note above)
account = browser.find_element(By.ID, 'phoneipt').send_keys('18050108037')
    # type the password (hard-coded credential — see security note above)
password = browser.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div/div[4]/div[2]/input[2]').send_keys("CXYfaq333")
    # locate the login submit button
logbutton = browser.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div/div[6]/a')
    # click it to log in
logbutton.click()

    # wait for the post-login page to finish loading
time.sleep(20)
    # leave the login iframe
browser.switch_to.default_content()

    # explicit wait until the search box is visible
wait = WebDriverWait(browser, 10)
element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".inputtxt")))

    # find the search box and type the query keyword ("计算机" = computer science)
phone_input = browser.find_element(By.CSS_SELECTOR, ".inputtxt")
phone_input.send_keys("计算机")
# click the search button
searchbutton = browser.find_element(By.CSS_SELECTOR, ".j-searchBtn")
searchbutton.click()

time.sleep(2)

# accumulated [Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief] rows
sts = []
# Scrape the first 5 result pages.
for i in range(5):
    time.sleep(10)
    # explicit wait for the result page to load
    wait = WebDriverWait(browser, 10)  # up to 10 seconds
    element = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".th-bk-main-gh")))  # wait until the element is visible

    # one <div> per course card in the result list
    div_elements = browser.find_elements(By.XPATH, '//div[@class="m-course-list"]/div/div')
    for div_element in div_elements:
        eles = div_element.text
        # card text layout (by observation): line 0 = course name,
        # line 1 = "college teacher ...", line 2 = brief, last line = enrollment/schedule
        ele = eles.split('\n')
        st = ele[1].split(" ")
        cCourse = ele[0]
        cCollege = st[0]
        cTeacher = "".join(st[1:])
        # NOTE(review): despite the original comment ("merge st[2:] into one
        # string"), cTeam is only st[1] — the rest of the team is dropped.
        if len(st) >= 2:
            cTeam = st[1]
        else:
            continue
        ct = ele[-1]
        # print(ct)
        # split "N人参加..." into participant count and schedule text
        ct1 = ct.split("人参加")
        # print(ct1)
        cCount = ct1[0]
        cProcess = ct1[1]
        cBrief = str(ele[2])
        # print(Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
        st = [Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief]
        sts.append(st)
        # print(ele)
        # print(course)   # course
        # print(school)   # college
        # print(team)     # team
        # print(teacher)  # teacher
        # print(ct1[0])   # participant count
        # print(ct1[1])   # schedule
        # print(briefs)   # brief
        # print(count,"-------------------")
        Id += 1
        # NOTE(review): openDB deletes all existing rows when the table
        # already exists, so calling it once per course leaves only the
        # most recent row in mooc.db — likely unintended.
        insertDB.openDB()
        insertDB.insert(Id, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
        insertDB.show()  # display database contents
        insertDB.closeDB()
        # print(st)
    time.sleep(10)
    # NOTE(review): this reuses the ".th-bk-main-gh" selector waited on
    # above as the "next page" button — verify it is actually the pager.
    nextbutton = browser.find_element(By.CSS_SELECTOR, ".th-bk-main-gh")
    nextbutton.click()
        # dump everything collected so far to CSV (rewritten each page)
    df = pd.DataFrame(sts)
    columns = { 0: 'Id',1: 'cCourse',2: 'cCollege',3: 'cTeacher',4: 'cTeam',5: 'cCount',6: 'cProcess',7: 'cBrief'}
    df.rename(columns=columns, inplace=True)
    df.to_csv('mooc.csv', encoding='utf-8-sig', index=False)  # save as CSV (viewable without a .db tool)

print('over')
browser.quit()

结果




• 作业③:

o 要求:

• 掌握大数据相关服务,熟悉 Xshell 的使用
• 完成文档 华为云_大数据实时分析处理实验手册-Flume 日志采集实验(部
分)v2.docx 中的任务,即为下面 5 个任务,具体操作见文档。
• 环境搭建:
·任务一:开通 MapReduce 服务

• 实时分析开发实战:
·任务一:Python 脚本生成测试数据

·任务二:配置 Kafka

·任务三: 安装 Flume 客户端



·任务四:配置 Flume 采集数据