2023数据采集与融合第四次作业

发布时间 2023-11-15 22:37:57作者: nunusu

作业①:

实验内容

题目:

  • 要求:熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
    使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。
  • 候选网站:东方财富网:http://quote.eastmoney.com/center/gridlist.html#hs_a_board
  • 输出信息:MYSQL数据库存储和输出格式如下,表头应是英文命名例如:序号id,股票代码:bStockNo……,由同学们自行定义设计表头。
code
from selenium import webdriver
import pymysql
import time


class MySpider:
    """Crawl A-share stock listings from eastmoney.com with Selenium and store
    each board ("hs", "sh", "sz") in its own MySQL table named ``{gp}stocks``."""

    def startUp(self, gp):
        """Open the browser, connect to MySQL and (re)create the board's table.

        gp: board key ("hs", "sh" or "sz"); also used as the table-name prefix.
        """
        self.driver = webdriver.Chrome()
        # Remember the board key so processSpider no longer depends on the
        # module-level global ``gp`` (latent bug in the original).
        self.gp = gp
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="123456", database="stocks", charset="utf8")
            self.cursor = self.con.cursor()
            try:
                self.cursor.execute(f"drop table if exists {gp}stocks")
                # Identifiers are backtick-quoted because ``open`` is a MySQL
                # reserved word and the unquoted statement fails on the server.
                self.cursor.execute(
                    f"create table {gp}stocks(`id` int,`code` varchar(10),`name` varchar(20),"
                    "`lastTrade` varchar(10),`chg` varchar(10),`change_` varchar(10),"
                    "`volume` varchar(10),`amp` varchar(10),`highest` varchar(10),"
                    "`lowest` varchar(10),`open` varchar(10),`prevClose` varchar(10),"
                    "primary key (`id`))")
            except Exception as err:
                # Best-effort, but report the failure instead of the original
                # bare ``except: pass`` that silently hid SQL errors.
                print(err)
        except Exception as err:
            print(err)

    def closeUp(self):
        """Commit pending inserts, then close the DB connection and the browser."""
        try:
            self.con.commit()
            self.con.close()
            self.driver.close()
        except Exception as err:
            print(err)

    def insertDB(self, gp, id, code, name, lastTrade, chg, change_, volume, amp, highest, lowest, open, prevClose):
        """Insert one stock row into ``{gp}stocks`` via a parameterized query."""
        try:
            # Column names are backtick-quoted (``open`` is a MySQL reserved word).
            self.cursor.execute(
                f"insert into {gp}stocks (`id`, `code`, `name`, `lastTrade`, `chg`, `change_`, "
                "`volume`, `amp`, `highest`, `lowest`, `open`, `prevClose`) "
                "values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (id, code, name, lastTrade, chg, change_, volume, amp, highest, lowest, open, prevClose))
        except Exception as err:
            print(err)

    def processSpider(self):
        """Scrape every row of the current listing page, then recurse onto the
        next page (stops once the pager's ``data-index`` reaches 3)."""
        try:
            trs = self.driver.find_elements_by_xpath("//tbody/tr")
            for tr in trs:
                id = tr.find_element_by_xpath(".//td[1]").text
                code = tr.find_element_by_xpath(".//td[2]").text
                name = tr.find_element_by_xpath(".//td[3]").text
                lastTrade = tr.find_element_by_xpath(".//td[5]").text  # latest price
                chg = tr.find_element_by_xpath(".//td[6]").text  # change percent
                change_ = tr.find_element_by_xpath(".//td[7]").text  # change amount
                volume = tr.find_element_by_xpath(".//td[8]").text  # trading volume
                amp = tr.find_element_by_xpath(".//td[10]").text  # amplitude
                highest = tr.find_element_by_xpath(".//td[11]").text  # day high
                lowest = tr.find_element_by_xpath(".//td[12]").text  # day low
                open = tr.find_element_by_xpath(".//td[13]").text  # opening price
                prevClose = tr.find_element_by_xpath(".//td[14]").text  # previous close
                # Use the board key stored by startUp rather than the module-level
                # global ``gp`` the original happened to rely on.
                self.insertDB(self.gp, id, code, name, lastTrade, chg, change_,
                              volume, amp, highest, lowest, open, prevClose)
            try:
                link = self.driver.find_elements_by_xpath("//a[@class='next paginate_button']")[0]
                if int(link.get_attribute("data-index")) >= 3:  # crawl only the first pages
                    return
                link.click()
                time.sleep(1)  # give the Ajax-rendered table a moment to load
                self.processSpider()
            except Exception as err:
                print(err)
        except Exception as err:
            print(err)

    def executeSpider(self, gp):
        """Run the whole pipeline for one board: setup -> crawl -> teardown."""
        self.startUp(gp)
        self.driver.get(f"http://quote.eastmoney.com/center/gridlist.html#{gp}_a_board")
        time.sleep(1)  # wait for the Ajax page to render
        self.processSpider()
        self.closeUp()


# Crawl the three boards in turn: "hs" (SH-SZ A), "sh" (Shanghai A), "sz" (Shenzhen A).
spider = MySpider()
gps = ["hs", "sh", "sz"]
# NOTE(review): MySpider.processSpider reads the module-level global ``gp`` when
# calling insertDB, so this loop variable must keep the name ``gp``.
for gp in gps:
    spider.executeSpider(gp)


运行结果:


心得体会

回顾了一次 Selenium 的相关操作。

作业②:

实验内容

题目

  • 要求:熟练掌握 Selenium 查找HTML元素、实现用户模拟登录、爬取Ajax网页数据、等待HTML元素等内容。
    使用Selenium框架+MySQL爬取中国mooc网课程资源信息(课程号、课程名称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介)
  • 候选网站:中国mooc网:https://www.icourse163.org
  • 输出信息:MYSQL数据库存储和输出格式
    Gitee文件夹链接
code
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pymysql
import time
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

class MySpider:
    """Log in to icourse163.org with Selenium and crawl course information
    (name, college, teacher, team, enrolment, progress, brief) into MySQL."""

    def __init__(self):
        """Start Chrome, connect to MySQL and (re)create the CourseInfo table."""
        try:
            option = Options()
            # Hide the "controlled by automated software" banner.
            option.add_experimental_option('excludeSwitches', ['enable-automation'])
            self.driver = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
            self.driver.maximize_window()  # maximize so the layout is predictable
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", database="mooc",
                                       charset="utf8")
            self.cursor = self.con.cursor()
            self.cursor.execute("drop table if exists CourseInfo")
            sql = '''create table CourseInfo(cCourse varchar(20),cCollege varchar(20),cTeacher varchar(20),
                cTeam varchar(40),cCount varchar(10),cProcess varchar(20),cBrief varchar(2048))'''
            self.cursor.execute(sql)
        except Exception as e:
            print(e)

    def close(self):
        """Commit pending inserts, then close the DB connection and the browser."""
        try:
            self.con.commit()
            self.con.close()
            self.driver.quit()
        except Exception as err:
            print(err)

    def insert_db(self, cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief):
        """Insert one course row into CourseInfo via a parameterized query."""
        try:
            self.cursor.execute(
                "insert into CourseInfo (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief) values(%s, %s, %s, %s, %s, %s, %s)",
                (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief))
        except Exception as err:
            print(err)

    def login(self):
        """Perform the phone+password login flow, then open the course-list page.

        Credentials come from the MOOC_PHONE / MOOC_PASSWORD environment
        variables, falling back to the original hard-coded values so existing
        behavior is unchanged.  NOTE(review): committing real credentials to
        source is unsafe - prefer setting the environment variables.
        """
        import os  # local import: only needed for the credential lookup
        phone = os.environ.get("MOOC_PHONE", "15105909274")
        password = os.environ.get("MOOC_PASSWORD", "xcl2357.")
        WebDriverWait(self.driver, 10, 0.48).until(
            EC.presence_of_element_located((By.XPATH, '//a[@class="f-f0 navLoginBtn"]'))).click()
        self.driver.switch_to.default_content()
        # The login form lives inside an iframe; switch into it first.
        iframe = WebDriverWait(self.driver, 10, 0.48).until(
            EC.presence_of_element_located((By.XPATH, '//*[@frameborder="0"]')))
        self.driver.switch_to.frame(iframe)
        # Type the account and password, then submit.
        self.driver.find_element(By.XPATH, '//*[@id="phoneipt"]').send_keys(phone)
        time.sleep(2)
        self.driver.find_element(By.XPATH, '//*[@class="j-inputtext dlemail"]').send_keys(password)
        time.sleep(2)
        self.driver.find_element(By.ID, 'submitBtn').click()
        self.driver.switch_to.default_content()

        # Follow the first channel link to reach the course-list page.
        self.driver.get(WebDriverWait(self.driver, 10, 0.48).until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="app"]/div/div/div[1]/div[1]/div[1]/span[1]/a'))).get_attribute('href'))

    def process_spider(self):
        """Scrape every course card on the current page (opening each course in
        a new tab to read its brief and teaching team), then recurse onto the
        next page until page 3."""
        time.sleep(5)  # wait for the Ajax-rendered course list to load
        print("begin")
        try:
            courses = self.driver.find_elements(By.XPATH, '//*[@id="channel-course-list"]/div/div/div[2]/div[1]/div')
            current_window_handle = self.driver.current_window_handle
            for course in courses:
                cCourse = course.find_element(By.XPATH, ".//h3").text
                cCollege = course.find_element(By.XPATH, ".//p[@class='_2lZi3']").text
                cTeacher = course.find_element(By.XPATH, ".//div[@class='_1Zkj9']").text
                cCount = course.find_element(By.XPATH, ".//div[@class='jvxcQ']/span").text
                cProcess = course.find_element(By.XPATH, ".//div[@class='jvxcQ']/div").text
                # Clicking the card opens the course detail in a new tab.
                course.click()
                Handles = self.driver.window_handles
                self.driver.switch_to.window(Handles[1])
                cBrief = self.driver.find_element(By.XPATH, '//*[@id="j-rectxt2"]').text.strip()
                cTeam = ""
                # Page through the teacher carousel, collecting every name.
                while True:
                    Teachers = self.driver.find_elements(By.XPATH, "//div[@class='um-list-slider_con_item']/div/div/h3")

                    for Teacher in Teachers:
                        cTeam += Teacher.text + "."
                    try:
                        self.driver.find_element(By.XPATH, "//div[@class='um-list-slider_next f-pa']").click()
                    except Exception:
                        # No "next" arrow left: all team members collected.
                        break
                print(cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
                self.driver.close()  # close the detail tab
                self.driver.switch_to.window(current_window_handle)  # back to the list
                self.insert_db(cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief)
            try:  # pagination
                link = self.driver.find_element(By.XPATH, '//*[@id="channel-course-list"]/div/div/div[2]/div[2]/div')
                if int(link.find_element(By.XPATH, ".//a[@class='_20fst _6jBq']").text) <= 3:
                    try:
                        link.find_element(By.XPATH, ".//a[@class='_3YiUU '][2]").click()
                    except Exception:
                        # Only one pager arrow exists on the first page.
                        link.find_element(By.XPATH, ".//a[@class='_3YiUU '][1]").click()
                    js = 'window.scrollTo(0,document.body.scrollTop=0)'
                    self.driver.execute_script(js)
                    self.process_spider()
            except Exception as err:
                print(err)

        except Exception as err:
            print(err)

    def execute_spider(self, url):
        """Open the site, log in and crawl the course listings."""
        self.driver.get(url)
        self.login()
        self.process_spider()


spider = MySpider()
try:
    # Crawl the Chinese MOOC site (icourse163.org).
    spider.execute_spider("https://www.icourse163.org")
finally:
    # Always release the browser and DB connection, even if the crawl raises,
    # so a failure does not leak a Chrome process or an open MySQL connection.
    spider.close()

运行结果:

心得体会

对 Selenium 的操作有了更多的认识。

作业③

实验内容

题目:

  • 要求:掌握大数据相关服务,熟悉Xshell的使用
    完成文档 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx 中的任务,即为下面5个任务,具体操作见文档。

  • 输出:实验关键步骤或结果截图。

实验过程:

环境搭建
任务一:开通MapReduce服务
这是创建使用后删除的MRS集群,删除前没截图。

实时分析开发实战:
任务一:Python脚本生成测试数据
xshell连接服务器

xftp传输文件

运行python脚本

输出结果

任务二:配置Kafka

任务三: 安装Flume客户端


任务四:配置Flume采集数据

心得体会

学会了使用xshell连接到服务器并进行操作,并学会了大数据服务的相关配置。

码云链接

wait...