第四次作业
一、作业内容
作业①:
- 要求:
- 熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
- 使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。
- 候选网站:东方财富网:http://quote.eastmoney.com/center/gridlist.html#hs_a_board
- 输出信息: MYSQL数据库存储和输出格式如下,表头应是英文命名例如:序号id,股票代码:bStockNo……,由同学们自行定义设计表头:
- Gitee文件夹链接
- 代码
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql


class BankDB:
    """MySQL helper that stores scraped stock rows in the `stocks` table."""

    def openDB(self):
        """Open the connection and ensure the `stocks` table exists and is empty."""
        self.con = pymysql.connect(host='localhost', user='root',
                                   password='LiamCapis1', db='bank')
        self.cursor = self.con.cursor()
        # IF NOT EXISTS replaces the old bare `except:` that conflated
        # "table already exists" with every other possible DB error.
        self.cursor.execute(
            'CREATE TABLE IF NOT EXISTS stocks ('
            'id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,'
            'name VARCHAR(255) NOT NULL,'
            'code VARCHAR(10) NOT NULL,'
            'latest_price VARCHAR(10) NOT NULL,'
            'price_change VARCHAR(10) NOT NULL,'
            'price_change_percent VARCHAR(10) NOT NULL,'
            'volume VARCHAR(10) NOT NULL,'
            'turnover VARCHAR(10) NOT NULL,'
            'amplitude VARCHAR(10) NOT NULL,'
            'highest VARCHAR(10) NOT NULL,'
            'lowest VARCHAR(10) NOT NULL,'
            'opening_price VARCHAR(10) NOT NULL,'
            'closing_price VARCHAR(10) NOT NULL)')
        # Start from a clean table so repeated runs do not accumulate duplicates.
        self.cursor.execute("delete from stocks")

    def closeDB(self):
        """Commit all pending inserts and close the connection."""
        self.con.commit()
        self.con.close()

    def insert(self, name, code, latest_price, price_change, price_change_percent,
               volume, turnover, amplitude, highest, lowest,
               opening_price, closing_price):
        """Insert one stock row; a failed row is reported but does not stop the crawl."""
        try:
            self.cursor.execute(
                "INSERT INTO stocks (name, code, latest_price, price_change, "
                "price_change_percent, volume, turnover, amplitude, highest, "
                "lowest, opening_price, closing_price) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);",
                (name, code, latest_price, price_change, str(price_change_percent),
                 str(volume), str(turnover), str(amplitude), highest, lowest,
                 opening_price, closing_price))
        except Exception as err:
            print(err)


bank_db = BankDB()
bank_db.openDB()

# Board name -> URL hash fragment on the eastmoney grid-list page.
boards = {
    "沪深京A": "#hs_a_board",
    "上证A": "#sh_a_board",
    "深证A": "#sz_a_board",
}

for board, fragment in boards.items():
    # A fresh browser per board: the three boards differ only in the URL hash,
    # and the original restarted Chrome between boards for the same reason —
    # but it also leaked an extra, never-used Chrome instance at the end.
    driver = webdriver.Chrome()
    try:
        driver.get(f"http://quote.eastmoney.com/center/gridlist.html{fragment}")
        # The table body is filled by Ajax; wait for at least one row instead
        # of reading the DOM immediately after navigation.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//tbody//tr")))
        for row in driver.find_elements(By.XPATH, "//tbody//tr"):
            daima = row.find_element(By.XPATH, './td[2]/a').text
            name = row.find_element(By.XPATH, './td[@class="mywidth"]/a').text
            newprice = row.find_element(By.XPATH, './td[5]/span').text
            diezhanfu = row.find_element(By.XPATH, './td[6]/span').text
            diezhane = row.find_element(By.XPATH, './td[7]/span').text
            cjl = row.find_element(By.XPATH, './td[8]').text
            cje = row.find_element(By.XPATH, './td[9]').text
            zf = row.find_element(By.XPATH, './td[10]').text
            zg = row.find_element(By.XPATH, './td[11]/span').text
            zd = row.find_element(By.XPATH, './td[12]/span').text
            jk = row.find_element(By.XPATH, './td[13]/span').text
            zs = row.find_element(By.XPATH, './td[14]').text
            bank_db.insert(name, daima, newprice, diezhane, diezhanfu,
                           cjl, cje, zf, zg, zd, jk, zs)
            print("{:<10} {:<10} {:<10} {:<10} {:<10} {:<10} {:<20} "
                  "{:<10} {:<10} {:<10} {:<10} {:<10}".format(
                      name, daima, newprice, diezhane, diezhanfu,
                      cjl, cje, zf, zg, zd, jk, zs))
    finally:
        # Exactly one quit() per driver; no stray extra Chrome instance.
        driver.quit()

bank_db.closeDB()
- 运行结果截图:
控制台输出:
mysql:
- 心得体会
selenium还是挺方便的,就是得保证chrome和driver的版本一致,之前配置好的上课时发现chrome自动更新了,导致运行半天没有任何结果,浪费了好多时间。
作业②:
- 要求:
- 熟练掌握 Selenium 查找HTML元素、实现用户模拟登录、爬取Ajax网页数据、等待HTML元素等内容。
- 使用Selenium框架+MySQL爬取中国mooc网课程资源信息(课程号、课程名称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介)
- 候选网站:中国mooc网:https://www.icourse163.org
- 输出信息:MYSQL数据库存储和输出格式
- Gitee文件夹链接
- 代码
from selenium import webdriver
from selenium.webdriver.common.by import By
import pymysql

driver = webdriver.Chrome()
# icourse163 search results for the keyword "大数据" (URL-encoded).
url = "https://www.icourse163.org/search.htm?search=%E5%A4%A7%E6%95%B0%E6%8D%AE#/"


def start_spider():
    """Scrape every course card on the search page and store it in MySQL.

    Opens ONE database connection for the whole run (the original reconnected,
    committed and closed once per course card) and creates the table with
    IF NOT EXISTS instead of the old `count == 1` guard, whose surrounding
    try-block also swallowed genuine insert errors.
    """
    driver.get(url)

    conn = pymysql.connect(host='localhost', user='root',
                           password='LiamCapis1', db='bank')
    cursor = conn.cursor()
    cursor.execute("""CREATE TABLE IF NOT EXISTS moocs (
        id INT NOT NULL AUTO_INCREMENT PRIMARY KEY,
        course VARCHAR(255) NOT NULL,
        college VARCHAR(255) NOT NULL,
        main_teacher VARCHAR(255) NOT NULL,
        team_member VARCHAR(255) NOT NULL,
        participants VARCHAR(255) NOT NULL,
        course_progress VARCHAR(255) NOT NULL,
        brief VARCHAR(255) NOT NULL
    )""")

    try:
        for link in driver.find_elements(
                By.XPATH,
                '//div[@class="u-clist f-bgw f-cb f-pr j-href ga-click"]'):
            # Course name
            course_name = link.find_element(
                By.XPATH, './/span[@class=" u-course-name f-thide"]').text
            print("course:", course_name)
            # School offering the course
            school_name = link.find_element(
                By.XPATH, './/a[@class="t21 f-fc9"]').text
            print("school:", school_name)
            # Lead teacher
            main_teacher = link.find_element(
                By.XPATH, './/a[@class="f-fc9"]').text
            print("main_teacher:", main_teacher)
            # Team members — some cards have no team span, hence the fallback.
            try:
                team_member = link.find_element(
                    By.XPATH, './/span[@class="f-fc9"]').text
            except Exception:
                team_member = 'none'
            # Strip the leading separator ("、" or space) before the name list.
            team_member = team_member.replace("、", "", 1).replace(" ", "", 1)
            print("team_teacher:", team_member)
            # Enrolment count; drop the trailing "参加" suffix.
            participants = link.find_element(
                By.XPATH, './/span[@class="hot"]').text
            participants = participants.replace("参加", "")
            print("participants:", participants)
            # Course progress/status
            course_progress = link.find_element(
                By.XPATH, './/span[@class="txt"]').text
            print('course_progress:', course_progress)
            # Short course description
            introduction = link.find_element(
                By.XPATH, './/span[@class="p5 brief f-ib f-f0 f-cb"]').text
            print(introduction)

            # Narrow try: only the insert itself is best-effort, so schema or
            # scraping problems are no longer silently hidden.
            try:
                cursor.execute(
                    "INSERT INTO moocs (`course`,`college`,`main_teacher`,"
                    "`team_member`,`participants`,`course_progress`,`brief`) "
                    "VALUES (%s,%s,%s,%s,%s,%s,%s)",
                    (course_name, school_name, main_teacher, team_member,
                     participants, course_progress, introduction))
            except Exception as err:
                print(err)
    finally:
        conn.commit()
        conn.close()


def main():
    start_spider()


if __name__ == '__main__':
    main()

driver.quit()
- 运行结果截图:
控制台输出:
mysql:
- 心得体会
selenium相对来说是比较轻松的,就是在获取数据的过程有些麻烦,教师团队就有两种情况需要用捕捉异常。
作业③:
- 要求:
- 掌握大数据相关服务,熟悉Xshell的使用
- 完成文档 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx 中的任务,即为下面5个任务,具体操作见文档。
- 环境搭建:
- 任务一:开通MapReduce服务
- 实时分析开发实战:
- 任务一:Python脚本生成测试数据
- 任务二:配置Kafka
- 任务三:安装Flume客户端
- 任务四:配置Flume采集数据
- 心得体会
学习了华为云平台的使用,并且发现原来在CloudShell里也可以打开两个终端页面,只需要找对ip地址和节点