2023 Data Collection and Fusion Technology Practice: Assignment 4

Posted: 2023-11-15 21:15:30  Author: alzz94

Experiment 4.1

  • Requirements:
    • Become proficient with Selenium: locating HTML elements, scraping Ajax-rendered pages, and waiting for
      HTML elements to load.
    • Use the Selenium framework plus MySQL database storage to crawl the stock data of the "Hushen A-share", "Shanghai A-share" and "Shenzhen A-share" boards (a hedged sketch of an explicit wait and of a MySQL insert follows the code below).
      Candidate site: Eastmoney http://quote.eastmoney.com/center/gridlist.html#hs_a_board
  • Code:
from selenium import webdriver
from selenium.webdriver.edge.options import Options
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import sqlite3  # note: the assignment asks for MySQL; this version stores the data in SQLite instead





def spider():
    # scrape every row of the stock table on the current board, page by page
    index = 1
    max_page = driver.find_element(By.CSS_SELECTOR, "#main-table_paginate > span.paginate_page > a:nth-child(5)").text
    max_page = int(max_page)
    max_page = 5  # cap the number of pages crawled for this demo
    print(max_page)
    for page in range(max_page):
        # WebDriverWait(driver, 10).until(expected_conditions.presence_of_element_located((By.CSS_SELECTOR, "#table_wrapper-table > tbody > tr")))
        trlist = driver.find_elements(By.CSS_SELECTOR, "#table_wrapper-table > tbody > tr")
        for tr in trlist:
            res1 = [str(index)]
            index += 1
            # columns 2-14 hold code, name, latest price, change %, change, volume, turnover,
            # amplitude, high, low, open and previous close
            for i in [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]:
                res1.append(tr.find_element(By.CSS_SELECTOR, "td:nth-child(" + str(i) + ")").text)
            print(res1)
            res.append(res1)
        if page <= max_page - 2:
            # click the "next page" button and give the Ajax table time to refresh
            next_button = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div[2]/div[5]/div/div[2]/div/a[2]')
            webdriver.ActionChains(driver).move_to_element(next_button).click(next_button).perform()
            time.sleep(1.5)


edge_options = Options()
driver = webdriver.Edge(options=edge_options)
res = []

# Shanghai A-share board
driver.get("http://quote.eastmoney.com/center/gridlist.html#sh_a_board")
try:
    spider()
except Exception as e:
    print("------------------------------------error---------------------------------------")
    print(e)

# switch to the Shenzhen A-share board in the same tab
driver.execute_script("window.open('http://quote.eastmoney.com/center/gridlist.html#sz_a_board','_self');")
time.sleep(1.5)
try:
    spider()
except Exception as e:
    print("------------------------------------error---------------------------------------")
    print(e)
# switch to the Beijing A-share board in the same tab
driver.execute_script("window.open('http://quote.eastmoney.com/center/gridlist.html#bj_a_board','_self');")
time.sleep(1.5)
try:
    spider()
except Exception as e:
    print("------------------------------------error---------------------------------------")
    print(e)

driver.quit()

# store the collected rows in a local SQLite database
db = sqlite3.connect('w0.db')
sql_text_1 = '''CREATE TABLE scores
                (序号 TEXT,
                    股票代码 TEXT,
                    股票名称 TEXT,
                    最新报价 TEXT,
                    涨跌幅 TEXT,
                    涨跌额 TEXT,
                    成交量 TEXT,
                    成交额 TEXT,
                    振幅 TEXT,
                    最高 TEXT,
                    最低 TEXT,
                    今开 TEXT,
                    昨收 TEXT);'''
db.execute(sql_text_1)
db.commit()

# insert each row with a parameterised statement (safer than building the SQL by string concatenation)
for item in res:
    print(item)
    placeholders = ",".join("?" * len(item))
    db.execute("INSERT INTO scores VALUES(" + placeholders + ")", item)
    db.commit()

db.close()
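The requirements above mention waiting for HTML elements and storing the data in MySQL, whereas the code relies on fixed time.sleep() calls and writes to SQLite. The following is a minimal sketch, not part of the original submission, of (a) how the commented-out WebDriverWait could replace the fixed sleep after turning a page, and (b) how the same rows in res could be written to MySQL with pymysql; the database name "stocks" and the connection settings are assumptions.

# Hedged sketch (a): wait for the stock table rows instead of sleeping a fixed 1.5 seconds.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

def wait_for_rows(driver, timeout=10):
    # blocks until at least one row of the quote table is present in the DOM
    WebDriverWait(driver, timeout).until(
        expected_conditions.presence_of_element_located(
            (By.CSS_SELECTOR, "#table_wrapper-table > tbody > tr")))

# Hedged sketch (b): store the same rows in MySQL; database name, user and password are assumptions.
import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", password="", database="stocks", charset="utf8mb4")
cur = conn.cursor()
cur.execute("""CREATE TABLE IF NOT EXISTS scores(
    序号 TEXT, 股票代码 TEXT, 股票名称 TEXT, 最新报价 TEXT, 涨跌幅 TEXT, 涨跌额 TEXT,
    成交量 TEXT, 成交额 TEXT, 振幅 TEXT, 最高 TEXT, 最低 TEXT, 今开 TEXT, 昨收 TEXT)""")
cur.executemany("INSERT INTO scores VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", res)
conn.commit()
conn.close()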
  • Results:
  • Reflections:
    Learned how to use the Selenium library to automate interactions with web pages.

Experiment 4.2

  • Requirements:
    • Become proficient with Selenium: locating HTML elements, simulating a user login, scraping Ajax-rendered
      pages, and waiting for HTML elements to load.
    • Use the Selenium framework plus MySQL to crawl course information from China MOOC (course id, course name, school, lead teacher, team members, enrolment, schedule, course description); a hedged sketch of replacing the fixed sleeps with explicit waits follows the code below.
      Candidate site: China MOOC: https://www.icourse163.org
  • Code:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from scrapy.selector import Selector
import time

from sqlalchemy import create_engine


browser = webdriver.Chrome()
url = 'https://www.icourse163.org/'
browser.get(url)
time.sleep(2)

# Log in: click the login entry, then switch into the login iframe
denglu = browser.find_element(By.XPATH, '//*[@id="app"]/div/div/div[1]/div[3]/div[3]/div')
denglu.click()
time.sleep(3)
browser.switch_to.default_content()
browser.switch_to.frame(browser.find_elements(By.TAG_NAME, 'iframe')[0])


phone = browser.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div/div[2]/div[2]/input')
phone.clear()
phone.send_keys("your_phone_number")  # account phone number (redacted; avoid hard-coding real credentials)
time.sleep(3)
password = browser.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div/div[4]/div[2]/input[2]')
password.clear()
password.send_keys("your_password")  # account password (redacted)
deng = browser.find_element(By.XPATH, '//*[@id="submitBtn"]')
deng.click()
time.sleep(5)
browser.switch_to.default_content()

# search for "python" courses
select_course = browser.find_element(By.XPATH, '/html/body/div[4]/div[1]/div/div/div/div/div[7]/div[1]/div/div/div[1]/div/div/div/div/div/div/input')
select_course.send_keys("python")
dianji = browser.find_element(By.XPATH, '//html/body/div[4]/div[1]/div/div/div/div/div[7]/div[1]/div/div/div[2]/span')
dianji.click()
time.sleep(3)

# grab the rendered page source, then close the browser
content = browser.page_source
print(content)
browser.quit()

# parse the course list out of the rendered HTML with scrapy's Selector
selector = Selector(text=content)
rows = selector.xpath("//div[@class='m-course-list']/div/div")

data = []
# iterate over each course card in the result list
for row in rows:
    lis = []
    course = row.xpath(".//span[@class=' u-course-name f-thide']//text()").extract()
    course_string = "".join(course)
    school = row.xpath(".//a[@class='t21 f-fc9']/text()").extract_first()
    teacher = row.xpath(".//a[@class='f-fc9']//text()").extract_first()
    team = row.xpath(".//a[@class='f-fc9']//text()").extract()
    team_string = ",".join(team)
    number = row.xpath(".//span[@class='hot']/text()").extract_first()
    progress = row.xpath(".//span[@class='txt']/text()").extract_first()  # course schedule; renamed so it does not shadow the time module
    jianjie = row.xpath(".//span[@class='p5 brief f-ib f-f0 f-cb']//text()").extract()
    jianjie_string = ",".join(jianjie)

    lis.append(course_string)
    lis.append(school)
    lis.append(teacher)
    lis.append(team_string)
    lis.append(number)
    lis.append(progress)
    lis.append(jianjie_string)

    data.append(lis)
df = pd.DataFrame(data=data, columns=['course', 'school', 'teacher', 'team', 'number', 'time', 'jianjie'])
print(df)
# write the result into the "mooc" table of a local MySQL database named "selenium"
engine = create_engine("mysql+mysqlconnector://root:@127.0.0.1:3306/selenium")
df.to_sql('mooc', engine, if_exists="replace")
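The fixed time.sleep() calls above can fail on a slow connection. Below is a minimal sketch, not part of the original submission and assuming the same page structure, of how explicit waits could handle the login iframe and the search results:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(browser, 10)

# wait until the login iframe exists, then switch into it in one step
wait.until(EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, 'iframe')))

# after submitting the search, wait until the course cards are actually rendered
wait.until(EC.presence_of_all_elements_located(
    (By.XPATH, "//div[@class='m-course-list']/div/div")))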
  • Results:
  • Reflections:
    Used the Selenium library again and deepened my understanding of it.

Experiment 4.3

  • Requirements:
    • Understand the related big-data services and become familiar with using Xshell.
    • Complete the tasks in the handbook 华为云_大数据实时分析处理实验手册-Flume 日志采集实验(部分)v2.docx,
      i.e. the five tasks below (see the handbook for the detailed steps; a hedged sketch of a test-data generator script is given at the end of this section).
      Environment setup:
      Task 1: Provision the MapReduce (MRS) service
      Real-time analysis development practice:
      Task 1: Generate test data with a Python script
      Task 2: Configure Kafka
      Task 3: Install the Flume client
      Task 4: Configure Flume to collect the data
  • Results:
    1. Environment setup, Task 1: Provision the MapReduce service:

    2. Real-time analysis, Task 1: Generate test data with a Python script:

    3. Real-time analysis, Task 2: Configure Kafka:

    4. Real-time analysis, Task 3: Install the Flume client:

    5. Real-time analysis, Task 4: Configure Flume to collect the data:
  • Reflections:
    This was my first hands-on development experience with Huawei Cloud.
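For context, the "generate test data with a Python script" task amounts to periodically appending records to a log file that the Flume agent tails and forwards to Kafka. The following is a minimal sketch of such a generator; the output path, field layout and emit rate are assumptions, not the handbook's exact script:

import random
import time
from datetime import datetime

# hypothetical output file that a Flume taildir/exec source would watch
LOG_PATH = "/tmp/flume_test.log"

def make_record():
    # one fake "user event" per line: timestamp, user id, action, amount
    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    user_id = random.randint(10000, 99999)
    action = random.choice(["browse", "click", "order", "pay"])
    amount = round(random.uniform(1, 500), 2)
    return f"{ts},{user_id},{action},{amount}"

if __name__ == "__main__":
    while True:
        with open(LOG_PATH, "a", encoding="utf-8") as f:
            f.write(make_record() + "\n")
        time.sleep(1)  # emit roughly one record per second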