数据采集与融合技术实践作业四

发布时间 2023-11-15 20:54:50作者: Zioywishing

数据采集与融合技术实践作业四

gitee仓库链接:gitee仓库链接
102102141 周嘉辉

作业①

  • 熟练掌握 Selenium 查找HTML元素、爬取Ajax网页数据、等待HTML元素等内容。
    使用Selenium框架+ MySQL数据库存储技术路线爬取“沪深A股”、“上证A股”、“深证A股”3个板块的股票数据信息。

代码:

from selenium import webdriver
from selenium.webdriver.edge.options import Options
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import sqlite3





def spider():
    index = 1
    max_page = driver.find_element(By.CSS_SELECTOR,"#main-table_paginate > span.paginate_page > a:nth-child(5)").text
    max_page = eval(max_page)
    max_page = 5
    print(max_page)
    for page in range(max_page):
        # WebDriverWait(driver, 10).until(expected_conditions.presence_of_element_located((By.CSS_SELECTOR,"#table_wrapper-table > tbody > tr")))
        trlist = driver.find_elements(By.CSS_SELECTOR,"#table_wrapper-table > tbody > tr")
        for tr in trlist:
            res1 = [str(index)]
            index += 1
            for i in [2,3,5,6,7,8,9,10,11,12,13,14]:
                res1.append(tr.find_element(By.CSS_SELECTOR,"td:nth-child(" + str(i)+ ")").text)
            print(res1)
            res.append(res1)
        if page <= max_page - 2 :
            next_button = driver.find_element(By.XPATH,'/html/body/div[1]/div[2]/div[2]/div[5]/div/div[2]/div/a[2]')
            webdriver.ActionChains(driver).move_to_element(next_button ).click(next_button ).perform()
            time.sleep(1.5)


edge_options = Options()
driver = webdriver.Edge(options=edge_options)
res = []

driver.get("http://quote.eastmoney.com/center/gridlist.html#sh_a_board")
try:
    spider()
except Exception as e:
    print("------------------------------------error---------------------------------------")
    print(e)

driver.execute_script("window.open('http://quote.eastmoney.com/center/gridlist.html#sz_a_board','_self');")
time.sleep(1.5)
try:
    spider()
except Exception as e:
    print("------------------------------------error---------------------------------------")
    print(e)
# driver.quit()

driver.execute_script("window.open('http://quote.eastmoney.com/center/gridlist.html#bj_a_board','_self');")
time.sleep(1.5)
try:
    spider()
except Exception as e:
    print("------------------------------------error---------------------------------------")
    print(e)

db = sqlite3.connect('w0.db')
sql_text = '''CREATE TABLE scores
                (序号 TEXT,
                    股票代码 TEXT,
                    股票名称 TEXT,
                    最新报价 TEXT,
                    涨跌幅 TEXT,
                    涨跌额 TEXT,
                    成交量 TEXT,
                    成交额 TEXT,
                    振幅 TEXT,
                    最高 TEXT,
                    最低 TEXT,
                    今开 TEXT,
                    昨收 TEXT);'''
db.execute(sql_text)
db.commit()

for item in res:
    sql_text = "INSERT INTO scores VALUES('"+item[0] + "'"
    
    for i in range(len(item) - 1):
        sql_text  = sql_text + ",'" + item[i+1] + "'"

    sql_text = sql_text +  ")"
    print(sql_text)
    db.execute(sql_text)
    db.commit()
    
db.close()

结果:
image

心得体会:没第二题难。

gitee仓库链接:gitee仓库链接

作业②

  • 熟练掌握 Selenium 查找HTML元素、实现用户模拟登录、爬取Ajax网页数据、等待HTML元素等内容。
  • 使用Selenium框架+MySQL爬取中国mooc网课程资源信息(课程号、课程名称、学校名称、主讲教师、团队成员、参加人数、课程进度、课程简介)
from selenium.webdriver.edge.options import Options
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import sqlite3



def login():
    next_button = driver.find_element(By.XPATH,'//*[@id="j-topnav"]/div')
    webdriver.ActionChains(driver).move_to_element(next_button ).click(next_button ).perform()
    next_button = driver.find_element(By.XPATH,'/html/body/div[12]/div[2]/div/div/div/div/div/div[1]/div/div[2]/div[1]/a[1]')
    webdriver.ActionChains(driver).move_to_element(next_button ).click(next_button ).perform()
    time.sleep(2)
    driver.switch_to.frame('ptlogin_iframe')
    next_button = driver.find_element(By.CSS_SELECTOR,'#img_out_539943419')
    webdriver.ActionChains(driver).move_to_element(next_button ).click(next_button ).perform()
    time.sleep(15)

    

def spider():
    # PPT也就爬了一个。。。
    driver.execute_script("window.open('https://www.icourse163.org/course/ZJU-199001','_self');")
    time.sleep(5)
    name = driver.find_element(By.XPATH,'/html/body/div[4]/div[2]/div[1]/div/div[3]/div/div[1]/div[1]/span[1]').text
    teacher = driver.find_element(By.XPATH,'/html/body/div[4]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/div/div[2]/div/div/div/div/div/h3').text
    school = driver.find_element(By.XPATH,"/html/body/div[4]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div/a/img").alt
    count = driver.find_element(By.XPATH,'/html/body/div[4]/div[2]/div[1]/div/div[3]/div/div[2]/div/div[1]/div[4]/span[2]').text
    time1 = driver.find_element(By.XPATH,'//*[@id="course-enroll-info"]/div/div[1]/div[2]/div/span[2]').text
    brief = driver.find_element(By.XPATH,'//*[@id="j-rectxt2"]').text
    print(name,teacher,school,count,time1,brief)
    item = [str(1),name,school,teacher,teacher,count,time1,brief]
    sql_text = "INSERT INTO mooc VALUES('"+item[0] + "'"
    for i in range(len(item) - 1):
        sql_text  = sql_text + ",'" + item[i+1] + "'"

    sql_text = sql_text +  ")"
    print(sql_text)
    db.execute(sql_text)
    db.commit()
    
db = sqlite3.connect('w1.db')
sql_text = '''CREATE TABLE mooc
                (id TEXT,
                    cCourse TEXT,
                    cCollege TEXT,
                    cTeacher TEXT,
                    cTeam TEXT,
                    cCount TEXT,
                    cProcess TEXT,
                    cBrief TEXT);'''
db.execute(sql_text)
db.commit()

edge_options = Options()
driver = webdriver.Edge(options=edge_options)
res = []

driver.get("https://www.icourse163.org/home.htm?userId=1482874151#/home/spocCourse")
try:
    login()
except Exception as e:
    print("------------------------------------error---------------------------------------")
    print(e)

try:
    spider()
except Exception as e:
    print("------------------------------------error---------------------------------------")
    print(e)
time.sleep(999)
    
db.close()

结果:
image

心得体会:比第一题难,大伙都去整输入,感觉不如模拟点击扣扣登录hhh。

gitee仓库链接:gitee仓库链接

作业③

  • 掌握大数据相关服务,熟悉Xshell的使用
  • 完成文档 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx 中的任务,即为下面5个任务,具体操作见文档。
    完成代码:详见华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx
    完成截图:
    image
    image
    image
    image
    image

心得体会:复制黏贴还能复制出心得体会?