Data Collection — Fourth Practical Assignment

Published 2023-11-16 11:49:41 · Author: 风宝风宝世界最好

Assignment 1
Requirements:
Become proficient with Selenium: locating HTML elements, scraping Ajax-rendered pages, and waiting for HTML elements to load.
Use the Selenium framework + MySQL storage route to scrape stock data for the three boards 沪深A股 (SH/SZ A-shares), 上证A股 (Shanghai A-shares), and 深证A股 (Shenzhen A-shares).
Candidate site: East Money: http://quote.eastmoney.com/center/gridlist.html#hs_a_board

Output: store and print the data from a MySQL database in the format below. Column headers should use English names, e.g. id for the sequence number, bStockNo for the stock code, and so on; the schema is designed by each student.

Gitee folder link
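Note that the excerpt below is actually a Scrapy parser (evidently reused from a currency-scraping exercise) rather than the Selenium route the requirement names. For reference, a minimal sketch of the Selenium + MySQL route might look like the following; the stocks table, its columns, the MySQL credentials, and the table id in the XPath are all illustrative assumptions, not the graded code:

```python
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                      passwd="********", db="mydb", charset="utf8")
cursor = con.cursor()
cursor.execute("create table if not exists stocks("
               "id int, bStockNo varchar(16), bName varchar(32),"
               "bPrice varchar(16), bUpDownRate varchar(16))")

driver.get("http://quote.eastmoney.com/center/gridlist.html#hs_a_board")
# The quote table is filled in by Ajax, so wait for the rows explicitly
# instead of sleeping. The table id below is an assumption; verify it
# against the live page.
rows = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located(
    (By.XPATH, "//table[@id='table_wrapper-table']/tbody/tr")))
for i, row in enumerate(rows, start=1):
    tds = row.find_elements(By.XPATH, "./td")
    # Column positions (code, name, price, change rate) follow the on-page
    # layout and are also assumptions.
    cursor.execute("insert into stocks values (%s,%s,%s,%s,%s)",
                   (i, tds[1].text, tds[2].text, tds[4].text, tds[5].text))
con.commit()
con.close()
driver.quit()
```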

Partial code:
```python
data = response.text
selector = scrapy.Selector(text=data)
for cnt in range(2, 12):
    item = Item()
    cy = selector.xpath("//table[@class='data']/tr[%d]" % cnt)
    # sequence number
    item['id'] = cnt - 1
    # currency name
    item['Currency'] = cy.xpath("./td[@class='fontbold'][1]").xpath("string()").extract_first()
    item['Currency'] = str(item['Currency']).strip()
    # TSP
    item['TSP'] = cy.xpath("./td[@class='numberright'][1]").xpath("string()").extract_first()
    item['TSP'] = str(item['TSP']).strip()
    # CSP
    item['CSP'] = cy.xpath("./td[@class='numberright'][2]").xpath("string()").extract_first()
    item['CSP'] = str(item['CSP']).strip()
    # TBP
    item['TBP'] = cy.xpath("./td[@class='numberright'][3]").xpath("string()").extract_first()
    item['TBP'] = str(item['TBP']).strip()
    # CBP
    item['CBP'] = cy.xpath("./td[@class='numberright'][4]").xpath("string()").extract_first()
    item['CBP'] = str(item['CBP']).strip()
    # current time
    item['time'] = cy.xpath("./td[8]").xpath("string()").extract_first()
    item['time'] = str(item['time']).strip()
    yield item
```
2.2.4 pipelines.py
First, write a database class with three methods: open the database, insert a row, and close the database.

```python
import sqlite3

class CurrencyDB:
    def openDB(self):
        self.con = sqlite3.connect("currencys.db")  # connect; creates the database if it does not exist
        self.cursor = self.con.cursor()             # create a cursor
        try:
            self.cursor.execute("create table currencys "
                                "(Cid int(4),Ccurrency varchar(16),"
                                "Ctsp varchar(32),Ccsp varchar(64),"
                                "Ctbp varchar(32),Ccbp varchar(8),Ctime varchar(64))")
        except:
            # the table already exists, so just clear it
            self.cursor.execute("delete from currencys")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, Cid, Ccurrency, Ctsp, Ccsp, Ctbp, Ccbp, Ctime):
        try:
            self.cursor.execute("insert into currencys(Cid, Ccurrency, Ctsp, Ccsp, Ctbp, Ccbp, Ctime) values (?,?,?,?,?,?,?)",
                                (Cid, Ccurrency, Ctsp, Ccsp, Ctbp, Ccbp, Ctime))
        except Exception as err:
            print(err)
```
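The parser and the database class still need to be wired together in pipelines.py. The post does not show the pipeline class itself, so the following is a minimal sketch of the usual wiring; the class name CurrencyPipeline is an assumption, and it must be registered under ITEM_PIPELINES in settings.py:

```python
# Hedged sketch: a Scrapy pipeline that drives CurrencyDB.
# The item field names mirror the parser above.
class CurrencyPipeline:
    def open_spider(self, spider):
        self.db = CurrencyDB()
        self.db.openDB()

    def process_item(self, item, spider):
        self.db.insert(item['id'], item['Currency'], item['TSP'], item['CSP'],
                       item['TBP'], item['CBP'], item['time'])
        return item

    def close_spider(self, spider):
        self.db.closeDB()
```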

Result:

Assignment 2
Requirements:
Become proficient with Selenium: locating HTML elements, simulating user login, scraping Ajax-rendered pages, and waiting for HTML elements to load.
Use the Selenium framework + MySQL to scrape course information from China MOOC (course id, course name, school, lead teacher, team members, enrollment count, course schedule, course description).
Candidate site: China MOOC: https://www.icourse163.org

Output: MySQL database storage and output format

Gitee folder link
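One note before the code: it paces itself with fixed time.sleep() calls. The "waiting for HTML elements" part of the requirement is more reliably handled with Selenium's explicit waits, sketched below against the login entry point used in the code (assuming the same driver object; this is an illustration, not the submitted version):

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block up to 10 s until the Ajax-rendered login entry is clickable,
# instead of guessing a fixed sleep.
wait = WebDriverWait(driver, 10)
accession = wait.until(EC.element_to_be_clickable(
    (By.XPATH, '//*[@id="app"]/div/div/div[1]/div[3]/div[3]/div')))
accession.click()
```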

Code:
```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import pymysql
import datetime
import time

class MySpider:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
    }
    count = 1

    def startUp(self, url):
        # initialize the Chrome browser
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        # chrome_options.add_argument('--disable-gpu')
        self.driver = webdriver.Chrome(options=chrome_options)
        try:
            self.con = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                       passwd="********", db="mydb", charset="utf8")
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            # self.cursor.execute("delete from stock")  # clear the table if it already exists
            self.opened = True
            self.page_num = 1
        except Exception as err:
            print("Failed to connect to the database")
            self.opened = False
        self.driver.get(url)
        # The search box here cannot be located directly by its id attribute:
        # 1. if an element's own attributes are not distinctive, locate its parent first;
        # 2. from the parent, step down one level to reach the target;
        # 3. if the parent is also indistinct, keep moving up the tree.
        time.sleep(2)
        accession = self.driver.find_element_by_xpath('//*[@id="app"]/div/div/div[1]/div[3]/div[3]/div')
        accession.click()
        time.sleep(3)
        # switch to "other login methods"
        other = self.driver.find_element_by_xpath('//div[@class="ux-login-set-scan-code_ft"]/span')
        other.click()
        time.sleep(5)
        self.driver.switch_to.frame(self.driver.find_element_by_xpath("//iframe[starts-with(@id, 'x-URS-iframe')]"))
        email = self.driver.find_element_by_xpath('//input[@name="email"]')
        email.send_keys('2105114977@qq.com')
        time.sleep(2)
        pswd = self.driver.find_element_by_xpath('//input[@name="password"]')
        pswd.send_keys('********')
        time.sleep(2)
        go = self.driver.find_element_by_xpath('//div[@class="f-cb loginbox"]/a')
        go.click()
        time.sleep(5)
        search = self.driver.find_element_by_xpath('//div[@class="u-baseinputui"]/input[@type="text"]')  # the search input box
        time.sleep(3)
        search.send_keys(key)
        search.send_keys(Keys.ENTER)

    def closeUp(self):
        try:
            if self.opened:
                self.con.commit()
                self.con.close()
                self.opened = False
            self.driver.close()
            print("Scraping finished; database closed")
        except Exception as err:
            print("Failed to close the database")

    def insertDB(self, id, Mcourse, Mcollege, Mteacher, Mteam, Mcount, Mprocess, Mbrief):
        try:
            self.cursor.execute("insert into mooc(id,Mcourse,Mcollege,Mteacher,Mteam,Mcount,Mprocess,Mbrief) values (%s,%s,%s,%s,%s,%s,%s,%s)",
                                (str(self.count), Mcourse, Mcollege, Mteacher, Mteam, Mcount, Mprocess, Mbrief))
        except Exception as err:
            print("Failed to insert data!", err)

    def processSpider(self):
        time.sleep(3)
        print(self.driver.current_url)
        div_list = self.driver.find_elements_by_xpath('//div[@class="m-course-list"]/div/div[@class="u-clist f-bgw f-cb f-pr j-href ga-click"]')
        for div in div_list:
            Mcourse = div.find_element_by_xpath('.//div[@class="t1 f-f0 f-cb first-row"]').text
            Mcollege = div.find_element_by_xpath('.//a[@class="t21 f-fc9"]').text
            Mteacher = div.find_element_by_xpath('.//a[@class="f-fc9"]').text
            Mcount = div.find_element_by_xpath('.//span[@class="hot"]').text
            # open the detail page by clicking the course image
            detail = div.find_element_by_xpath('.//div[@class="u-img f-fl"]')
            detail.click()
            time.sleep(2)  # wait for the detail page to load before scraping the remaining fields
            windows = self.driver.window_handles  # all current window handles
            self.driver.switch_to.window(windows[1])  # switch to the newly opened page
            teams = self.driver.find_elements_by_xpath('//div[@class="um-list-slider_con_item"]//h3[@class="f-fc3"]')
            Mteam = ""
            for team in teams:
                Mteam = Mteam + team.text
                if team != teams[-1]:
                    Mteam += " "
            Mprocess = self.driver.find_element_by_xpath('//div[@class="course-enroll-info_course-info_term-info"]//div[@class="course-enroll-info_course-info_term-info_term-time"]/span[2]').text
            Mbrief = self.driver.find_element_by_xpath('//div[@class="course-heading-intro_intro"]').text

            self.driver.close()  # close the detail window
            self.driver.switch_to.window(windows[0])  # switch back to the list page
            # everything we need has been collected; store it in the database
            self.insertDB(str(self.count), Mcourse, Mcollege, Mteacher, Mteam, Mcount, Mprocess, Mbrief)
            self.count += 1
        # One page is done; only 2 pages are scraped in this exercise.
        # Checking for the disabled button first would click "previous page"
        # on page 1, so try the active "next" button and fall back below.
        try:
            next_button = self.driver.find_element_by_xpath('//li[@class="ux-pager_btn ux-pager_btn__next"]//a[@class="th-bk-main-gh"]')
            next_button.click()
            time.sleep(2)
            if self.page_num < 2:
                self.page_num += 1
                self.processSpider()
        except:
            self.driver.find_element_by_xpath('//li[@class="ux-pager_btn ux-pager_btn__next"]//a[@class="th-bk-disable-gh"]')  # the current page is the last one

key = "机器学习"  # search keyword ("machine learning")
url = "https://www.icourse163.org"
myspider = MySpider()
start = datetime.datetime.now()
myspider.startUp(url)
myspider.processSpider()
myspider.closeUp()
end = datetime.datetime.now()
print("Scraped " + str(myspider.count - 1) + " records in " + str(end - start))
```
Result:

Assignment 3
Requirements:
Get familiar with big-data-related cloud services and with using Xshell.

Complete the tasks in the document 华为云_大数据实时分析处理实验手册-Flume日志采集实验(部分)v2.docx, i.e. the five tasks below; see the manual for the detailed steps.

Environment setup
Task 1: Enable the MapReduce service
Real-time analytics hands-on development:
Task 1: Generate test data with a Python script (a hedged sketch follows this list)
Task 2: Configure Kafka
Task 3: Install the Flume client
Task 4: Configure Flume to collect data
Output: screenshots of the key steps or results of the experiment.
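The lab manual supplies the actual generator script for Task 1; the snippet below is only a hedged stand-in showing the general shape of such a script. The file path, record layout, and record count are assumptions, not the manual's values:

```python
import random
import time

# Hedged stand-in for the manual's generator: append timestamped fake
# records to a log file that a Flume source can tail.
LOG_FILE = "/tmp/flume_test/test.log"

def generate(n=100):
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        for _ in range(n):
            record = "%s,user_%04d,%.2f" % (
                time.strftime("%Y-%m-%d %H:%M:%S"),
                random.randint(1, 1000),       # fake user id
                random.uniform(1.0, 100.0))    # fake amount
            f.write(record + "\n")
            f.flush()
            time.sleep(0.1)  # spread records out so they arrive as a stream

if __name__ == "__main__":
    generate()
```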

Result screenshots:
Enable the MapReduce service

Task 1: Generate test data with a Python script

Task 2: Configure Kafka

Task 3: Install the Flume client

Task 4: Configure Flume to collect data