Data Collection Practice: Assignment 3

Published: 2023-10-24 16:05:37  Author: 人生几何5

Assignment ①

Requirements

Pick a website and crawl all of its images, e.g., the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement the crawl in both single-threaded and multi-threaded versions.

Be sure to limit the crawl, e.g., cap the total number of pages (last 2 digits of your student ID) and the total number of downloaded images (last 3 digits of your student ID).
Output:
Print the URL of each downloaded image to the console, store the downloaded images in an images subfolder, and include a screenshot.
Gitee link: https://gitee.com/EI-Shaddoll-Midrash/2023-data-collection/tree/master/作业3/作业3_1
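A simple way to enforce those caps (a sketch only; the concrete numbers below are placeholders for your own student-ID digits) is Scrapy's built-in CloseSpider extension settings:

# settings.py — sketch: stop the crawl once either limit is reached
CLOSESPIDER_PAGECOUNT = 24    # stop after this many pages have been crawled
CLOSESPIDER_ITEMCOUNT = 124   # stop after this many items have been scraped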

Code:

Single-threaded:

MySpider:

import scrapy
import urllib.request
from caiji.shiyan3.demo.demo.items import DemoItem
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
class MySpider(scrapy.Spider):
    name = "mySpider"
    
    def start_requests(self):
        url = "http://www.weather.com.cn/"
        yield scrapy.Request(url=url, callback=self.parse)
    
    def parse(self, response):
        count = 1
        try:
            # detect the page encoding and parse it with BeautifulSoup
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "html.parser")
            images = soup.select("img")  # collect all <img> tags
            for image in images:
                src = image["src"]
                try:
                    # keep the extension when the URL ends with a three-letter one (e.g. .jpg, .png)
                    if src[len(src) - 4] == ".":
                        ext = src[len(src) - 4:]
                    else:
                        ext = ""
                    # download the image and save it under a numbered file name
                    req = urllib.request.Request(src)
                    data = urllib.request.urlopen(req, timeout=100)
                    data = data.read()
                    fobj = open("D:/pythonProject/caiji/shiyan3/images/" + str(count) + ext, "wb")
                    fobj.write(data)
                    fobj.close()
                    count += 1
                except Exception as err:
                    print(err)
                item = DemoItem()
                item["name"] = src.strip() if src else ""
                yield item
        except Exception as err:
            print(err)
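As an aside, Scrapy also ships with an ImagesPipeline that can handle the downloading instead of calling urllib inside the spider. A minimal sketch (assuming Pillow is installed; ImageItem and the IMAGES_STORE path are illustrative, not part of the code above):

# settings.py — sketch: enable Scrapy's built-in image pipeline
# ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
# IMAGES_STORE = "D:/pythonProject/caiji/shiyan3/images"

import scrapy

class ImageItem(scrapy.Item):
    # ImagesPipeline reads URLs from "image_urls" and records results in "images"
    image_urls = scrapy.Field()
    images = scrapy.Field()

The spider would then simply yield ImageItem(image_urls=[src]) for each image it finds.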

items:

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class DemoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    pass

pipelines:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class DemoPipeline:
    def process_item(self, item, spider):
        try:
            print(item["name"])
        except Exception as err:
            print(err)
        return item

settings:

BOT_NAME = "demo"

SPIDER_MODULES = ["demo.spiders"]
NEWSPIDER_MODULE = "demo.spiders"

ITEM_PIPELINES = {
    'demo.pipelines.DemoPipeline': 300,
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "demo (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

Multi-threaded:

MySpider:

import threading

import scrapy
import urllib.request
from caiji.shiyan3.demo1.demo1.items import Demo1Item
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit

class MySpider(scrapy.Spider):
    name = "mySpider1"
    
    def start_requests(self):
        url = "http://www.weather.com.cn/"
        yield scrapy.Request(url=url, callback=self.parse)
    
    def download(self, url, count):
        # download one image in a worker thread and save it under a numbered file name
        try:
            if url[len(url) - 4] == ".":
                ext = url[len(url) - 4:]
            else:
                ext = ""
            req = urllib.request.Request(url)
            data = urllib.request.urlopen(req, timeout=100)
            data = data.read()
            fobj = open("D:/pythonProject/caiji/shiyan3/images/" + str(count) + ext, "wb")
            fobj.write(data)
            fobj.close()
        except Exception as err:
            print(err)

    def parse(self, response):
        count = 0
        threads = []
        srcs = []
        try:
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "html.parser")
            images = soup.select("img")  # collect all <img> tags
            for image in images:
                src = image["src"]
                # record the image URL in the item
                item = Demo1Item()
                item["name"] = src.strip() if src else ""
                if src not in srcs:
                    # start one worker thread per not-yet-seen image URL
                    srcs.append(src)
                    count = count + 1
                    T = threading.Thread(target=self.download, args=(src, count))
                    T.daemon = False
                    T.start()
                    threads.append(T)
                yield item
            # wait for all download threads to finish
            for t in threads:
                t.join()
        except Exception as err:
            print(err)
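For comparison, the manual thread bookkeeping above could be replaced with a thread pool from the standard library; a small sketch (the download_all helper and the max_workers value are illustrative, not part of the assignment code):

from concurrent.futures import ThreadPoolExecutor

def download_all(spider, srcs):
    # submit each image URL to a fixed-size pool of worker threads;
    # leaving the "with" block waits for all downloads to finish
    with ThreadPoolExecutor(max_workers=8) as pool:
        for count, src in enumerate(srcs, start=1):
            pool.submit(spider.download, src, count)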

items:

import scrapy
class Demo1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    pass

pipelines:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class DemoPipeline:
    def process_item(self, item, spider):
        try:
            print(item["name"])
        except Exception as err:
            print(err)
        return item

settings:

BOT_NAME = "demo1"

SPIDER_MODULES = ["demo1.spiders"]
NEWSPIDER_MODULE = "demo1.spiders"

ITEM_PIPELINES = {
    'demo1.pipelines.DemoPipeline': 300,
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "demo1 (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

Results



Reflections

This assignment combined the single-threaded and multi-threaded image downloading we had practised before, so it was fairly straightforward.

Assignment ②

Requirements

Become proficient with serializing Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL database storage to crawl stock information.

Candidate site: East Money: https://www.eastmoney.com/
Output:
Gitee link: https://gitee.com/EI-Shaddoll-Midrash/2023-data-collection/tree/master/作业3/作业3_2

Code

MySpider:


import scrapy
from lxml import etree
from caiji.shiyan3.demo2.demo2.items import Demo2Item
import time
from selenium import webdriver

class MySpider(scrapy.Spider):
    name = "mySpider2"
    # run Edge headless; the driver must match the options (Edge, not Chrome)
    edge_options = webdriver.EdgeOptions()
    edge_options.add_argument('--headless')
    edge_options.add_argument('--disable-gpu')
    driver = webdriver.Edge(options=edge_options)

    # generate the request for the stock listing page
    def start_requests(self):
        url = 'https://quote.eastmoney.com/center/gridlist.html#hs_a_board'  # page to crawl
        self.driver.get(url)
        time.sleep(1)  # wait for the JavaScript-rendered table to load
        html = etree.HTML(self.driver.page_source)  # parse the rendered HTML
        yield scrapy.Request(url, self.parse, meta={'html': html})

    # parse the rendered table rows into items
    def parse(self, response):
        html = response.meta['html']
        # each <tr> of the quote table is one stock
        lis = html.xpath('//*[@id="table_wrapper-table"]/tbody/tr')
        for link in lis:
            stock_id = link.xpath('./td[2]/a/text()')[0]
            stock_name = link.xpath('./td[3]/a/text()')[0]
            newest_price = link.xpath('./td[5]/span/text()')[0]
            up_down_extent = link.xpath('./td[6]/span/text()')[0]
            up_down_value = link.xpath('./td[7]/span/text()')[0]
            deal_value = link.xpath('./td[8]/text()')[0]
            freq = link.xpath('./td[10]/text()')[0]
            highest = link.xpath('./td[11]/span/text()')[0]
            lowest = link.xpath('./td[12]/span/text()')[0]
            opening = link.xpath('./td[13]/span/text()')[0]
            zt = link.xpath('./td[14]/text()')[0]
            item = Demo2Item()
            item['stock_id'] = stock_id
            item['stock_name'] = stock_name
            item['newest_price'] = newest_price
            item['up_down_extent'] = up_down_extent
            item['up_down_value'] = up_down_value
            item['deal_value'] = deal_value
            item['freq'] = freq
            item['highest'] = highest
            item['lowest'] = lowest
            item['opening'] = opening
            item['zt'] = zt
            yield item
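The fixed time.sleep(1) can be flaky if the quote table renders slowly. A sketch of an explicit wait with Selenium's WebDriverWait (here driver stands for the spider's Selenium driver, and the table id comes from the XPath above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# block for up to 10 seconds until the quote table is actually present
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "table_wrapper-table"))
)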

items:

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Demo2Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    stock_id = scrapy.Field()
    stock_name = scrapy.Field()
    newest_price = scrapy.Field()
    up_down_extent = scrapy.Field()
    up_down_value = scrapy.Field()
    deal_volume = scrapy.Field()
    deal_value = scrapy.Field()
    freq = scrapy.Field()
    highest = scrapy.Field()
    lowest = scrapy.Field()
    opening = scrapy.Field()
    zt = scrapy.Field()
    pass

pipelines:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
import pymysql

class DemoPipeline:
    def process_item(self, item, spider):
        # insert each scraped record into the MySQL table
        try:
            mydb = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="mydb", charset="utf8")
            mycursor = mydb.cursor()
            sql = "INSERT INTO demo2 (stock_id, stock_name, newest_price, up_down_extent, up_down_value, deal_value, " \
                  "freq, highest, lowest, opening, zt) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            val = (item['stock_id'], item['stock_name'], item['newest_price'], item['up_down_extent'],
                   item['up_down_value'], item['deal_value'], item['freq'], item['highest'],
                   item['lowest'], item['opening'], item['zt'])
            mycursor.execute(sql, val)
            mydb.commit()
            mydb.close()
        except Exception as err:
            print(err)
        return item
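Opening a fresh connection for every item works but is wasteful. A sketch of reusing one connection per crawl via Scrapy's open_spider/close_spider hooks (same connection parameters as above):

import pymysql

class DemoPipeline:
    def open_spider(self, spider):
        # connect once when the crawl starts
        self.mydb = pymysql.connect(host="127.0.0.1", port=3306, user="root",
                                    passwd="123456", db="mydb", charset="utf8")
        self.cursor = self.mydb.cursor()

    def close_spider(self, spider):
        # close the shared connection when the crawl ends
        self.mydb.close()

    def process_item(self, item, spider):
        sql = ("INSERT INTO demo2 (stock_id, stock_name, newest_price, up_down_extent, up_down_value, "
               "deal_value, freq, highest, lowest, opening, zt) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        self.cursor.execute(sql, (item['stock_id'], item['stock_name'], item['newest_price'],
                                  item['up_down_extent'], item['up_down_value'], item['deal_value'],
                                  item['freq'], item['highest'], item['lowest'], item['opening'], item['zt']))
        self.mydb.commit()
        return item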

settings:

BOT_NAME = "demo2"

SPIDER_MODULES = ["demo2.spiders"]
NEWSPIDER_MODULE = "demo2.spiders"

ITEM_PIPELINES = {
    'demo2.pipelines.DemoPipeline': 300,
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "demo2 (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

Results

Reflections

This assignment was considerably harder than the first one: it requires the pymysql library to connect to MySQL and then operate on the database. At first the pipeline printed the stock data but nothing showed up in the database; the cause was that the corresponding database and table had not yet been created in MySQL. After creating them, everything worked.
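For reference, a sketch of creating that database and table up front with pymysql (the plain VARCHAR column types are an assumption that is good enough for this exercise):

import pymysql

# run once before the first crawl so the pipeline's INSERTs have a target table
conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", charset="utf8")
cur = conn.cursor()
cur.execute("CREATE DATABASE IF NOT EXISTS mydb")
cur.execute("""
    CREATE TABLE IF NOT EXISTS mydb.demo2 (
        stock_id VARCHAR(16), stock_name VARCHAR(32), newest_price VARCHAR(16),
        up_down_extent VARCHAR(16), up_down_value VARCHAR(16), deal_value VARCHAR(32),
        freq VARCHAR(16), highest VARCHAR(16), lowest VARCHAR(16),
        opening VARCHAR(16), zt VARCHAR(16)
    )
""")
conn.commit()
conn.close()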

Assignment ③

Requirements

Become proficient with serializing Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL database storage to crawl foreign-exchange rate data.

Candidate site: Bank of China exchange rates: https://www.boc.cn/sourcedb/whpj/
Output:
Gitee link: https://gitee.com/EI-Shaddoll-Midrash/2023-data-collection/tree/master/作业3/作业3_3

Code

MySpider:

import time
import scrapy
from lxml import etree
from caiji.shiyan3.demo3.demo3.items import Demo3Item
from selenium import webdriver

class MySpider(scrapy.Spider):
    name = "mySpider3"
    # run Edge headless; the driver must match the options (Edge, not Chrome)
    edge_options = webdriver.EdgeOptions()
    edge_options.add_argument('--headless')
    edge_options.add_argument('--disable-gpu')
    driver = webdriver.Edge(options=edge_options)

    # generate the request for the exchange-rate page
    def start_requests(self):
        url = 'https://www.boc.cn/sourcedb/whpj/'  # page to crawl
        self.driver.get(url)
        time.sleep(1)  # wait for the page to finish loading
        html = etree.HTML(self.driver.page_source)  # parse the rendered HTML
        yield scrapy.Request(url, self.parse, meta={'html': html})

    # parse the quotation table rows into items
    def parse(self, response):
        html = response.meta['html']
        rows = html.xpath('/html/body/div/div[5]/div[1]/div[2]/table/tbody/tr')

        def cell(row, idx):
            # return the text of the idx-th <td>, or '' when the cell is empty
            texts = row.xpath('./td[%d]/text()' % idx)
            return texts[0] if texts else ''

        # skip the header row, then pick out the columns we need
        for row in rows[1:]:
            item = Demo3Item()
            item["currency"] = cell(row, 1)
            item["TBP"] = cell(row, 2)
            item["CBP"] = cell(row, 3)
            item["TSP"] = cell(row, 4)
            item["CSP"] = cell(row, 5)
            item["time"] = cell(row, 8)
            yield item
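One small cleanup worth noting: the shared Selenium driver is never shut down. A sketch of a closed() method that could be added inside the MySpider class above (Scrapy calls it when the spider finishes):

    def closed(self, reason):
        # called by Scrapy when the spider finishes; release the browser
        self.driver.quit()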

items:

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
class Demo3Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    currency = scrapy.Field()
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    time = scrapy.Field()
    pass

pipelines:

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql

class DemoPipeline:
    def process_item(self, item, spider):
        try:
            # print the record to the console
            print(item["currency"])
            print(item["TSP"])
            print(item["CSP"])
            print(item["TBP"])
            print(item["CBP"])
            print(item["time"])
            print()
            # insert the record into the MySQL table
            mydb = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="mydb", charset="utf8")
            mycursor = mydb.cursor()
            sql = "insert into currency (currency, TSP, CSP, TBP, CBP, time) values (%s, %s, %s, %s, %s, %s)"
            val = (item["currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["time"])
            mycursor.execute(sql, val)
            mydb.commit()
            mydb.close()
        except Exception as err:
            print(err)
        return item

settings:

BOT_NAME = "demo3"

SPIDER_MODULES = ["demo3.spiders"]
NEWSPIDER_MODULE = "demo3.spiders"

ITEM_PIPELINES = {
    'demo3.pipelines.DemoPipeline': 300,
}
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "demo3 (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

Results

Reflections

This assignment is similar to Assignment 2, but the table cells need to be vetted during extraction (some cells can be empty, so the XPath result has to be checked before indexing).