作业1

要求：指定一个网站，爬取这个网站中的所有图片，例如：中国气象网（http://www.weather.com.cn）。使用scrapy框架分别实现单线程和多线程的方式爬取。
务必控制总页数（学号尾数2位）、总下载的图片数量（尾数后3位）等限制爬取的措施。
文件夹链接：https://gitee.com/scholaar/crawl_project/tree/master/第三次实践作业

代码

items.py

class work1_Item(scrapy.Item):
    """Item carrying the address of a single image found by the spider."""

    # Value taken from an <img> ``src`` attribute on the crawled page.
    img_url = scrapy.Field()

pipelines.py

class ImgSpiderPipeline:
    """Item pipeline that saves every scraped image URL to local disk.

    Class attributes act as defaults; ``open_spider`` rebinds ``count`` and
    ``threads`` on the instance so that state is never shared between
    pipeline instances or spider runs.
    """

    # Running index used to name the downloaded files (0.jpg, 1.jpg, ...).
    count = 0
    # Base directory whose "images" sub-folder is reset when the spider opens.
    images_store = "C:\\Users\\zmk\\PycharmProjects\\pythonProject\\ImgSpider"
    # Download threads started by the multi-threaded ``process_item``.
    threads = []

    def open_spider(self, spider):
        """Recreate an empty ``images`` folder under ``images_store``.

        Any previous content of the folder is deleted. Missing parent
        directories are created instead of raising ``FileNotFoundError``.

        Args:
            spider: The spider being opened (unused).
        """
        import shutil

        # os.path.join keeps the path valid on non-Windows systems as well.
        picture_path = os.path.join(self.images_store, 'images')
        if os.path.exists(picture_path):
            # Remove the whole tree in one call instead of walking it by hand.
            shutil.rmtree(picture_path)
        os.makedirs(picture_path, exist_ok=True)

        # Start every run with fresh per-instance state rather than mutating
        # the class-level defaults.
        self.count = 0
        self.threads = []

单线程（写在pipelines中）

    # 单线程
    def process_item(self, item, spider):
        """Download the item's image synchronously and store it on disk.

        The file is written to the "Single threaded" image folder (relative
        to the current working directory) and named after the running
        ``self.count`` index, which is incremented after a successful write.

        Args:
            item: Mapping with an ``img_url`` entry holding the image URL.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item`` so later pipelines can process it.
        """
        url = item['img_url']
        print(url)

        single_threaded_path = '.\\images\\Single threaded'
        os.makedirs(single_threaded_path, exist_ok=True)

        # The context manager closes the connection deterministically; the
        # timeout keeps a stalled server from blocking the pipeline forever.
        with urllib.request.urlopen(url=url, timeout=30) as response:
            img_data = response.read()

        img_path = os.path.join(single_threaded_path, f'{self.count}.jpg')
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        self.count += 1
        return item

多线程（写在pipelines中）

# 多线程
    def process_item(self, item, spider):
        """Start a background thread that downloads the item's image.

        The thread runs ``self.download_img`` and is recorded in
        ``self.threads`` so it can be joined later.

        Args:
            item: Mapping with an ``img_url`` entry holding the image URL.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item``; the download itself may still be running.
        """
        url = item['img_url']
        print(url)
        # Thread.setDaemon() is deprecated; the ``daemon`` argument is the
        # supported way to request a non-daemon thread.
        worker = threading.Thread(
            target=self.download_img, args=(url,), daemon=False
        )
        worker.start()
        self.threads.append(worker)
        return item

    def download_img(self, url):
        """Download ``url`` and save it as ``<index>.jpg`` in the Thread folder.

        Intended to run in worker threads. The file index is reserved under
        a lock so that concurrent downloads never pick the same file name.

        Args:
            url: Address of the image to fetch.
        """
        thread_path = '.\\images\\Thread'
        os.makedirs(thread_path, exist_ok=True)

        # Lazily attach one lock to the instance. dict.setdefault guarantees
        # that all threads end up sharing the same lock object.
        lock = vars(self).setdefault('_count_lock', threading.Lock())
        with lock:
            # Reserve the index before downloading; reading and incrementing
            # self.count around the download let threads overwrite each other.
            index = self.count
            self.count = index + 1

        # Close the connection deterministically and bound the wait time.
        with urllib.request.urlopen(url=url, timeout=30) as response:
            img_data = response.read()

        img_path = os.path.join(thread_path, f'{index}.jpg')
        with open(img_path, 'wb') as fp:
            fp.write(img_data)

    def close_spider(self, spider):
        """Wait for every thread stored in ``self.threads`` to finish.

        Args:
            spider: The spider being closed (unused).
        """
        for worker in self.threads:
            worker.join()

spider.py

import scrapy
from demo.items import work1_Item

class Work1Spider(scrapy.Spider):
    """Spider that emits one ``work1_Item`` per linked image on the start page."""

    name = 'work1'
    # allowed_domains = ['www.weather.com.cn']
    start_urls = ['http://www.weather.com.cn/']

    def parse(self, response):
        """Yield an item for every ``<img>`` nested inside an ``<a>`` element.

        Args:
            response: The downloaded page.

        Yields:
            ``work1_Item`` objects whose ``img_url`` is an absolute URL.
        """
        # Query the response directly: it already handles the page encoding,
        # so no manual decode and no second Selector are needed.
        for src in response.xpath('//a/img/@src').extract():
            item = work1_Item()
            # Resolve relative and protocol-relative ``src`` values against
            # the page URL so the downloader receives a usable address.
            item['img_url'] = response.urljoin(src)
            yield item

运行结果

心得体会

在多线程的方面卡了一些时间，对于多线程还是比较不熟练

作业2

要求：
熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法；使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取股票相关信息。
候选网站：东方财富网：https://www.eastmoney.com/
输出信息：
MySQL数据库存储和输出格式如下：
表头英文命名例如：序号id，股票代码：bStock
文件夹链接：https://gitee.com/scholaar/crawl_project/tree/master/第三次实践作业

代码

items.py

class StockItem(scrapy.Item):
    """Container for the fields of one scraped stock record."""

    code = scrapy.Field()
    name = scrapy.Field()
    new_price = scrapy.Field()
    price_limit = scrapy.Field()
    change_amount = scrapy.Field()
    turnover = scrapy.Field()
    volume = scrapy.Field()
    rise = scrapy.Field()
    # Highest price.
    highest = scrapy.Field()
    # Lowest price.
    lowest = scrapy.Field()
    # Today's opening price.
    today_open = scrapy.Field()
    # Previous close.
    yesterday_receive = scrapy.Field()

pipelines.py

import sqlite3

class StockPipeline:
    """Item pipeline that stores scraped stock records in a SQLite database.

    ``open_spider`` creates the ``stocks`` table if needed, ``process_item``
    inserts one row per item and ``close_spider`` commits and closes the
    connection. When the database cannot be opened the pipeline degrades to
    a pass-through instead of failing on every item.
    """

    # Path of the SQLite file; override on the class or instance to redirect.
    db_path = 'stocks.db'
    # Safe defaults so the other hooks work even if open_spider never ran.
    opened = False
    con = None
    cursor = None

    # Item fields in the column order used by the INSERT statement.
    _FIELDS = (
        'code', 'name', 'new_price', 'price_limit', 'change_amount',
        'turnover', 'volume', 'rise', 'highest', 'lowest',
        'today_open', 'yesterday_receive',
    )

    _CREATE_SQL = '''
        CREATE TABLE IF NOT EXISTS stocks (
            id INTEGER PRIMARY KEY,
            code TEXT,
            name TEXT,
            new_price TEXT,
            price_limit TEXT,
            change_amount TEXT,
            turnover TEXT,
            volume TEXT,
            rise TEXT,
            highest TEXT,
            lowest TEXT,
            today_open TEXT,
            yesterday_receive TEXT
        )
    '''

    _INSERT_SQL = '''
        INSERT INTO stocks (
            code, name, new_price, price_limit, change_amount,
            turnover, volume, rise, highest, lowest, today_open, yesterday_receive
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    '''

    def open_spider(self, spider):
        """Connect to the database and make sure the ``stocks`` table exists.

        Args:
            spider: The spider being opened (unused).
        """
        print("opened")
        try:
            self.con = sqlite3.connect(self.db_path)
            self.cursor = self.con.cursor()
            self.cursor.execute(self._CREATE_SQL)
            self.opened = True
        except sqlite3.Error as err:
            print(err)
            # Do not leak a half-initialised connection.
            if self.con is not None:
                self.con.close()
            self.con = None
            self.cursor = None
            self.opened = False

    def close_spider(self, spider):
        """Commit pending work and close the connection if it is open.

        Args:
            spider: The spider being closed (unused).
        """
        if self.opened:
            try:
                self.con.commit()
            finally:
                # Close even when the final commit fails.
                self.con.close()
                self.opened = False
        print("closed")

    def process_item(self, item, spider):
        """Insert one stock record and commit it.

        Args:
            item: Mapping providing every name listed in ``_FIELDS``.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item``. Database errors and missing fields are
            printed rather than raised.
        """
        if not self.opened:
            # No usable database: pass the item through untouched.
            return item
        try:
            row = tuple(item[field] for field in self._FIELDS)
            self.cursor.execute(self._INSERT_SQL, row)
            self.con.commit()
        except (sqlite3.Error, KeyError) as err:
            print(err)
        return item

spider.py

import json
from shijian3_2.items import StockItem
import scrapy

class MySpider(scrapy.Spider):
    """Spider that turns the stock-list API response into ``StockItem`` objects."""

    name = 'stocks'
    # The API endpoint must be listed here: without start URLs Scrapy never
    # schedules a request and ``parse`` is never called.
    start_urls = [
        'http://64.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124019574933852728926_1697704464511&pn=1&pz=20&po=1&np=2&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1697704464512',
    ]

    # (item field, key in each API record) pairs.
    _FIELD_KEYS = (
        ('code', 'f12'),
        ('name', 'f14'),
        ('new_price', 'f2'),
        ('price_limit', 'f3'),
        ('change_amount', 'f4'),
        ('turnover', 'f5'),
        ('volume', 'f6'),
        ('rise', 'f7'),
        ('highest', 'f15'),
        ('lowest', 'f16'),
        ('today_open', 'f17'),
        ('yesterday_receive', 'f18'),
    )

    def parse(self, response):
        """Decode the JSON(P) payload and yield one item per stock record.

        Args:
            response: The API response, either plain JSON or JSON wrapped
                in a ``callback(...)`` call.

        Yields:
            ``StockItem`` objects filled according to ``_FIELD_KEYS``.
        """
        text = response.body.decode('utf-8').strip()
        if not text.startswith(('{', '[')):
            # Strip the JSONP wrapper by locating its parentheses instead of
            # relying on a fixed callback-name length.
            start, end = text.find('('), text.rfind(')')
            if start == -1 or end <= start:
                self.logger.error('Unexpected response format from %s', response.url)
                return
            text = text[start + 1:end]

        try:
            payload = json.loads(text)
        except ValueError as err:
            self.logger.error('Invalid JSON from %s: %s', response.url, err)
            return
        if not isinstance(payload, dict):
            self.logger.error('Unexpected JSON payload from %s', response.url)
            return

        # ``data`` may be null when the API has nothing to return.
        stocks = (payload.get('data') or {}).get('diff') or []
        if isinstance(stocks, dict):
            # Some response layouts key the records by index instead of
            # returning a list; iterate the records either way.
            stocks = stocks.values()

        for stock in stocks:
            item = StockItem()
            for field, key in self._FIELD_KEYS:
                item[field] = stock.get(key)
            yield item

运行结果

保存为db文件：

保存为csv文件方便查看：

心得体会

有了上一次的作业的打底，这个写起来轻松一点，但是把一份代码转化成scrapy框架还是让我做了好久

作业3

要求：熟练掌握 scrapy 中 Item、Pipeline 数据的序列化输出方法；使用scrapy框架+Xpath+MySQL数据库存储技术路线爬取外汇网站数据。
候选网站：中国银行网：https://www.boc.cn/sourcedb/whpj/
文件夹链接：https://gitee.com/scholaar/crawl_project/tree/master/第三次实践作业

代码

items.py

import scrapy

class MoneyItem(scrapy.Item):
    """Container for one scraped exchange-rate row."""

    # Currency name.
    currency = scrapy.Field()
    # Four rate columns; NOTE(review): their exact meaning is defined by the
    # spider's column mapping - confirm against the source table.
    TSP = scrapy.Field()
    CSP = scrapy.Field()
    TBP = scrapy.Field()
    CBP = scrapy.Field()
    # Time column of the row.
    Time = scrapy.Field()

pipelines.py

import sqlite3

class MoneyPipeline:
    """Item pipeline that prints exchange-rate rows and stores them in SQLite.

    ``open_spider`` creates the ``changes`` table if needed, ``process_item``
    prints and inserts each row, and ``close_spider`` commits and closes the
    connection.
    """

    # Path of the SQLite file; override on the class or instance to redirect.
    db_path = 'new_money.db'
    # Safe defaults so the other hooks work even if open_spider never ran.
    opened = False
    con = None
    cursor = None
    # The column header is printed once, before the first row.
    _header_printed = False

    # Item fields in the column order used by the INSERT statement.
    _FIELDS = ("currency", "TSP", "CSP", "TBP", "CBP", "Time")

    def open_spider(self, spider):
        """Connect to the database and make sure the ``changes`` table exists.

        Args:
            spider: The spider being opened (unused).
        """
        print("opened")
        try:
            self.con = sqlite3.connect(self.db_path)
            self.cursor = self.con.cursor()
            self.cursor.execute("CREATE TABLE IF NOT EXISTS changes ("
                                "bId INTEGER PRIMARY KEY AUTOINCREMENT,"
                                "bCurrency TEXT,"
                                "bTSP REAL,"
                                "bCSP REAL,"
                                "bTBP REAL,"
                                "bCBP REAL,"
                                "bTime TEXT)")
            self.opened = True
        except sqlite3.Error as err:
            print(err)
            # Do not leak a half-initialised connection.
            if self.con is not None:
                self.con.close()
            self.con = None
            self.cursor = None
            self.opened = False

    def close_spider(self, spider):
        """Commit all inserted rows and close the connection if it is open.

        Args:
            spider: The spider being closed (unused).
        """
        if self.opened:
            try:
                self.con.commit()
            finally:
                # Close even when the final commit fails.
                self.con.close()
                self.opened = False
        print("closed")

    def process_item(self, item, spider):
        """Print one exchange-rate row and insert it into ``changes``.

        Args:
            item: Mapping providing every name listed in ``_FIELDS``.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item``. Database errors and missing fields are
            printed rather than raised.
        """
        try:
            values = tuple(item[field] for field in self._FIELDS)
        except KeyError as err:
            print(err)
            return item

        if not self._header_printed:
            print("货币名称\t", "现汇买入价\t", "现钞买入价\t ", "现汇卖出价\t ", "现钞卖出价\t", "时间\t")
            self._header_printed = True
        # str() keeps the display from failing on non-string values.
        print("{:^10}{:>10}{:>10}{:>10}{:>12}{:>13}".format(*(str(v) for v in values)))

        if self.opened:
            try:
                self.cursor.execute(
                    "INSERT INTO changes (bCurrency,bTSP,bCSP,bTBP,bCBP,bTime) VALUES (?,?,?,?,?,?)",
                    values,
                )
            except sqlite3.Error as err:
                print(err)
        return item

spider.py

import scrapy
from bs4 import UnicodeDammit
from shijian3_3.items import MoneyItem

class ChangeSpider(scrapy.Spider):
    """Spider that yields one ``MoneyItem`` per row of the exchange-rate table."""

    name = 'change'

    # (item field, 1-based <td> position inside a table row) pairs.
    _COLUMNS = (
        ("currency", 1),
        ("TSP", 2),
        ("CSP", 3),
        ("TBP", 4),
        ("CBP", 5),
        ("Time", 7),
    )

    def start_requests(self):
        """Request the exchange-rate page and route it to ``parse``."""
        url = 'https://www.boc.cn/sourcedb/whpj/'
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Extract every data row of the rate table.

        Args:
            response: The downloaded exchange-rate page.

        Yields:
            ``MoneyItem`` objects; cells without text become empty strings.
        """
        dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
        selector = scrapy.Selector(text=dammit.unicode_markup)

        # Select whole rows, not a single cell. ``//tr`` matches whether or
        # not the markup contains a <tbody>, which browsers often insert.
        # NOTE(review): the container path is kept from the original code;
        # confirm it still matches the live page layout.
        rows = selector.xpath("/html/body/div/div[5]/div[1]/div[2]/table//tr")

        for row in rows[1:]:  # the first row is the table header
            if not row.xpath("./td"):
                continue  # skip rows that carry no data cells

            item = MoneyItem()
            for field, position in self._COLUMNS:
                # Relative path: read the cell text of *this* row only.
                text = row.xpath(f"./td[{position}]/text()").extract_first()
                item[field] = text.strip() if text else ""
            yield item