数据采集与融合技术实践第二次作业

发布时间 2023-10-18 11:31:40作者: 沐沐沐168

作业①

实验要求

·在中国气象网(http://www.weather.com.cn)爬取给定城市集的7日天气预报,并保存在数据库。

·Gitee文件夹链接: [https://gitee.com/dong-qi168/sjcjproject/blob/master/作业2/天气]

实验内容

·代码展示

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
import time

class WeatherDB:
    """Thin sqlite3 wrapper that persists per-city daily weather rows.

    (wCity, wDate) is the primary key, so re-inserting the same
    city/date pair is rejected by SQLite and merely reported.
    """

    def __init__(self, db_file):
        # Connection and cursor are created lazily in openDB().
        self.db_file = db_file
        self.con = None
        self.cursor = None

    def openDB(self):
        """Open the connection and create the weathers table if absent."""
        self.con = sqlite3.connect(self.db_file)
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("""
                CREATE TABLE IF NOT EXISTS weathers (
                    wCity VARCHAR(16),
                    wDate VARCHAR(16),
                    wWeather VARCHAR(64),
                    wTemp VARCHAR(32),
                    CONSTRAINT pk_weather PRIMARY KEY (wCity, wDate)
                )
            """)
        except Exception as err:
            # Table-creation problems are reported but not fatal.
            print(err)

    def closeDB(self):
        """Commit pending rows and close the connection, if one is open."""
        if self.con:
            self.con.commit()
            self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row; failures (e.g. duplicates) are printed."""
        try:
            self.cursor.execute("""
                INSERT INTO weathers (wCity, wDate, wWeather, wTemp)
                VALUES (?, ?, ?, ?)
            """, (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        """Dump every stored row to stdout in fixed-width columns."""
        self.cursor.execute("SELECT * FROM weathers")
        fmt = "%-16s%-16s%-32s%-16s"
        print(fmt % ("city", "date", "weather", "temp"))
        for wcity, wdate, wweather, wtemp in self.cursor.fetchall():
            print(fmt % (wcity, wdate, wweather, wtemp))


class WeatherForecast:
    """Scrapes the 7-day forecast pages on weather.com.cn for known cities."""

    def __init__(self):
        # Spoofed browser User-Agent so the site serves the normal HTML page.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
        }
        # City name -> weather.com.cn station code used in the page URL.
        self.cityCode = {
            "北京": "101010100",
            "上海": "101020100",
            "广州": "101280101",
            "深圳": "101280601",
            "福州": "101280601".replace("101280601", "101230101")
        }

    def forecastCity(self, city, db):
        """Fetch one city's 7-day page and store each day via db.insert()."""
        code = self.cityCode.get(city)
        if code is None:
            print(city + " code cannot be found")
            return

        url = "http://www.weather.com.cn/weather/" + code + ".shtml"
        try:
            request = urllib.request.Request(url, headers=self.headers)
            raw = urllib.request.urlopen(request).read()
            # Let UnicodeDammit pick between utf-8 and gbk encodings.
            markup = UnicodeDammit(raw, ["utf-8", "gbk"]).unicode_markup
            soup = BeautifulSoup(markup, "lxml")
            # First seven <li> entries of the forecast list = seven days.
            for day in soup.select("ul[class='t clearfix'] li")[:7]:
                try:
                    date = day.select('h1')[0].text
                    weather = day.select('p[class="wea"]')[0].text
                    high = day.select('p[class="tem"] span')[0].text
                    low = day.select('p[class="tem"] i')[0].text
                    temp = high + "/" + low
                    print(city, date, weather, temp)
                    db.insert(city, date, weather, temp)
                except Exception as err:
                    # A malformed <li> skips that day only.
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities, db_file):
        """Scrape every requested city into a single database file."""
        db = WeatherDB(db_file)
        db.openDB()
        for name in cities:
            self.forecastCity(name, db)
        db.closeDB()


# Crawl all five configured cities into weathers.db.
ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳", "福州"], "weathers.db")

# NOTE(review): presumably keeps the console window open so the output
# can be read before the process exits — confirm; otherwise remove.
time.sleep(100)

·运行结果

·使用sqlite3查看保存的数据

心得体会

通过对课例的实践,对bs库有了更深的认识,学会了使用sqlite3查看保存的数据。

作业②

实验要求

·要求:用 requests 和 BeautifulSoup 库方法定向爬取股票相关信息,并存储在数据库中。

·技巧:在谷歌浏览器中进入 F12 调试模式进行抓包,查找股票列表加载使用的 url,并分析 api 返回的值,并根据所要求的参数可适当更改api 的请求参数。根据 URL 可观察请求的参数 f1,f2 可获取不同的数值,根据情况可删减请求的参数。

·Gitee文件夹链接: [https://gitee.com/dong-qi168/sjcjproject/blob/master/作业2/股票]

实验内容

·代码展示

import requests
import json
import sqlite3

class MoneyDB:
    def __init__(self, db_file):
        self.db_file = db_file
        self.con = None
        self.cursor = None

    def openDB(self):
        self.con = sqlite3.connect(self.db_file)
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute("""
                CREATE TABLE IF NOT EXISTS moneys (
                    序号 VARCHAR(64),
                    代码 VARCHAR(64),
                    名称 VARCHAR(64),
                    报价 VARCHAR(64),
                    涨跌幅 VARCHAR(64),
                    涨跌额 VARCHAR(64),
                    成交量 VARCHAR(64),
                    成交额 VARCHAR(64),
                    最高 VARCHAR(64),
                    最低 VARCHAR(64),
                    今开 VARCHAR(64),
                    昨收 VARCHAR(64),
                    CONSTRAINT pk_money PRIMARY KEY (序号)
                )
            """)
        except Exception as err:
            print(err)

    def closeDB(self):
        if self.con:
            self.con.commit()
            self.con.close()

    def insert(self, data):
        try:
            self.cursor.execute("""
                INSERT INTO moneys (序号, 代码, 名称, 报价, 涨跌幅, 涨跌额, 成交量, 成交额, 最高, 最低, 今开, 昨收)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                data['f12'], data['f14'], data['f2'], data['f3'], data['f4'], data['f5'],
                data['f6'], data['f15'], data['f16'], data['f17'], data['f18']
            ))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("SELECT * FROM moneys")
        rows = self.cursor.fetchall()
        print("序号\t代码\t名称\t\t报价\t涨跌幅\t涨跌额\t成交量\t\t成交额\t\t最高\t最低\t今开\t昨收")
        for row in rows:
            print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t\t%s\t\t%s\t%s\t%s\t%s" % (
                row[0], row[1], row[2], row[3], row[4], row[5], row[6],
                row[7], row[8], row[9], row[10], row[11]
            ))


class MoneyData:
    """Downloads one page of stock quotes from the Eastmoney JSONP API
    and stores each record through MoneyDB."""

    def __init__(self, db_file):
        # JSONP endpoint: the response body is wrapped in a jQuery callback.
        self.url = "http://14.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124015338467305145265_1696661176957&pn=1&pz=50&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=&fs=b:MK0010&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f26,f22,f11,f62,f128,f136,f115,f152&_=1696661176958"
        self.db = MoneyDB(db_file)
        self.db.openDB()

    def fetch_data(self):
        """Download the quote list and return the decoded 'diff' records."""
        response = requests.get(url=self.url)
        response.encoding = "utf-8"  # force UTF-8, otherwise the text is garbled
        body = response.text
        # Strip the JSONP wrapper "jQueryxxx(" ... ");" before decoding.
        payload = body[body.find('(') + 1:-2]
        return json.loads(payload)['data']['diff']

    def process(self):
        """Insert every fetched record and echo it as a tab-separated row."""
        for count, record in enumerate(self.fetch_data(), start=1):
            self.db.insert(record)
            print("%d\t%s\t%s\t%s\t%s\t%s\t%s\t\t%s\t\t%s\t%s\t%s\t%s" % (
                count, record['f12'], record['f14'], record['f2'], record['f3'], record['f4'],
                record['f5'], record['f6'], record['f15'], record['f16'], record['f17'], record['f18']
            ))

    def close(self):
        """Flush and close the underlying database."""
        self.db.closeDB()


# Fetch one page of quotes, store them in moneys.db, then close cleanly.
money_data = MoneyData("moneys.db")
money_data.process()
money_data.close()

·运行结果

心得体会

通过本题的实践,对抓包的方式有了一定的了解与学习。

作业③

实验要求

·要求:爬取中国大学 2021 主榜(https://www.shanghairanking.cn/rankings/bcur/2021)所有院校信息,并存储在数据库中,同时将浏览器 F12 调试分析的过程录制 Gif 加入至博客中。

·技巧:分析该网站的发包情况,分析获取数据的 api

·Gitee文件夹链接: [https://gitee.com/dong-qi168/sjcjproject/blob/master/作业2/大学排名]

实验内容

·代码展示

import requests
import re
import sqlite3

def isfloat(string):
    """Return True when *string* parses as a float, False otherwise."""
    try:
        float(string)
    except ValueError:
        return False
    return True

class RankDB:
    """SQLite store for (university name, score) ranking rows.

    The database path is now a parameter; the default keeps the old
    hard-coded ``ranking.db`` behaviour, so existing ``RankDB()``
    callers are unaffected. Commit/close still happens in ``__del__``,
    so rows are flushed when the object is garbage-collected.
    """

    def __init__(self, db_file="ranking.db"):
        self.conn = sqlite3.connect(db_file)
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute("CREATE TABLE IF NOT EXISTS ranks (name VARCHAR(16), scores VARCHAR(16))")
        except Exception as err:
            print(err)

    def __del__(self):
        # Guarded: if connect() raised in __init__, self.conn never
        # existed and the original code raised AttributeError here.
        try:
            self.conn.commit()
            self.conn.close()
        except Exception:
            pass

    def insert(self, name, score):
        """Insert one (name, score) row; errors are printed, not raised."""
        try:
            self.cursor.execute("INSERT INTO ranks (name, scores) VALUES (?, ?)", (name, score))
        except Exception as err:
            print(err)

# Nuxt payload.js that backs the 2021 ranking page; the data is embedded
# as JavaScript, so it is mined with regexes instead of a JSON parser.
resp = requests.get('https://www.shanghairanking.cn/_nuxt/static/1697106492/rankings/bcur/202111/payload.js')
resp.encoding = 'utf-8'
res = resp.text

db = RankDB()

# Chinese university names and scores as they appear in the JS source.
# NOTE(review): assumes names[i] and scores[i] stay index-aligned and
# len(scores) >= len(names) — otherwise the loop below raises IndexError.
names = re.findall(r"univNameCn:(.*?),univNameEn:", res)
scores = re.findall(r"score:(.*?),ranking", res)

for i, name in enumerate(names):
    # Non-numeric score tokens (JS variable references) are blanked out.
    if not isfloat(scores[i]):
        scores[i] = ''
    print(i + 1, name, scores[i])
    db.insert(name, scores[i])

# Read everything back to confirm the rows were stored.
res = db.cursor.execute("SELECT * FROM ranks")
res = res.fetchall()
print(res)

·F12调试分析

·运行结果

·使用sqlite3查看保存的数据

心得体会

通过这次实践,对爬虫抓包有了更深的理解。