2023 Data Collection and Fusion Technology Practice: Assignment 2

Published: 2023-10-19 02:38:23  Author: showha

Gitee repository for this assignment

Assignment ①:

* Requirement: Crawl the 7-day weather forecast for a given set of cities from the China Weather website (http://www.weather.com.cn) and save the results to a database.

Code

from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import urllib.request
import sqlite3


# Weather database helper (SQLite)
class WeatherDB:
    def __init__(self):
        self.cursor = None
        self.con = None

    def openDB(self):
        self.con = sqlite3.connect("./2/weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers (wCity varchar(16),"
                "wDate varchar(16),"
                "wWeather varchar(64),"
                "wTemp varchar(32),"
                "constraint pk_weather primary key (wCity,wDate))")
        except Exception as err:
            print(err)
            # The table already exists; clear out the old rows instead
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


# Weather forecast crawler
class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) "
                          "Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + ": city code not found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            x = 0
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    if x == 0:  # today's entry has only one temperature, e.g. <i>14℃</i>
                        x += 1
                        temp = li.select('p[class="tem"] i')[0].text
                    else:
                        temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)

        # self.db.show()
        self.db.closeDB()


ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")

Run results

Reflections

I learned how Python works with a database and can now store crawled data in SQLite. I am also comfortable using the bs4 library and CSS selector syntax.
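
As a quick illustration of the CSS selectors mentioned above, here is a minimal, self-contained sketch; the HTML snippet is invented to mimic the structure of the 7-day forecast list, so it only loosely mirrors the real page:

from bs4 import BeautifulSoup

# Invented snippet shaped like the forecast list on www.weather.com.cn
html = '''
<ul class="t clearfix">
  <li><h1>19日(今天)</h1><p class="wea">多云</p><p class="tem"><i>14℃</i></p></li>
  <li><h1>20日(明天)</h1><p class="wea">晴</p><p class="tem"><span>22℃</span><i>15℃</i></p></li>
</ul>
'''

soup = BeautifulSoup(html, "html.parser")  # html.parser avoids the lxml dependency
for li in soup.select("ul[class='t clearfix'] li"):
    date = li.select("h1")[0].text
    weather = li.select('p[class="wea"]')[0].text
    temps = [t.text for t in li.select('p[class="tem"] span, p[class="tem"] i')]
    print(date, weather, "/".join(temps))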

Assignment ②:

* Requirement: Use the requests and BeautifulSoup libraries to perform a targeted crawl of stock information and store it in a database.

Code

import requests
import re
import json


# eastmoney field codes to export, in column order
keys = ['f12', 'f14', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f15', 'f16', 'f17', 'f18', 'f10', 'f8', 'f9', 'f23']
# field code -> Chinese column header
Fkeys = {
    'f12': '代码',
    'f14': '名称',
    'f2': '最新价格',
    'f3': '涨跌额',
    'f4': '涨跌幅',
    'f5': '成交量',
    'f6': '成交额',
    'f7': '振幅',
    'f15': '最高',
    'f16': '最低',
    'f17': '今开',
    'f18': '昨收',
    'f10': '量比',
    'f8': '换手率',
    'f9': '市盈率',
    'f23': '市净率'
}

def getHTML(cmd):
    # Request the eastmoney list API for one board (fs=cmd) and strip the jQuery JSONP callback wrapper
    url = "https://57.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409027887847400018_1696658597024&pn=1&pz=20&po=1&np=2&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs={}&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f26,f22,f33,f11,f62,f128,f136,f115,f152&_=1696658597025"
    url_r = url.format(cmd)
    r = requests.get(url_r)
    reg = r'{.*}'
    r.encoding = r.apparent_encoding
    data = re.compile(reg, re.S).findall(r.text)
    return data[0]

def getStockData(data):
    # Parse the JSON text and return the stock records under data.diff
    jsData = json.loads(data)
    return jsData['data']['diff']

def main():
    # fs filter strings for each board, captured from the eastmoney request
    cmd = {
        "沪深京A股": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
        "上证A股": "m:1+t:2,m:1+t:23",
        "深圳A股": "m:0+t:6,m:0+t:80",
        "创业板": "m:0+t:80",
        # Other boards (unused):
        # "上证指数": "C.1",
        # "深圳指数": "C.5",
        # "沪深A股": "C._A",
        # "新股": "C.BK05011",
        # "中小板": "C.13",
    }

    for i in cmd.keys():
        stocks = getStockData(getHTML(cmd[i]))
        with open("2/stocks/"+i+".csv", "w", encoding="gbk") as f:
            for j in keys:
                f.write(Fkeys[j]+",")
            
            f.write("\n")
            for j in stocks:
                j = stocks[j]
                print(j)
                for k in keys:
                    f.write(str(j[k]) + ",")
                f.write("\n")
        print("Saved " + i + ".csv")

if __name__ == "__main__":
    main()
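
The requirement asks for the data to be stored in a database, while the script above writes CSV files. As a follow-up, here is a minimal sketch of inserting the same records into SQLite; the database path and table name are illustrative, and it reuses the keys, Fkeys, getHTML and getStockData defined above:

import sqlite3

def save_to_db(stocks, db_path="2/stocks.db"):
    # stocks is the dict returned by getStockData(); one record per key
    con = sqlite3.connect(db_path)
    cursor = con.cursor()
    cols = ",".join('"' + Fkeys[k] + '" varchar(32)' for k in keys)
    cursor.execute("create table if not exists stocks (" + cols + ")")
    placeholders = ",".join("?" for _ in keys)
    for idx in stocks:
        row = stocks[idx]
        cursor.execute("insert into stocks values (" + placeholders + ")",
                       [str(row[k]) for k in keys])
    con.commit()
    con.close()

# Example use inside main()'s loop:
# save_to_db(getStockData(getHTML(cmd["沪深京A股"])))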

Run results


Reflections

Under the teacher's guidance, I learned to extract data by capturing the site's JS/API requests rather than scraping the page source directly. One thing worth noting: when the data is saved as CSV, Windows opens it with Excel by default, and Excel assumes GBK encoding, so a file written in UTF-8 would show up as garbled text.
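
An alternative to writing GBK directly is to write the CSV as UTF-8 with a BOM ("utf-8-sig"), which Excel on Windows also detects correctly. A minimal sketch with made-up rows and an illustrative file name:

import csv

# Illustrative rows; in the real script these come from the eastmoney JSON
rows = [["代码", "名称", "最新价格"], ["000001", "平安银行", "11.20"]]

# "utf-8-sig" prepends a BOM, so Excel recognises UTF-8 instead of assuming GBK
with open("demo.csv", "w", newline="", encoding="utf-8-sig") as f:
    csv.writer(f).writerows(rows)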

Assignment ③:

* Requirement: Crawl the information of all universities in the 2021 Shanghai Ranking main list of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021) and store the results in a database.

Debugging and analysis process

Code

import requests
import pandas as pd
import re

# payload.js holds the ranking data inside an immediately-invoked JS function
url = "https://www.shanghairanking.cn/_nuxt/static/1697106492/rankings/bcur/2021/payload.js"

response = requests.get(url=url)
# University names
name_grep = ',univNameCn:"(.*?)",'
name = re.findall(name_grep, response.text)
# Total scores (either a numeric literal or a function-parameter name)
score_grep = ',score:(.*?),'
score = re.findall(score_grep, response.text)
# University categories
category_grep = ',univCategory:(.*?),'
category = re.findall(category_grep, response.text)
# Provinces
province_grep = ',province:(.*?),'
province = re.findall(province_grep, response.text)

# Extract the parameter list of the wrapping function, e.g. function(a,b,c,...){
code_name_grep = 'function(.*?){'
code_name = re.findall(code_name_grep, response.text)
start_code = code_name[0].find('a')
end_code = code_name[0].find('pE')
code_name = code_name[0][start_code:end_code].split(',')  # parameter names, in order

# Extract the argument list of the trailing call: }(v1,v2,...)));
value_name_grep = r'\}\((.*?)\)\)\);'
value_name = re.search(value_name_grep, response.text)
value_name = value_name.groups()[0].split(",")

df = pd.DataFrame(columns=["排名", "学校", "省份", "类型", "总分"])
for i in range(len(name)):
    # Parameter names map positionally onto the argument values of the call
    province_name = value_name[code_name.index(province[i])][1:-1]
    category_name = value_name[code_name.index(category[i])][1:-1]
    real_score = score[i]
    if re.search(r"[0-9.]+", score[i]) is None:
        # The score is a parameter name, not a numeric literal; resolve it from the argument list
        real_score = value_name[code_name.index(score[i]) + 3]
    df.loc[i] = [i + 1, name[i], province_name, category_name, real_score]
print(df)
df.to_excel("./2/rank.xlsx")
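
Since the requirement mentions storing the results in a database, a minimal sketch of writing the same DataFrame into SQLite could look like the following; the database file, table name and sample row are illustrative:

import sqlite3
import pandas as pd

# Stand-in for the df built above, with the same columns
df = pd.DataFrame([[1, "某大学", "北京", "综合", "245.7"]],
                  columns=["排名", "学校", "省份", "类型", "总分"])

con = sqlite3.connect("./2/rank.db")
# to_sql creates the table if needed and inserts every row of the DataFrame
df.to_sql("rank2021", con, if_exists="replace", index=False)
con.close()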

Run screenshot

Reflections

The hard part of the JS packet-capture approach is that the parameters of the page's JS function are difficult to match up with the underlying data; the arguments passed into the function are very opaque (presumably this doubles as an anti-crawling measure?). I kept adjusting the regular expressions and hunting for the link between the total score and its parameter code.
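
To make the parameter-to-value mapping concrete, here is a self-contained sketch of the idea on an invented snippet shaped like payload.js; the function body and argument values are made up for illustration:

import re

# Invented snippet with the same shape as payload.js: an immediately-invoked
# function whose parameter names stand in for repeated values
js = ('__NUXT_JSONP__("/rankings", (function(a,b,c){'
      'return {data:[{univNameCn:"某大学",province:b,univCategory:c,score:a}]}'
      '}("245.7","北京","综合")));')

# Parameter names of the wrapping function, in order
params = re.search(r'function\((.*?)\)', js).group(1).split(',')
# Argument values of the trailing call, in the same order
args = re.search(r'\}\((.*?)\)\)\);', js).group(1).split(',')

# Positional mapping: parameter name -> actual value
mapping = dict(zip(params, (a.strip('"') for a in args)))
print(mapping)  # {'a': '245.7', 'b': '北京', 'c': '综合'}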