2023 Data Collection and Fusion Technology Practice: Assignment 2

Published: 2023-10-17 23:29:03 · Author: 羊耶飞舞


Task 1:

Requirement: Crawl the 7-day weather forecast for a given set of cities from the China Weather website (http://www.weather.com.cn) and save it to a database.

Output: (screenshot omitted)

Gitee folder link: https://gitee.com/hong-songyu/crawl_project/tree/master/作业2/2.1

Code:

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3
class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        # Maps city names to weather.com.cn city codes, used to build each forecast page URL.
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        # Fetch the city's 7-day forecast page, parse each day's entry, and write it to the database.
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return

        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "html.parser")
            lis = soup.select("ul[class='t clearfix'] li")

            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)

        print("------------输出数据库结果------------")
        self.db.show()
        self.db.closeDB()


class WeatherDB:
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            # Create the table on first run.
            self.cursor.execute(
                "create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except Exception:
            # Table already exists: clear out the rows from the previous run.
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))

if __name__ == "__main__":
    ws = WeatherForecast()
    ws.process(["北京", "上海", "广州", "深圳"])
    print("completed")

Reflections:

This weather-crawling exercise not only consolidated my use of the bs4 library and deepened my understanding of it, but also gave me my first experience of using sqlite3 to store crawled data in a database.
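As a quick check on what WeatherDB stored, the rows can be read back directly with sqlite3. This is a minimal sketch, using the weathers.db file and the column names created in the code above:

import sqlite3

# Read back the forecasts that WeatherDB inserted, as a simple sanity check on the crawl.
con = sqlite3.connect("weathers.db")
cursor = con.cursor()
cursor.execute("select wCity, wDate, wWeather, wTemp from weathers")
for wCity, wDate, wWeather, wTemp in cursor.fetchall():
    print(wCity, wDate, wWeather, wTemp)
con.close()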

Task 2:

Requirement: Use the requests and BeautifulSoup libraries to crawl stock information from a specified site and store it in a database.

Output: (screenshot omitted)

Gitee folder link: https://gitee.com/hong-songyu/crawl_project/tree/master/作业2/2.2

Code:

import sqlite3
import requests
import re

def getHtml(url):
    try:
        header = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        res = requests.get(url,headers=header)
        html = res.text
        return html
    except Exception as err:
        print(err)

num = 1  # running row number across all pages
def getContent(html):
    # Pull the "diff":[...] stock list out of the JSONP response and evaluate it into dicts.
    stocks = re.findall(r"\"diff\":\[(.*?)\]",html,re.M|re.S)
    stocks = list(eval(stocks[0]))
    global num
    result = []
    for stock in stocks:
        stockcode = stock["f12"]
        stockname = stock["f14"]
        newprice = stock["f2"]
        diefu = stock["f3"]
        diee = stock["f4"]
        dealnum = stock["f5"]
        deale = stock["f6"]
        zhenfu = stock["f7"]
        most = stock["f15"]
        least = stock["f16"]
        today = stock["f17"]
        yesterday = stock["f18"]
        result.append([num,stockcode,stockname,newprice,diefu,diee,dealnum,deale,zhenfu,most,least,today,yesterday])
        num += 1

    return result
class stockDB:
    def openDB(self):
        self.con = sqlite3.connect("stocks.db")
        self.cursor = self.con.cursor()
        try:
            # Create the table on first run.
            self.cursor.execute("create table stocks (Num varchar(16), stockCode varchar(16),stockName varchar(16),Newprice varchar(16),RiseFallpercent varchar(16),RiseFall varchar(16),Turnover varchar(16),Dealnum varchar(16),Amplitude varchar(16),max varchar(16),min varchar(16),today varchar(16),yesterday varchar(16))")
        except Exception:
            # Table already exists: clear out the rows from the previous run.
            self.cursor.execute("delete from stocks")
    def closeDB(self):
        self.con.commit()
        self.con.close()
    def insert(self,Num,stockcode,stockname,newprice,risefallpercent,risefall,turnover,dealnum,Amplitude,max,min,today,yesterday):
        try:
            self.cursor.execute(
                "insert into stocks (Num,stockCode,stockName,Newprice,RiseFallpercent,RiseFall,Turnover,Dealnum,Amplitude,max,min,today,yesterday) values (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                (Num,stockcode,stockname,newprice,risefallpercent,risefall,turnover,dealnum,Amplitude,max,min,today,yesterday))
        except Exception as err:
            print(err)
def main():
    s = "{0:^10}\t{1:{13}^10}\t{2:{13}^10}\t{3:{13}^10}\t{4:{13}^10}\t{5:{13}^10}\t{6:{13}^10}\t{7:{13}^10}\t{8:{13}^10}\t{9:{13}^10}\t{10:{13}^10}\t{11:{13}^10}\t{12:{13}^10}"
    print(s.format("序号", "股票代码", "股票名称", "最新价", "涨跌幅", "涨跌额", "成交量", "成交额", "振幅", "最高",
                   "最低", "今收", "昨收", chr(12288)))
    stockdb = stockDB()
    stockdb.openDB()
    for page in range(1, 6):
        # East Money stock list API: pn is the page number, pz the page size (20 rows per page).
        url = "http://19.push2.eastmoney.com/api/qt/clist/get?cb=jQuery11240009917002240502182_1634088844934&pn=" + str(
            page) + "&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:1+t:2,m:1+t:23&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1634088845178"
        html = getHtml(url)
        stocks = getContent(html)
        for stock in stocks:
            print(s.format(stock[0], stock[1], stock[2], stock[3], stock[4], stock[5], stock[6], stock[7], stock[8],
                           stock[9], stock[10], stock[11], stock[12], chr(12288)))
            stockdb.insert(stock[0], stock[1], stock[2], stock[3], stock[4], stock[5], stock[6], stock[7], stock[8],
                           stock[9], stock[10], stock[11], stock[12])
    stockdb.closeDB()
if __name__ == "__main__":
    main()

Reflections:

This was my first attempt at capturing network requests; I became initially familiar with extracting information from JSON-formatted responses, and further deepened my practice with the sqlite3 library.
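The East Money response is JSONP: a JSON object wrapped in the jQuery callback named by the cb parameter. getContent above slices the diff list out with a regular expression and eval; an alternative is to strip the callback wrapper and hand the body to json.loads, which avoids evaluating response text as Python code. This is a minimal sketch, assuming the diff list sits under the response's data field (the f2/f12/... keys are then read exactly as above); parse_jsonp is a hypothetical helper, not part of the original code:

import json
import re

def parse_jsonp(text):
    # Strip the "jQueryXXXX(...)" callback wrapper and parse the JSON body.
    body = re.search(r"\((.*)\)", text, re.S).group(1)
    data = json.loads(body)
    # Each element of the diff list is a dict keyed by the f-fields (f12 = code, f14 = name, ...).
    return data["data"]["diff"]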

Task 3:

Requirement: Crawl the information of all universities on the 2021 main ranking of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021) and store it in a database; also record the browser F12 debugging/analysis process as a GIF and include it in the blog post.

Output: (screenshot omitted)

Gitee folder link: https://gitee.com/hong-songyu/crawl_project/tree/master/作业2/2.3

Code:

import requests
import re
import sqlite3

header = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
}
# payload.js is the static data file the ranking page loads (found via the browser's F12 network panel).
url = 'https://www.shanghairanking.cn/_nuxt/static/1695811954/rankings/bcur/2021/payload.js'
r = requests.get(url=url,headers=header)
r.raise_for_status()
r.encoding = r.apparent_encoding
html = r.text
# Regular expressions for the Chinese university name and the total score in payload.js.
name = 'univNameCn:"(.*?)"'
score = 'score:(.*?),'
namelist = re.findall(name, html, re.S | re.M)
scorelist = re.findall(score, html, re.S | re.M)
print("排名\t学校名称\t总分")
num = 1
for i in range(len(namelist)):
    print(num,namelist[i],scorelist[i])
    num += 1

conn = sqlite3.connect('universities.db')
conn.execute('''CREATE TABLE IF NOT EXISTS universities
                (id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT,
                score INTEGER)''')
for i in range(len(namelist)):
    conn.execute("INSERT INTO universities (name, score) VALUES (?, ?)", (namelist[i], scorelist[i]))
conn.commit()
conn.close()

Reflections:

This task made me more familiar with capturing a page's network requests and with applying regular expressions.
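To make the two patterns concrete, here is a small self-contained run against a fabricated fragment shaped like the payload.js data (the university names and scores below are placeholders, not real ranking data):

import re

# Made-up fragment in the same shape as payload.js; the values are placeholders for illustration only.
sample = 'univNameCn:"示例大学A",score:95.0,univNameCn:"示例大学B",score:90.5,'
names = re.findall(r'univNameCn:"(.*?)"', sample)
scores = re.findall(r'score:(.*?),', sample)
for rank, (name, score) in enumerate(zip(names, scores), start=1):
    print(rank, name, score)  # prints: 1 示例大学A 95.0, then 2 示例大学B 90.5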