2023 Data Collection Practice: Assignment 2

Published: 2023-10-19 02:17:49 · Author: 风宝风宝世界最好

Assignment 1:
Requirement: scrape the 7-day weather forecast for a given set of cities from the China Weather site (http://www.weather.com.cn) and save it to a database.

```python
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3


class DB:
    def open(self):
        self.con = sqlite3.connect("weathers.db")
        self.cur = self.con.cursor()
        try:
            self.cur.execute("create table weathers (wCity varchar(16), wDate varchar(16), wWeather varchar(64), "
                             "wTemp varchar(32), constraint pk_weather primary key(wCity, wDate))")
        except Exception:
            # The table already exists: clear out rows left over from earlier runs
            self.cur.execute("delete from weathers")

    def close(self):
        self.con.commit()
        self.con.close()


    def insert(self, city, date, weather, temp):
        try:
            self.cur.execute("insert into weathers (wCity, wDate, wWeather, wTemp)values(?, ?, ?, ?)",
                             (city, date, weather, temp))
        except Exception as err:
            print(err)

    def appear(self):
        # Print every saved row as an aligned table
        self.cur.execute("select * from weathers")
        a = self.cur.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for b in a:
            print("%-16s%-16s%-32s%-16s" % (b[0], b[1], b[2], b[3]))


class Weather:
    def __init__(self):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        # Map each supported city to its weather.com.cn station code
        self.list = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def city(self, city):
        if city not in self.list.keys():
            print(city + ": no data")
            return

        url = "http://www.weather.com.cn/weather/" + self.list[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    # Save the parsed row so it actually lands in the database, as the assignment requires
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = DB()
        self.db.open()

        for city in cities:
            self.city(city)

        self.db.appear()  # print everything that was saved
        self.db.close()


result = Weather()
result.process(["北京", "上海", "广州", "深圳"])
```
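As a quick sanity check, the saved rows can be read back out of weathers.db with a few lines of sqlite3 (a minimal sketch; only the file and table names from the code above are assumed):

```python
import sqlite3

con = sqlite3.connect("weathers.db")
for row in con.execute("select * from weathers"):
    print(row)  # (wCity, wDate, wWeather, wTemp)
con.close()
```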


Screenshot of the run:

Assignment 2:
Requirement: scrape Shanghai/Shenzhen A-share stock quotes from the Eastmoney quote API with requests and re, and print them as a formatted table.

Code:

```python
import requests
import re
import json

# Count the CJK characters in a string; each occupies two terminal columns,
# so the count is used below to shrink the format width for the name column
def count(s):
    return len([ch for ch in s if '\u4e00' <= ch <= '\u9fff'])

cnt = 3  # number of pages to scrape (210 at most)
# Print the table header; the columns are: No., code, name, latest price,
# change %, change, volume, turnover, amplitude, high, low, open, previous close
print("{:<3} {:<5} {:<6} {:<4} {:<5} {:<5} {:<8} {:<9} {:<4} {:<5} {:<4} {:<5} {:<5}".format(
    "序号", "股票代码", "股票名称", "最新报价", "涨跌幅", "涨跌额", "成交量", "成交额", "振幅", "最高", "最低", "今开", "昨收"))
for i in range(cnt):
    # Only Shanghai/Shenzhen A-shares are requested; the pn parameter selects the page
    url = "http://41.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112408942134990117723_1601810673881&pn=" + \
        str(i+1)+"&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f3&fs=m:0+t:6,m:0+t:13,m:0+t:80,m:1+t:2,m:1+t:23"
    r = requests.get(url)
    r.encoding = "UTF-8"
    # Cut from the first '[' onward: the leading '{' of the JSONP wrapper would
    # otherwise be picked up by the {} pattern below
    text = r.text[r.text.index("["):]
    # Match every {...}; each stock's fields sit inside one pair of braces
    datas = re.findall("{.*?}", text)
    for j in range(len(datas)):
        # Parse one stock's fields into a dict
        data = json.loads(datas[j])
        # Print the row; the width of the name column is adjusted by its CJK count
        temp = "{:<5} {:<8} {:<"+str(10-count(
            data['f14']))+"} {:<7} {:<7} {:<7} {:<8} {:<13} {:<6} {:<6} {:<6} {:<6} {:<6}"
        print(temp.format(i*20+j+1, data['f12'], data['f14'], data['f2'], data['f3'], data['f4'],
                          data['f5'], data['f6'], data['f7'], data['f15'], data['f16'], data['f17'], data['f18']))
```
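Since the response is JSONP (a JSON object wrapped in a jQuery...(...) callback), an alternative to slicing at the first "[" is to strip the wrapper and parse the whole payload at once. A minimal sketch, assuming the payload keeps Eastmoney's usual data.diff layout:

```python
import json

def strip_jsonp(text):
    # Keep only what sits between the first '(' and the last ')'
    return json.loads(text[text.index("(") + 1:text.rindex(")")])

# obj = strip_jsonp(r.text)
# rows = obj["data"]["diff"]  # one dict per stock (f2, f3, f12, f14, ...)
```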
Screenshot of the run: ![](https://img2023.cnblogs.com/blog/3286132/202310/3286132-20231019015031437-535927076.png)

Assignment 3:
Requirement: scrape all university entries from the 2021 main ranking of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021), store them in a database, and record the browser F12 debugging/analysis process as a GIF for the blog.


Code:

```python
import requests
import re
import sqlite3
headers = {
    'User-Agent':
'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Mobile Safari/537.36 Edg/117.0.2045.60'
}
conn = sqlite3.connect('univer.db')
cursor = conn.cursor()
cursor.execute('''CREATE TABLE IF NOT EXISTS univer (
                    id VARCHAR(16),
                    name VARCHAR(16),
                    score VARCHAR(16)
                )''')
# payload.js carries the ranking data that the page renders client-side
url = 'https://www.shanghairanking.cn/_nuxt/static/1697106492/rankings/bcur/2021/payload.js'
response = requests.get(url=url, headers=headers)
text = response.text
# Pull each university's Chinese name and score out of the JS object literal
pattern = r'univNameCn:(.*?),univNameEn:.*?score:(.*?),'
results = re.findall(pattern, text)
print("{:<5}{:<15}{:<7}".format("排名","学校","总分"))
for i, result in enumerate(results):
    rank = i + 1
    name, score = result
    name = name.strip('""')
    score = score.strip('""')
    print("{:<5}{:<15}{:<7}".format(rank, name, score))
    row = (rank,name,score)
    cursor.execute('INSERT INTO univer VALUES (?, ?, ?)', row)
conn.commit()
conn.close()
```
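To see what the regex actually captures, it can be run against a hypothetical fragment in the shape payload.js is assumed to have (the values here are made up for illustration):

```python
import re

sample = 'univNameCn:"清华大学",univNameEn:"Tsinghua University",score:969.2,'
pattern = r'univNameCn:(.*?),univNameEn:.*?score:(.*?),'
print(re.findall(pattern, sample))
# [('"清华大学"', '969.2')] -- hence the strip('"') calls above
```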

Screenshot of the run: