数据采集实践作业2

作业一

要求：在中国气象网（http://www.weather.com.cn）给定城市集的7日天气预报，并保存在数据库。

代码和运行截图如下:

from bs4 import BeautifulSoup
from bs4 import UnicodeDammit

import urllib.request
import sqlite3
class WeatherDB:
    def openDB(self):
        self.con=sqlite3.connect("weathers.db")
        self.cursor=self.con.cursor()
        try:
            self.cursor.execute("create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))")
        except:
            self.cursor.execute("delete from weathers")

    def closeDB(self):
            self.con.commit()
            self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                    (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
            self.cursor.execute("select * from weathers")
            rows = self.cursor.fetchall()
            print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
            for row in rows:
                print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))

class WeatherForecast:
    def __init__(self):
        self.headers = {
                "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return


        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()


        for city in cities:
            self.forecastCity(city)


        # self.db.show()
        self.db.closeDB()

ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")

心得体会

通过这次爬取对beautifulsoup的使用更加熟练

Gitee文件夹链接:第二次数据采集实践代码 · 47a751c · 林梓源/数据采集与融合技术 - Gitee.com

作业二

要求：用 requests 和 BeautifulSoup 库方法定向爬取股票相关信息，并存储在数据库中。

代码和运行截图如下:

import requests
import json
headers = {
    'User-Agent':
'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Mobile Safari/537.36 Edg/117.0.2045.60'
}
url = "http://82.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112408099604168964996_1697558880890&pn=1&pz=50&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=&fs=b:MK0010&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f26,f22,f11,f62,f128,f136,f115,f152&_=1697558880904"
response = requests.get(url=url,headers=headers)
paper = response.text
d = json.loads(response.text[response.text.find('(')+1:-2])['data']['diff']
print('序号\t代码\t名称\t报价\t涨跌幅\t涨跌额\t成交量\t成交额\t最高\t最低\t今开\t昨收')
count = 1
for i in d:
    print(count,end='\t')
    print(i['f12'],end='\t')
    print(i['f14'],end='\t')
    print(i['f2'],end='\t')
    print(i['f3'],end='\t')
    print(i['f4'],end='\t')
    print(i['f5'],end='\t')
    print(i['f6'],end='\t')
    print(i['f15'],end='\t')
    print(i['f16'],end='\t')
    print(i['f17'],end='\t')
    print(i['f18'])
    count=count+1

心得体会

对requests和json更加了解

Gitee文件夹链接:第二次数据采集实践代码 · 47a751c · 林梓源/数据采集与融合技术 - Gitee.com

作业三

要求：爬取中国大学2021主榜（https://www.shanghairanking.cn/rankings/bcur/2021）所有院校信息，并存储在数据库中，同时将浏览器F12调试分析的过程录制Gif加入至博客中。

代码与运行截图如下:

import requests
import re
import sqlite3

headers = {
    'User-Agent':
'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Mobile Safari/537.36 Edg/117.0.2045.60'
}
#创建数据库
conn = sqlite3.connect('univer.db')
cursor = conn.cursor()
cursor.execute('''CREATE TABLE IF NOT EXISTS univer (  
                    id VARCHAR(16),  
                    name VARCHAR(16),
                    score   VARCHAR(16)
                )''')

url = 'https://www.shanghairanking.cn/_nuxt/static/1697106492/rankings/bcur/2021/payload.js'
response = requests.get(url=url,headers=headers)
text = response.text
names = re.findall("univNameCn:(.*?),univNameEn:",text)
scores = re.findall("score:(.*?),",text)
print("{:<5}{:<15}{:<7}".format("排名","学校","总分"))
rank = 0
for i,j in zip(names,scores):
    rank=rank+1
    print("{:<5}{:<15}{:<7}".format(rank,i,j))
    row = (rank,i,j)
    #插入
    cursor.execute('INSERT INTO univer VALUES (?, ?, ?)', row)
#确认
conn.commit()
#关闭
conn.close()