Assignment ①:
- Requirement: scrape the 7-day weather forecast for a given set of cities from the China Weather Network (http://www.weather.com.cn) and save it to a database.
- Folder link: Gitee
Code and run screenshots
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import urllib.request
import sqlite3

# Weather database wrapper
class WeatherDB:
    def __init__(self):
        self.cursor = None
        self.con = None

    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers (wCity varchar(16),"
                "wDate varchar(16),"
                "wWeather varchar(64),"
                "wTemp varchar(32),"
                "constraint pk_weather primary key (wCity,wDate))")
        except Exception as err:
            # Table already exists: report and clear out the old rows
            print(err)
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))

# Weather forecast scraper
class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) "
                          "Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code not found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            x = 0
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    if x == 0:  # today's entry only has one temperature, e.g. <i>14℃</i>
                        x += 1
                        temp = li.select('p[class="tem"] i')[0].text
                    else:
                        temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()
        for city in cities:
            self.forecastCity(city)
        # self.db.show()
        self.db.closeDB()

ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")
Personal reflections
The part I was least sure about at first was saving the results to the database, but after a little reading it was solved quickly. The task itself was similar to ones I had done before, so nothing was especially difficult.
Assignment ②:
- Requirement: use the requests and BeautifulSoup library methods to crawl stock-related information from a designated source and store it in a database.
- Folder link: Gitee
Code and run screenshots
'''
Data source: Eastmoney quote center
http://quote.eastmoney.com/center
'''
import requests
import re
import sqlite3

def create_db(conn, table_name):
    # SQL for creating the table
    create_table_sql = '''
    CREATE TABLE IF NOT EXISTS {} (
        序号 INTEGER PRIMARY KEY,
        代码 TEXT,
        名称 TEXT,
        最新价 TEXT,
        涨跌幅 TEXT,
        涨跌额 TEXT,
        成交量 TEXT,
        成交额 TEXT,
        振幅 TEXT
    );
    '''.format(table_name)
    # Execute the SQL statement
    conn.execute(create_table_sql)

def insert_data(conn, table_name, data):
    # SQL for inserting one row
    insert_data_sql = '''
    INSERT INTO {} (代码, 名称, 最新价, 涨跌幅, 涨跌额, 成交量, 成交额, 振幅)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?);
    '''.format(table_name)
    # Execute the SQL statement
    conn.execute(insert_data_sql, (
        data['代码'], data['名称'], data['最新价'],
        data['涨跌幅'], data['涨跌额'], data['成交量'],
        data['成交额'], data['振幅']
    ))

# Fetch the quote data from the server with a GET request
def getHtml(cmd):
    url = "http://8.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409706851180828395_1696658785167&pn=1&pz=20&po=1&np=2&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=" + cmd + "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696658785168"
    r = requests.get(url)
    return r.text

# Extract the per-stock fields from the response text
def get_stock_data(text):
    com = re.compile(r'"f2":(?P<end>.+?),.*?"f6":(?P<volume>.+?),.*?"f3":(?P<up_down>.+?),.?"f4":(?P<up_down_price>.+?),.?"f5":(?P<end_good>.+?),.?"f7":(?P<zhangfu>.+?),.?"f12":"(?P<number>.+?)",.*?"f14":"(?P<name>.+?)"'
                     ',.*?"f15":(?P<max>.+?),.*?"f16":(?P<min>.+?),.*?"f17":(?P<start>.+?),', re.S)
    ret = com.finditer(text)
    count = 1
    for i in ret:
        yield {
            '序号': count,
            '代码': i.group('number'),
            '名称': i.group('name'),
            '最新价': i.group('end'),
            '涨跌幅': i.group('up_down'),
            '涨跌额': i.group('up_down_price'),
            '成交量': i.group('end_good'),
            '成交额': i.group('volume'),
            '振幅': i.group('zhangfu'),
        }
        count += 1

def main():
    cmd = {
        "深沪京A股": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
        "上证A股": "m:1+t:2,m:1+t:23",
        "深圳A股": "m:0+t:6,m:0+t:80",
    }
    # Each entry in the cmd dict gets its own table and database file
    for table_name, cmd_value in cmd.items():
        stocks_data = getHtml(cmd_value)
        stocks = get_stock_data(stocks_data)
        # Connect to (or create) the database
        conn = sqlite3.connect(table_name + '.db')
        # Create the table
        create_db(conn, table_name)
        # Insert the rows
        for stock in stocks:
            insert_data(conn, table_name, stock)
        # Commit the changes and close the connection
        conn.commit()
        conn.close()
        print(f"{table_name}.db saved!")

if __name__ == '__main__':
    main()
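Since the response is JSONP (the JSON wrapped in the callback named by the cb parameter), an alternative to the field-by-field regex above is to strip the wrapper and parse the payload with the json module. The sketch below rests on one assumption: that the records sit under data["data"]["diff"], which may arrive as a list or as an index-keyed dict, so both cases are handled.

import json
import re

# Sketch: strip the "jQuery...(...)" JSONP wrapper and parse the body as JSON,
# assuming the records live under data["data"]["diff"]
def parse_stocks(text):
    payload = re.search(r'\((.*)\)', text, re.S).group(1)  # drop the callback wrapper
    body = json.loads(payload)
    diff = body["data"]["diff"]
    records = diff.values() if isinstance(diff, dict) else diff
    for rec in records:
        yield {
            '代码': rec.get('f12'),
            '名称': rec.get('f14'),
            '最新价': rec.get('f2'),
            '涨跌幅': rec.get('f3'),
            '涨跌额': rec.get('f4'),
            '成交量': rec.get('f5'),
            '成交额': rec.get('f6'),
            '振幅': rec.get('f7'),
        }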
I also wrote a version that saves the data to a CSV file for quick viewing.
Reflections:
While analyzing the URL of the JSON feed, I noticed the parameter "np": changing its value to 2 makes the server return the data from the first page through the last page in a single response, which removes the need to paginate. A small sketch for experimenting with this parameter follows.
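Here is a minimal sketch of getHtml() with the query string expressed as a requests params dict, so individual knobs such as np are easy to flip. The endpoint, token (ut), and callback (cb) values are copied from the code above; the fields list is trimmed to the fields the regex actually uses, and some of the original parameters are omitted for brevity, so add them back if the endpoint turns out to require them.

import requests

# Sketch: the same request as getHtml(), but with a params dict;
# endpoint, ut, and cb are copied from the original URL above
def fetch_quotes(cmd, np=2, page=1, page_size=20):
    params = {
        "cb": "jQuery112409706851180828395_1696658785167",
        "pn": page,       # page number
        "pz": page_size,  # page size
        "po": 1,
        "np": np,         # np=2 returned every page in one response (see note above)
        "ut": "bd1d9ddb04089700cf9c27f6f7426281",
        "fltt": 2,
        "invt": 2,
        "fid": "f3",
        "fs": cmd,
        "fields": "f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17",
    }
    r = requests.get("http://8.push2.eastmoney.com/api/qt/clist/get", params=params)
    return r.text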
Assignment ③:
- Requirement: scrape the information of all universities on the 2021 Chinese university main ranking (https://www.shanghairanking.cn/rankings/bcur/2021) and store it in a database, and record the browser F12 debugging/analysis process as a GIF for the blog.
- Repository link: Gitee
Code and run screenshots
import requests
import re
import pandas as pd
from fake_useragent import UserAgent

# Create a random User-Agent generator
user_agent = UserAgent()
# Set the request headers
headers = {
    'User-Agent': user_agent.random,
}
url = "https://www.shanghairanking.cn/_nuxt/static/1697106492/rankings/bcur/2021/payload.js"
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
content = response.text
# Extract the fields with regex capture groups
pattern = r'univNameCn:"(.*?)".*?score:(.*?),.*?province:(.*?),'
data = re.findall(pattern, content)
# Build the DataFrame
df = pd.DataFrame(data, columns=["学校", "分数", "省份"])
# Save as a CSV file
df.to_csv('rankings.csv', index=False, encoding='utf-8-sig')
Saving directly to a CSV file makes the results easy to view.
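Since the assignment also asks for database storage, one way to get there from the DataFrame is pandas' to_sql; the file name rankings.db and table name rankings below are illustrative choices, not part of the original code.

import sqlite3

# Sketch: also persist the DataFrame built above to SQLite;
# "rankings.db" and the table name "rankings" are illustrative choices
conn = sqlite3.connect("rankings.db")
df.to_sql("rankings", conn, if_exists="replace", index=False)
conn.commit()
conn.close()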
GIF:
Reflections:
For me the main difficulty was extracting the data from the payload.js file and getting the regular expression to match; the rest was fairly basic.