作业一:要求:在中国气象网(http://www.weather.com.cn)爬取给定城市集合的7日天气预报,并保存在数据库。
Gitee文件夹链接:https://gitee.com/guo-hengxin/102102150/tree/master/%E7%AC%AC%E4%BA%8C%E6%AC%A1%E4%BD%9C%E4%B8%9A
代码:
# Crawl the 7-day weather forecast for a set of cities from weather.com.cn
# and persist the results into a SQLite database (assignment 1).
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3

# City name -> weather.com.cn station code.
# The assignment asks for a *set* of cities; add entries here to crawl more
# cities instead of editing the hard-coded URL.
CITIES = {"北京": "101010100"}

DB_PATH = "weather_data.db"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47"}


def create_table(cursor):
    """Create the WeatherData table if it does not exist yet."""
    cursor.execute('''
    CREATE TABLE IF NOT EXISTS WeatherData (
        id INTEGER PRIMARY KEY,
        area TEXT,
        date TEXT,
        weather TEXT,
        temperature TEXT
    )
    ''')


def fetch_forecast(city_code):
    """Download and parse the 7-day forecast page for one station code.

    Returns a list of (date, weather, temperature) tuples.
    """
    url = "http://www.weather.com.cn/weather/%s.shtml" % city_code
    req = urllib.request.Request(url, headers=HEADERS)
    # Close the socket deterministically instead of leaking the response.
    with urllib.request.urlopen(req) as resp:
        raw = resp.read()
    # The site serves either utf-8 or gbk; let UnicodeDammit detect which.
    dammit = UnicodeDammit(raw, ["utf-8", "gbk"])
    soup = BeautifulSoup(dammit.unicode_markup, "lxml")
    rows = []
    for li in soup.select("ul[class='t clearfix'] li"):
        try:
            date = li.select('h1')[0].text
            weather = li.select('p[class="wea"]')[0].text
            # In the evening the "today" entry has no <span> (no daytime
            # temperature), so fall back to the <i> element alone instead
            # of raising IndexError for that row.
            spans = li.select('p[class="tem"] span')
            low = li.select('p[class="tem"] i')[0].text
            temp = (spans[0].text + "/" + low) if spans else low
            rows.append((date, weather, temp))
        except IndexError as err:
            # A malformed <li>: report it and keep the remaining days.
            print(err)
    return rows


def main():
    """Crawl every configured city, store its forecast, then read it back."""
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        create_table(cursor)
        print("{:<10}\t{:<10}\t{:<10}\t\t{:<10}{:<10}".format("id", "地区", "日期", "天气信息", "温度"))
        i = 0
        for area, code in CITIES.items():
            try:
                for date, weather, temp in fetch_forecast(code):
                    i = i + 1
                    cursor.execute('''
                    INSERT INTO WeatherData (area, date, weather, temperature) VALUES (?, ?, ?, ?)
                    ''', (area, date, weather, temp))
                    print("{:<10}\t{:<10}\t{:<10}\t{:<10}\t{:<10}".format(i, area, date, weather, temp))
            except Exception as err:
                # Network failure for one city: report it and continue with
                # the next city rather than aborting the whole crawl.
                print(err)
        conn.commit()
    finally:
        conn.close()

    # Re-open the database and dump the table to verify persistence.
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    for row in c.execute('SELECT * FROM WeatherData'):
        print(row)
    conn.close()


if __name__ == "__main__":
    main()
运行结果:
心得体会:通过这次爬取,我对 BeautifulSoup 的使用更加熟练了。
作业二:要求:用requests和自选提取信息方法定向爬取股票相关信息,并存储在数据库中。
候选网站:东方财富网:https://www.eastmoney.com/ ;新浪股票:http://finance.sina.com.cn/stock/
Gitee文件夹链接:https://gitee.com/guo-hengxin/102102150/tree/master/%E7%AC%AC%E4%BA%8C%E6%AC%A1%E4%BD%9C%E4%B8%9A
代码:
# Crawl stock quotes from the Eastmoney JSONP API for user-chosen pages
# and store them into a SQLite database (assignment 2).
import json
import sqlite3

# JSON field codes in the order of the my_table columns after `id`
# (code, name, latest price, change %, change amount, volume, turnover,
# amplitude, high, low, open, yesterday close, volume ratio, turnover rate,
# P/E ratio, P/B ratio).
FIELDS = ['f12', 'f14', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7',
          'f15', 'f16', 'f17', 'f18', 'f10', 'f8', 'f9', 'f23']


def parse_jsonp(text):
    """Strip the JSONP wrapper ``callback({...})`` and return the parsed JSON.

    The API wraps its payload in a jQuery callback; everything between the
    first '(' and the last ')' is the JSON document.
    """
    start = text.find('(')
    end = text.rfind(')')
    return json.loads(text[start + 1:end])


def main():
    """Prompt for page numbers, fetch each page, store and print the rows."""
    # Third-party dependency imported locally so the module itself stays
    # importable (e.g. for testing parse_jsonp) without requests installed.
    import requests

    conn = sqlite3.connect('my_database.db')
    cursor = conn.cursor()

    # Table schema: one row per quote, keyed by a running serial number.
    cursor.execute('''CREATE TABLE IF NOT EXISTS my_table (
        id INTEGER PRIMARY KEY,
        code TEXT,
        name TEXT,
        latest_price REAL,
        change_percent REAL,
        change_amount REAL,
        volume INTEGER,
        turnover REAL,
        amplitude REAL,
        high REAL,
        low REAL,
        opening_price REAL,
        yesterday_close REAL,
        volume_ratio REAL,
        turnover_rate REAL,
        pe_ratio REAL,
        pb_ratio REAL
    )''')

    headers = {
        'user-agent' :'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47',
        'cookie' : 'qgqp_b_id=a7c5d47be8ad882fee56fc695bab498d; st_si=17803153105309; st_asi=delete; HAList=ty-0-300045-%u534E%u529B%u521B%u901A; st_pvi=56620321867111; st_sp=2023-10-07%2015%3A19%3A51; st_inirUrl=https%3A%2F%2Fwww.eastmoney.com%2F; st_sn=52; st_psi=20231007155656228-113200301321-9129123788'
    }

    keypage = input("请输入要搜索的特定页面(用空格分隔):")
    searchlist = list(map(int, keypage.split()))

    print("{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}{:<10}".format("序号","代码","名称","最新价","涨跌幅","涨跌额","成交量","成交额","振幅","最高","最低","今开","昨收","量比","换手率","市盈率","市净率"))

    # BUG FIX: the original restarted the explicit id at 1 for every page,
    # which violates the INTEGER PRIMARY KEY as soon as a second page is
    # inserted; keep one serial number running across all pages instead.
    serial = 0
    for page in searchlist:
        response = requests.get(url=f'http://76.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112405166990298085778_1696666115151&pn={page}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696666115152',headers=headers)
        diff = parse_jsonp(response.text)['data']['diff']

        for item in diff:
            serial += 1
            row = (serial,) + tuple(item[field] for field in FIELDS)
            cursor.execute('INSERT INTO my_table VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?)', row)
            # BUG FIX: print every page as it is fetched; the original only
            # printed the data of the *last* requested page after the loop.
            for value in row:
                print("{:<10}".format(value), end="")
            print()

    conn.commit()  # commit once after all pages, then close the connection
    conn.close()


if __name__ == "__main__":
    main()
运行结果:
心得体会:我更加熟悉了对json格式的信息提取的方法以及对轻量数据库sqlite3的使用
作业三:要求:爬取中国大学2021主榜(https://www.shanghairanking.cn/rankings/bcur/2021)所有院校信息,并存储在数据库中,同时将浏览器F12调试分析的过程录制Gif加入至博客中。
Gitee文件夹链接:https://gitee.com/guo-hengxin/102102150/tree/master/%E7%AC%AC%E4%BA%8C%E6%AC%A1%E4%BD%9C%E4%B8%9A
调试分析的过程:
代码:
# Crawl the 2021 ShanghaiRanking main list (BCUR) from the site's static
# payload.js and store (rank, name, score) rows into SQLite (assignment 3).
import re
import sqlite3


def extract_rankings(text):
    """Extract (name, score) pairs from the payload.js source text.

    BUG FIX: the raw regex captures include the surrounding double quotes of
    the JavaScript string literals, so the original stored names like
    '"清华大学"' in the database; strip the quotes before returning.
    """
    namelist = re.findall(r"univNameCn:(.*?),univNameEn:", text)
    scorelist = re.findall(r"score:(.*?),", text)
    return [(name.strip('"'), score.strip('"'))
            for name, score in zip(namelist, scorelist)]


def main():
    """Download the payload, print the ranking table and persist it."""
    # Third-party dependency imported locally so the module itself stays
    # importable (e.g. for testing extract_rankings) without requests.
    import requests

    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.46'
    }

    conn = sqlite3.connect('daxue.db')
    cursor = conn.cursor()
    cursor.execute('''CREATE TABLE IF NOT EXISTS daxue (
        id INTEGER PRIMARY KEY,
        name text,
        score text
    )''')

    url = 'https://www.shanghairanking.cn/_nuxt/static/1697106492/rankings/bcur/2021/payload.js'
    response = requests.get(url=url, headers=headers)

    print("{:<10}{:<10}{:<10}".format("排名","学校","总分"))
    for rank, (name, score) in enumerate(extract_rankings(response.text), start=1):
        print("{:<10}{:<10}{:<10}".format(rank, name, score))
        cursor.execute('INSERT INTO daxue VALUES (?, ?, ?)', (rank, name, score))

    conn.commit()  # commit all rows, then close the connection
    conn.close()


if __name__ == "__main__":
    main()
运行结果:
心得体会:我更加熟悉re库的使用以及对网页的抓包