2023 Data Collection Practice 2

Published: 2023-10-19 01:48:01  Author: Scholaar

Assignment ①:

Code and screenshots

from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import urllib.request
import sqlite3


# Weather database (SQLite storage)
class WeatherDB:
    def __init__(self):
        self.cursor = None
        self.con = None

    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers (wCity varchar(16),"
                "wDate varchar(16),"
                "wWeather varchar(64),"
                "wTemp varchar(32),"
                "constraint pk_weather primary key (wCity,wDate))")
        except Exception as err:
            print(err)
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


# Weather forecast crawler
class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) "
                          "Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " 找不到代码")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            x = 0
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    if x == 0:  # today's entry only has a single temperature, e.g. <i>14℃</i>
                        x += 1
                        temp = li.select('p[class="tem"] i')[0].text
                    else:
                        temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)

        # self.db.show()
        self.db.closeDB()


ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")


Personal reflections

At first the part I was least clear about was saving the results to the database, but after reading up on it the problem was quickly solved. The task itself is one we had done before, so there was nothing particularly difficult.

Assignment ②:

  • Requirement: use the requests and BeautifulSoup libraries to crawl stock information from the target site and store it in a database.

  • Folder link: Gitee

Code and screenshots

'''
Data source: 东方财富网 (Eastmoney) - 行情中心 quote center
http://quote.eastmoney.com/center
'''

import requests
import re
import pandas as pd
import sqlite3

def create_db(conn, table_name):
    # SQL statement to create the table
    create_table_sql = '''
    CREATE TABLE IF NOT EXISTS {} (
        序号 INTEGER PRIMARY KEY,
        代码 TEXT,
        名称 TEXT,
        最新价 TEXT,
        涨跌幅 TEXT,
        涨跌额 TEXT,
        成交量 TEXT,
        成交额 TEXT,
        振幅 TEXT
    );
    '''.format(table_name)

    # Execute the SQL statement
    conn.execute(create_table_sql)
    
def insert_data(conn, table_name, data):
    # SQL statement to insert one row
    insert_data_sql = '''
    INSERT INTO {} (代码, 名称, 最新价, 涨跌幅, 涨跌额, 成交量, 成交额, 振幅)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?);
    '''.format(table_name)

    # Execute the SQL statement with the row values
    conn.execute(insert_data_sql, (
        data['代码'], data['名称'], data['最新价'],
        data['涨跌幅'], data['涨跌额'], data['成交量'],
        data['成交额'], data['振幅']
    ))

# Request the server with GET and return the page text
def getHtml(cmd):
    url = "http://8.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409706851180828395_1696658785167&pn=1&pz=20&po=1&np=2&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=" + cmd + "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696658785168"
    r = requests.get(url)
    return r.text

# Extract the stock data from the page text
def get_stock_data(text):
    com = re.compile(r'"f2":(?P<end>.+?),.*?"f6":(?P<volume>.+?),.*?"f3":(?P<up_dowm>.+?),.?"f4":(?P<up_dowm_price>.+?),.?"f5":(?P<end_good>.+?),.?"f7":(?P<zhangfu>.+?),.?"f12":"(?P<number>.+?)",.*?"f14":"(?P<name>.+?)"'
                     ',.*?"f15":(?P<max>.+?),.*?"f16":(?P<min>.+?),.*?"f17":(?P<start>.+?),', re.S)
    ret = com.finditer(text)
    count = 1
    for i in ret:
        yield {
            '序号': count,
            '代码': i.group('number'),
            '名称': i.group('name'),
            '最新价': i.group('end'),
            '涨跌幅': i.group('up_dowm'),
            '涨跌额': i.group('up_dowm_price'),
            '成交量': i.group('end_good'),
            '成交额': i.group('volume'),
            '振幅': i.group('zhangfu'),
        }
        count += 1

def main():
    cmd = {
        "深沪京A股": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
        "上证A股": "m:1+t:2,m:1+t:23",
        "深圳A股": "m:0+t:6,m:0+t:80",
    }
    # Iterate over the cmd dict; each entry gets its own table and database file
    for table_name, cmd_value in cmd.items():
        stocks_data = getHtml(cmd_value)
        stocks = get_stock_data(stocks_data)

        # Connect to (or create) the database
        conn = sqlite3.connect(table_name + '.db')

        # Create the table
        create_db(conn, table_name)

        # Insert the data
        for stock in stocks:
            insert_data(conn, table_name, stock)

        # Commit the changes and close the connection
        conn.commit()
        conn.close()
        print(f"{table_name}.db 数据库保存完毕!")

if __name__ == '__main__':
    main()


I also wrote a version that saves the data to a CSV file so it can be inspected directly.
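
A minimal sketch of that CSV version, reusing the getHtml and get_stock_data helpers above (the file name and the utf-8-sig encoding are my own choices):

import pandas as pd

def save_csv(cmd_value, filename):
    # Fetch the raw text and collect the parsed rows from the generator above
    rows = list(get_stock_data(getHtml(cmd_value)))
    # utf-8-sig lets Excel display the Chinese column headers correctly
    pd.DataFrame(rows).to_csv(filename, index=False, encoding='utf-8-sig')

save_csv("m:1+t:2,m:1+t:23", "上证A股.csv")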

Reflections:

While analysing the URL of the JSON endpoint, I noticed a parameter named "np". When the number after it is changed to 2, the API returns the data from the first page to the last page in a single response, which removes the need to page through the results.
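
To make that observation explicit, here is a small sketch that exposes np as a function argument. I drop the cb callback and ut token and trim the fields list for brevity (add them back if the server rejects the request), and the behaviour of np is only what I observed in my own test:

import requests

def get_stock_text(fs, np=2, pn=1, pz=20):
    # np=2 appears to return every page at once; otherwise pn/pz select one page
    url = ("http://8.push2.eastmoney.com/api/qt/clist/get"
           f"?pn={pn}&pz={pz}&po=1&np={np}&fltt=2&invt=2&fid=f3"
           f"&fs={fs}&fields=f2,f3,f4,f5,f6,f7,f12,f14")
    return requests.get(url).text

print(len(get_stock_text("m:1+t:2,m:1+t:23")))        # all pages in one response
print(len(get_stock_text("m:1+t:2,m:1+t:23", np=1)))  # presumably a single page of 20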

Assignment ③:

Code and screenshots

import requests
import re
import pandas as pd
from fake_useragent import UserAgent

# Create a random User-Agent generator
user_agent = UserAgent()
# Set the request headers
headers = {
    'User-Agent': user_agent.random,
}

url = "https://www.shanghairanking.cn/_nuxt/static/1697106492/rankings/bcur/2021/payload.js"

response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
content = response.text

# Extract the information with regex capture groups
pattern = r'univNameCn:"(.*?)".*?score:(.*?),.*?province:(.*?),'
data = re.findall(pattern, content)

# Build the DataFrame
df = pd.DataFrame(data, columns=["学校", "分数", "省份"])

# Save as a CSV file
df.to_csv('rankings.csv', index=False, encoding='utf-8-sig')

Saving directly to a CSV file makes the results easy to view.

GIF:

Reflections:

Personally, I think the main difficulty is locating and extracting the payload.js file and getting the regular expression to match it; the rest is fairly basic.
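
To make the capture groups concrete, here is a tiny demo of the same pattern on a made-up one-line fragment that only mimics the shape of payload.js (the real file is a much longer JS expression, so treat this purely as an illustration):

import re

# A fabricated fragment in roughly the shape of the real payload.js
sample = 'univNameCn:"某某大学",univNameEn:"Some University",score:123.4,province:"福建",city:"福州",'
pattern = r'univNameCn:"(.*?)".*?score:(.*?),.*?province:(.*?),'
print(re.findall(pattern, sample))
# -> [('某某大学', '123.4', '"福建"')]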