Web Scraping Homework

Posted: 2023-12-29 00:32:18  Author: 廖晟崴

Fetching the Sogou homepage

import requests

url = "https://www.sogou.com"

# Request the Sogou homepage 20 times, printing the status code, the page text,
# and the lengths of the text (characters) and content (bytes) attributes each time
for _ in range(20):
    response = requests.get(url)
    print(f"Status code: {response.status_code}")
    text_length = len(response.text)
    content_length = len(response.content)
    print(f"Page text (text attribute): {response.text}")
    print(f"Length of the text attribute: {text_length} characters")
    print(f"Length of the content attribute: {content_length} bytes")
    print("-" * 30)
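As an aside on the two attributes printed above: content is the raw byte payload of the response, while text is that payload decoded to a string, so their lengths differ for multi-byte encodings such as UTF-8 Chinese. A minimal sketch (reusing the same URL; the manual decode is only an illustration of what requests does internally, not part of the assignment):

import requests

response = requests.get("https://www.sogou.com", timeout=10)

# content is the raw bytes; text is those bytes decoded with response.encoding
raw_bytes = response.content
decoded_text = raw_bytes.decode(response.encoding or "utf-8", errors="replace")

print(len(raw_bytes), len(decoded_text))  # byte length vs character length
print(decoded_text == response.text)      # usually True when the header declares the charset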

Run results

Using bs4

from bs4 import BeautifulSoup
import re


text = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>菜鸟教程(runoob.com)</title>
</head>
<body>
<h1>我的第一个标题</h1>
<p id="first">我的第一个段落。</p>
<table border="1">
<tr>
<td>row 1, cell 1</td>
<td>row 1, cell 2</td>
</tr>
<tr>
<td>row 2, cell 1</td>
<td>row 2, cell 2</td>
</tr>
</table>
</body>
</html>
"""
# Create the BeautifulSoup object with the built-in html.parser
soup = BeautifulSoup(text, features="html.parser")

# Print the head tag, followed by the class and student number
print(soup.head.prettify())
print("22信计1班23号\n")

# Get and print the body tag
print(soup.body.prettify())

# Get the element whose id is "first"
first_p = soup.find(id="first")
print(first_p)

# Find and print every run of Chinese characters in the document
pattern = re.compile('[\u4e00-\u9fff]+')
chinese_chars = pattern.findall(text)
print(chinese_chars)
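As an extra sketch beyond the assignment, the same find_all pattern used above can flatten the sample table into a list of rows; the ranking crawler in the next section relies on exactly this tr/td traversal:

# Extract the table into a list of rows, reusing the soup object defined above
rows = []
for tr in soup.find_all('tr'):
    cells = [td.get_text(strip=True) for td in tr.find_all('td')]
    if cells:
        rows.append(cells)
print(rows)  # [['row 1, cell 1', 'row 1, cell 2'], ['row 2, cell 1', 'row 2, cell 2']]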

Run results


Chinese university rankings

import requests
from bs4 import BeautifulSoup
import csv

all_univ = []


def get_html_text(url):
    # Fetch the page and return its text; return "" on any request error
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return ""


def fill_univ_list(soup):
    # Walk every table row; skip rows that do not contain the expected five cells
    data = soup.find_all('tr')
    for tr in data:
        ltd = tr.find_all('td')
        if len(ltd) < 5:
            continue
        # Columns: rank, school name (inside an <a class="name-cn">), province, total score
        single_univ = [ltd[0].string.strip(), ltd[1].find('a', 'name-cn').string.strip(), ltd[2].text.strip(),
                       ltd[4].string.strip()]
        all_univ.append(single_univ)


def print_univ_list(num):
    # Print the top `num` universities, padded with the full-width space chr(12288)
    # so the Chinese columns line up, and write the same rows to a CSV file
    file_name = "大学排行.csv"
    print("{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}".format("排名", "学校名称", "省市", "总分", chr(12288)))
    with open(file_name, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["排名", "学校名称", "省市", "总分"])
        for i in range(num):
            u = all_univ[i]
            writer.writerow(u)
            print("{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}".format(u[0], u[1], u[2], u[3], chr(12288)))


def main(num):
    # Download the 2019 Best Chinese Universities ranking page, parse it,
    # and print/save the top `num` entries
    url = "https://www.shanghairanking.cn/rankings/bcur/201911.html"
    html = get_html_text(url)
    soup = BeautifulSoup(html, features="html.parser")
    fill_univ_list(soup)
    print_univ_list(num)

print("22信计1班23号")
main(20)
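One fragility worth noting: the .string lookups in fill_univ_list return None when a cell contains nested tags, and find('a', 'name-cn') returns None if that class name ever changes, either of which raises an AttributeError. A more defensive variant of the extraction step could look like the sketch below (same column layout assumed as in the original; whether it still matches the live page is not verified here):

def fill_univ_list_safe(soup):
    # Defensive variant: use get_text() instead of .string and skip rows
    # where the expected name link is missing
    for tr in soup.find_all('tr'):
        ltd = tr.find_all('td')
        if len(ltd) < 5:
            continue
        name_tag = ltd[1].find('a', 'name-cn')
        if name_tag is None:
            continue
        all_univ.append([
            ltd[0].get_text(strip=True),
            name_tag.get_text(strip=True),
            ltd[2].get_text(strip=True),
            ltd[4].get_text(strip=True),
        ])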

Run results

Converting the results to a CSV file

import csv

# Top-20 ranking rows (header row first) to be written to the CSV file
data = [
    ["排名", "学校名称", "省市", "总分"],
    [1, "清华大学", "北京", 94.6],
    [2, "北京大学", "北京", 76.5],
    [3, "浙江大学", "浙江", 72.9],
    [4, "上海交通大学", "上海", 72.1],
    [5, "复旦大学", "上海", 65.6],
    [6, "中国科学技术大学", "安徽", 60.9],
    [7, "华中科技大学", "湖北", 58.9],
    [7, "南京大学", "江苏", 58.9],
    [9, "中山大学", "广东", 58.2],
    [10, "哈尔滨工业大学", "黑龙江", 56.7],
    [11, "北京航空航天大学", "北京", 56.3],
    [12, "武汉大学", "湖北", 56.2],
    [13, "同济大学", "上海", 55.7],
    [14, "西安交通大学", "陕西", 55.0],
    [15, "四川大学", "四川", 54.4],
    [16, "北京理工大学", "北京", 54.0],
    [17, "东南大学", "江苏", 53.6],
    [18, "南开大学", "天津", 52.8],
    [19, "天津大学", "天津", 52.3],
    [20, "华南理工大学", "广东", 52.0]
]

# Write with a UTF-8 BOM (utf-8-sig) so Excel displays the Chinese text correctly
with open('university_ranking.csv', 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)
    writer.writerows(data)
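
To confirm the file came out as intended, a quick read-back sketch (same file name as above):

import csv

# Read the ranking file back and print each row; utf-8-sig strips the BOM added on write
with open('university_ranking.csv', newline='', encoding='utf-8-sig') as file:
    for row in csv.reader(file):
        print(row)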