爬虫单元作业

发布时间:2023-12-25 19:16:52　作者:李想 2022310143126

(2)请用 requests 库的 get() 函数访问如下一个网站 20 次,打印返回状态和 text 属性的内容,并计算 text 属性和 content 属性所返回网页内容的长度。(不同学号选做如下网页,必做及格)

import requests

url = "https://www.baidu.com/"

# Fetch the page 20 times. For every attempt print the HTTP status code, the
# decoded body, and the sizes of the decoded (``text``) and raw (``content``)
# bodies so the two can be compared.
for attempt in range(1, 21):
    print(f"第{attempt}次访问:")
    try:
        # Without a timeout a single stalled connection would block the
        # whole run indefinitely.
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # One failed attempt should not abort the remaining ones.
        print("请求失败:", exc)
        print("")
        continue

    # When the server declares no charset, requests falls back to ISO-8859-1
    # for text/* responses, which garbles non-Latin pages. In that case use
    # the encoding detected from the body instead.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding

    print("返回状态:", response.status_code)
    print("text()内容:", response.text)
    # ``text`` is a str (characters) while ``content`` is bytes, so the two
    # lengths legitimately differ for multi-byte encodings.
    print("text()内容长度:", len(response.text))
    print("content属性所返回网页内容长度:", len(response.content))
    print("")

 

(3)这是一个简单的html页面,请保存为字符串,完成后面的计算要求。(良好)

from bs4 import BeautifulSoup

html_doc = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>菜鸟教程(runoob.com)</title>
</head>
<body>
<h1>我的第一个标题</h1>
<p id="first">我的第一个段落。</p>
</body>
<table border="1">
<tr>
<td>row 1, cell 1</td>
<td>row 1, cell 2</td>
</tr>
<tr>
<td>row 2, cell 1</td>
<td>row 2, cell 2</td>
</tr>
</table>
</html>
"""


def _is_cjk_ideograph(ch):
    """Return True when *ch* lies in the range U+4E00..U+9FFF."""
    return '\u4e00' <= ch <= '\u9fff'


# Parse the document once; every query below reads from this tree.
soup = BeautifulSoup(html_doc, 'html.parser')

print("学号:26")

# a: the <head> element, b: the <body> element.
print("a:", soup.find("head"))
print("b:", soup.find("body"))

# c: the element whose id attribute is "first".
first_paragraph = soup.find(attrs={"id": "first"})
print("c:", first_paragraph)

# d: every Chinese character of the raw HTML string, in document order.
chinese_text = "".join(filter(_is_cjk_ideograph, html_doc))
print("d:", chinese_text)

 

(4) 爬取中国大学排名网站内容:

https://www.shanghairanking.cn/rankings/bcur/201811

import requests
from bs4 import BeautifulSoup
import csv

# Download the ranking page.
# NOTE(review): the assignment text links to .../bcur/201811 while this script
# fetches 201611 (and names its output file "2016") -- confirm the intended year.
url = "https://www.shanghairanking.cn/rankings/bcur/201611"
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, timeout=10)
# Stop on 4xx/5xx instead of trying to parse an error page.
response.raise_for_status()
response.encoding = 'utf-8'
html_content = response.text

# Parse the page and locate the ranking table.
soup = BeautifulSoup(html_content, 'html.parser')
table = soup.find('table', {'class': 'rk-table'})
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the next line.
    raise RuntimeError("未找到排名表格(class='rk-table'),页面结构可能已变化")
rows = table.find_all('tr')[1:]  # skip the header row

# Collect [university name, rank] pairs, one per table row.
data = []
for row in rows:
    cols = row.find_all('td')
    if len(cols) < 2:
        # Not a data row (fewer cells than the rank and name columns need).
        continue
    name_link = cols[1].find('a', class_='name-cn')
    if name_link is None:
        # The row carries no Chinese-name link; nothing to record.
        continue
    university_name = name_link.text.strip()
    university_rank = cols[0].text.strip()
    data.append([university_name, university_rank])

# Write the result as CSV. newline='' lets the csv module control line endings.
with open('university_ranking_2016.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['大学名称', '排序'])  # header row
    writer.writerows(data)  # data rows