由于蚂蚁老师的课程视频录制之后博客园网站已经改版,视频中的代码不再适用于现有环境,故在网上查找了更新后的写法:爬取博客园文章列表

发布时间:2023-10-08 14:21:19 作者:慕雨师一
import json
import re
import requests
from bs4 import BeautifulSoup

# Crawl the first pages of the cnblogs home-page post list and save one
# tab-separated record (link, title, digg count) per post.

# Endpoint queried for each page; its response is parsed below as HTML.
LIST_URL = "https://www.cnblogs.com/AggSite/AggSitePostList"
# Output file, overwritten on every run.
OUTPUT_PATH = "博客爬取文章列表标题及地址.txt"
# Number of list pages to fetch (pages 1..PAGE_COUNT).
PAGE_COUNT = 20
# Per-request timeout in seconds, passed to requests.post.
REQUEST_TIMEOUT_SECONDS = 3
REQUEST_HEADERS = {
    "Content-Type": "application/json; charset=UTF-8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.47",
}
# The count is read from the <span> whose id starts with "digg".
DIGG_ID_PATTERN = re.compile(r"^digg")


def build_payload(page_index):
    """Return the JSON-serialisable request body for the given 1-based page."""
    return {
        "CategoryType": "SiteHome",
        "ParentCategoryId": 0,
        "CategoryId": 808,
        "PageIndex": page_index,
        "TotalPostCount": 4000,
        "ItemListActionName": "AggSitePostList",
    }


def format_record(href, title, digg_count):
    """Return one output line: ``href<TAB>title<TAB>digg_count<NEWLINE>``.

    Whitespace runs inside each field (including tabs and newlines) are
    collapsed to a single space and the ends are trimmed, so a field can
    never break the one-record-per-line, tab-separated layout.
    """
    fields = (" ".join(str(value).split()) for value in (href, title, digg_count))
    return "\t".join(fields) + "\n"


def fetch_page_html(page_index):
    """POST the list request for *page_index* and return the response text.

    Raises:
        RuntimeError: if the server answers with any status other than 200.
    """
    resp = requests.post(
        LIST_URL,
        data=json.dumps(build_payload(page_index)),
        headers=REQUEST_HEADERS,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    if resp.status_code != 200:
        raise RuntimeError(
            "page %d: unexpected HTTP status %s" % (page_index, resp.status_code)
        )
    return resp.text


def parse_posts(html):
    """Yield ``(href, title, digg_count)`` for every post item in *html*.

    Items without a title link are skipped; a missing digg span yields an
    empty count instead of aborting the whole crawl.
    """
    soup = BeautifulSoup(html, "html.parser")
    for post_item in soup.find_all("article", class_="post-item"):
        link = post_item.find("a", class_="post-item-title")
        if link is None:
            continue
        digg_span = post_item.find("span", id=DIGG_ID_PATTERN)
        digg_count = digg_span.get_text() if digg_span is not None else ""
        yield link.get("href", ""), link.get_text(), digg_count


def main():
    """Fetch every page and write all post records to OUTPUT_PATH."""
    # The context manager closes the file even when a request or parse fails.
    with open(OUTPUT_PATH, "w", encoding="utf8") as out_file:
        for page_index in range(1, PAGE_COUNT + 1):
            print("#" * 50, page_index)
            for href, text, number in parse_posts(fetch_page_html(page_index)):
                print(href, text, number)
                out_file.write(format_record(href, text, number))
                print("success:%s, %s, %s" % (href, text, number))
            # Flush once per page so finished pages reach disk during the run.
            out_file.flush()


if __name__ == "__main__":
    main()