# Crawler for a Chinese university ranking page on shanghairanking.cn.
#
# Downloads the ranking page, extracts one row of text fields per table row,
# prints the first ten rows and saves the first five fields of every row to
# ``University_grade.csv``.
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Rows produced by the most recent fillUnivList() call; printUnivList() reads
# this list.  Each row is a list of strings.
allUniv = []

# One or more consecutive characters in the range U+4E00..U+9FFF.
_CHINESE_RUN = re.compile('[\u4e00-\u9fff]+')


def getHTMLText(url: str) -> str:
    """Download *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page text, or ``""`` when the request fails (connection error,
        timeout after 30 seconds, or an HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best-effort download: callers treat a failure as an empty page.
        return ""
    r.encoding = 'utf-8'
    return r.text


def fillUnivList(soup):
    """Rebuild the global ``allUniv`` from every table row found in *soup*.

    Rows (``<tr>``) without any ``<td>`` are skipped.  For each remaining row,
    every cell contributes up to two fields, in this order:

    1. the cell's ``.string`` with surrounding whitespace removed, when the
       cell has a ``.string`` and it is not the literal ``"[]"``;
    2. all runs of Chinese characters found in the cell's markup,
       concatenated, when there is at least one.

    ``allUniv`` is emptied first, so calling this function repeatedly does
    not accumulate duplicate rows.

    Args:
        soup: Parsed document exposing ``find_all``.

    Returns:
        The global ``allUniv`` list (the same object on every call).
    """
    allUniv.clear()
    for tr in soup.find_all('tr'):
        cells = tr.find_all('td')
        if not cells:
            continue
        singleUniv = []
        for td in cells:
            text = td.string
            if text is not None and text != "[]":
                # Stripping here replaces the fixed slice offsets the printing
                # code used to apply, which only worked for one exact amount
                # of leading whitespace.
                # NOTE(review): assumes these cells hold only padding around
                # the value -- confirm against the live page.
                singleUniv.append(text.strip())
            chinese = _CHINESE_RUN.findall(str(td))
            if chinese:
                singleUniv.append("".join(chinese))
        allUniv.append(singleUniv)
    return allUniv


def printUnivList(num: int) -> None:
    """Print a header line followed by at most *num* rows of ``allUniv``.

    The rows themselves are not modified.  Fewer than *num* rows are printed
    when ``allUniv`` is shorter (for example after a failed download).

    Args:
        num: Maximum number of rows to print.

    Raises:
        IndexError: If a printed row has fewer than five fields.
    """
    print("{:^5}{:^4}{:^5}{:^10}{:^10}".format("排名", "学校名称", "省市", "类型", "总分"))
    for u in allUniv[:max(num, 0)]:
        # The name is cut to four characters for display only.
        # NOTE(review): presumably the name cell's Chinese text carries extra
        # labels after the name; names longer than four characters are cut
        # short -- confirm against the live page.
        print("{:^5} {:^4}{:^5}{:^10}{:^10}".format(u[0], u[1][:4], u[2], u[3], u[4]))


def main(flag):
    """Fetch and parse the ranking page.

    Args:
        flag: ``0`` prints the first ten rows and returns ``None``; any other
            value returns the scraped rows without printing.

    Returns:
        ``None`` when *flag* is ``0``, otherwise the list of rows (the global
        ``allUniv``).
    """
    url = 'https://www.shanghairanking.cn/rankings/bcur/201611'
    html = getHTMLText(url)
    soup = BeautifulSoup(html, "html.parser")
    list1 = fillUnivList(soup)
    if flag == 0:
        printUnivList(10)
        return None
    return list1


def combination(list1, count: int) -> list:
    """Return the field at index *count* of every row in *list1*.

    Raises:
        IndexError: If any row has no field at index *count*.
    """
    return [row[count] for row in list1]


def deal_data(list1):
    """Build a DataFrame from the first five fields of every row in *list1*.

    Args:
        list1: Rows as returned by ``main(1)``.

    Returns:
        A ``pandas.DataFrame`` with the columns 排名, 学校名称, 省市, 类型 and
        总分, taken from field indices 0 to 4 in that order.
    """
    columns = ("排名", "学校名称", "省市", "类型", "总分")
    return pd.DataFrame(
        {name: combination(list1, idx) for idx, name in enumerate(columns)}
    )


if __name__ == "__main__":
    # One download serves both the console preview and the CSV export.
    list1 = main(1)
    printUnivList(10)
    data = deal_data(list1)
    data.to_csv('University_grade.csv', index=False)
# 爬虫作业—2022310143137—黄志涛
# 发布时间: 2023-12-12 14:36:00  作者: 鑺鑵