微服务学的蛮多的 笔记没传 就这样吧
爬虫 基础01
1.爬虫分类
通用爬虫:
聚焦爬虫
功能爬虫
增量式爬虫
分布式爬虫
2.requests基础操作
1.环境安装
案例1 搜狗首页数据爬取和持久化存储
import requests

# Case 1: fetch the Sogou homepage and persist the raw HTML to disk.
sogou_url = 'https://www.sogou.com/'
resp = requests.get(sogou_url)
resp.encoding = 'utf-8'  # decode the response body as UTF-8
html = resp.text
with open('sogou.html', 'w', encoding='utf8') as fp:
    fp.write(html)
print("获取数据成功并写入文件")
案例2 搜狗关键字搜索代码
https://www.sogou.com/web?query=jay
//1.利用字典动态插入 数据 伪造UA进行简单反爬处理
import requests

# Case 2: Sogou keyword search. The query is passed via `params`, and a
# browser User-Agent is spoofed as a minimal anti-anti-crawling measure.
search_url = 'https://www.sogou.com/web'
keyword = input('请输入你要搜索的内容:')
query_params = {
    'query': keyword,  # further parameters can be added here dynamically
}
ua_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'
}
resp = requests.get(search_url, params=query_params, headers=ua_headers)
resp.encoding = 'utf-8'  # decode the response body as UTF-8
page_html = resp.text
out_name = keyword + '.html'
with open(out_name, 'w', encoding='utf8') as fp:
    fp.write(page_html)
print("获取数据成功并写入文件")
案例3 豆瓣爱情电影爬取
//emm 其实也就是找对正确的接口 --因为某些是动态加载数据 然后经过前端呈现
https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=
真实地址
https://movie.douban.com/j/chart/top_list
?
type=13&interval_id=100%3A90&action=&start=0&limit=20
import requests

# Case 3: Douban "romance" chart. The visible page is rendered from an AJAX
# endpoint, so we call the JSON API directly instead of scraping the HTML.
# (The original fragment used undefined url/pram/head; they are defined here
# from the endpoint noted above.)
url = 'https://movie.douban.com/j/chart/top_list'
pram = {
    'type': '13',             # 13 = romance category
    'interval_id': '100:90',  # rating percentile band
    'action': '',
    'start': '0',             # offset of the first movie
    'limit': '20',            # movies per request
}
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'
}
r = requests.get(url, params=pram, headers=head)
r.encoding = 'utf-8'  # decode the response body as UTF-8
page_textjson = r.json()  # the endpoint returns a JSON list of movie dicts
fileName = "豆瓣爱情故事" + '.html'
# `with` guarantees the handle is closed; an explicit encoding avoids the
# platform-dependent default (which breaks Chinese text on Windows).
with open(fileName, 'w', encoding='utf-8') as fp:
    for dic in page_textjson:
        title = dic['title']
        score = dic['score']
        fp.write(title + ':' + score + '\n')
print("获取数据成功并写入文件")
案例4 肯德基地址查询
//就是获取分页查询的数据
import requests

# Case 4: KFC store lookup — query the paginated store-list endpoint by a
# fuzzy address keyword and dump the results to a file.
# NOTE(review): newer versions of this endpoint are usually served via POST —
# confirm GET still works before relying on it.
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
keyword = input("请输入查询的模糊地址:")
pram = {
    "cname": "",
    "pid": "",
    "keyword": keyword,
    "pageIndex": "1",   # first page only; bump to paginate
    "pageSize": "10",
}
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'
}
r = requests.get(url, params=pram, headers=head)
r.encoding = 'utf-8'  # decode the response body as UTF-8
page_textjson = r.json()
print(page_textjson)
fileName = "肯德基爷爷" + '.html'
# FIX: the original left the handle open and used the platform-default
# encoding; `with` + explicit utf-8 makes the write safe and deterministic.
with open(fileName, 'w', encoding='utf-8') as fp:
    for dic in page_textjson['Table1']:
        title = dic['storeName']
        score = dic['addressDetail']
        zhichi = dic["pro"]
        fp.write("商店名字" + title + ':' + "商店地址" + score + "娱乐支持" + zhichi + '\n')
print("获取数据成功并写入文件")
案例5 **局(前一案例已是案例4,此处编号顺延)
某些原因 不敢运行 不敢测试效果
批量获取 先写通过ID获取的爬虫模拟 再去首页分页查询拿到ID值
就可以得到了
3.聚焦爬虫
1.图片资源爬取
1.二进制写
# Binary download: `r.content` is the raw bytes of the response body, so the
# file must be opened in 'wb' mode — no text decoding involved.
# NOTE(review): `url` and `head` are defined elsewhere in these notes.
r = requests.get(url, headers=head)
img_data=r.content
with open('./123.jpg','wb') as fp:
    fp.write(img_data)
2.导入模块自动读写
# urllib helper: downloads the URL and writes the file in one call
# (`request` here is urllib.request, imported as `from urllib import request`).
request.urlretrieve(url,'./456.jpg')
2.正则使用
url http://img.itlun.cn/news/60e799893.html
将该页面的所有图片进行保存
import requests
from urllib import request
import re

# Download every image from the article page by regex-extracting <img src>.
# FIX: removed the unused `pram` dict (a leftover copy of the KFC example's
# query parameters — it was never passed to the request).
url = 'http://img.itlun.cn/news/60e799893.html'
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'
}
r = requests.get(url, headers=head)
r.encoding = 'gbk'  # this site serves GBK-encoded HTML
page_text = r.text

# Match the src attribute of an <img> that directly follows a <p> tag.
# re.S lets the pattern span line breaks inside the markup.
ex = r'<p[^>]*>\s*<img[^>]*\s+src="([^"]+)"'
img_src_list = re.findall(ex, page_text, re.S)
for img_src in img_src_list:
    # Use the last path segment of the URL as the local file name.
    img_name = img_src.split('/')[-1]
    # NOTE(review): assumes src holds an absolute URL — lazily-loaded pages
    # often keep the real address in a data-* attribute; verify if this fails.
    request.urlretrieve(img_src, img_name)
3.bs4 数据提取
bs4安装
from bs4 import BeautifulSoup

# bs4 basics: parse a local HTML file and demonstrate the common ways of
# locating tags and extracting text / attribute values.
# FIX: `with` closes the handle — the original never closed `fp`.
with open('test.html', 'r', encoding='utf-8') as fp:
    # Parse the local file content.
    soup = BeautifulSoup(fp, 'lxml')

# Tag-name access: returns only the FIRST matching tag in the document.
tag1 = soup.title
tag2 = soup.div

# Attribute filters: find('tag', attr='value') returns the first match,
# findAll() returns every match.
tag3 = soup.find('div', class_='song')
tag4 = soup.findAll('a', id='feng')

# CSS selectors: '#id', '.class', and hierarchical combinators.
tag6 = soup.select('#feng')
tag7 = soup.select('.song')
tag8 = soup.select('.tang>ul')   # '>' = direct child
tag9 = soup.select('.tang li')   # space = descendant at any depth

# Text extraction:
#   tag.string -> only the tag's own direct text
#   tag.text   -> all text anywhere in the subtree
p_tag = soup.p
# BUG FIX: the original used `p_tag.String` (capital S), which bs4 resolves
# as a lookup for a child <String> tag and therefore printed None.
print(p_tag.string)
print(p_tag.text)
div_tag = soup.find('div', class_='song')
print('分隔符')

# Attribute values are read with dict-style indexing.
img_tag = soup.img
print(img_tag['src'])
img_taga = soup.a
print(img_taga['href'])
案例1 小说爬取
url https://www.shicimingju.com/book/sanguoyanyi.html
数据解析 1.请求主页数据 2.拿出标题和对应链接 3.拿出内容
# Novel crawler: fetch the table of contents of "Romance of the Three
# Kingdoms", then request every chapter page and append its text to one file.
from bs4 import BeautifulSoup
import requests
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}
main_url="https://www.shicimingju.com/book/sanguoyanyi.html"
# Grab the chapter anchor tags from the table-of-contents page.
response=requests.get(url=main_url,headers=headers)
response.encoding = 'utf-8'
page_text=response.text
soup=BeautifulSoup(page_text,'lxml')
listmap=soup.select('.book-mulu >ul >li > a')
fp=open('三国演绎.txt','w',encoding='utf-8')
for a in listmap:
    title=a.string
    # hrefs are site-relative, so prefix the domain
    detail_url='https://www.shicimingju.com'+a['href']
    # Follow the link with a second request for the chapter page.
    response=requests.get(detail_url,headers=headers)
    response.encoding='utf-8'
    neirong=response.text
    # Parse the chapter body out of the detail page.
    d_soup=BeautifulSoup(neirong,'lxml')
    text=d_soup.find('div',class_='card bookmark-list')
    # NOTE(review): raises AttributeError if the div is missing — confirm the
    # selector against the live page before a long run.
    text=text.text
    fp.write(text+'\n')
    print(title,"爬取成功")
fp.close()
案例2 IP提取
from bs4 import BeautifulSoup
import requests
import math

# Free-proxy scraper: each kuaidaili listing page shows 12 IP/port rows.
# Page 1 lives at /free/, page N (N >= 2) at /free/dps/N/.
# (The unused `import string` from the original was removed.)
PER_PAGE = 12
i = input("请输入你要获取多少IP")
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}

def _print_rows(page_text):
    # Print "ip port" for every table row of one listing page.
    soup = BeautifulSoup(page_text, 'lxml')
    for tr in soup.select('tbody > tr'):
        tds = tr.findAll('td')  # look the cells up once per row
        print(tds[0].string, tds[1].string)

# Page 1.
main_url = "https://www.kuaidaili.com/free/"
response = requests.get(main_url, headers=headers)
response.encoding = 'utf-8'
_print_rows(response.text)

# BUG FIX: the original computed int((n-12)/12) pages and then skipped loop
# indices 0 and 1 (`if i>=2`), so for most inputs no extra page was ever
# fetched. ceil(n/12) is the true page count; page 1 is already done above.
total_pages = math.ceil(int(i) / PER_PAGE)
for page in range(2, total_pages + 1):
    url = 'https://www.kuaidaili.com/free/dps/' + str(page) + '/'
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    _print_rows(response.text)
4.xpath 数据解析
1.环境安装
2.编码流程
创建一个etree类型的对象 -把页面源码加载到该对象中
调用etree对象的xpath的方法结合xpath的表达式 进行标签定位和数据提取
3.html的树状结构 --也就是方便查找 --为后续xpath数据解析打下基础
4.标签定位
比如我们现在要拿出bs4
# XPath basics: build an etree from a local HTML file, then locate the
# <title> tag with four equivalent path expressions.
from lxml.html import etree
with open("test.html", encoding="gbk", errors="ignore") as file:
    tree = etree.parse(file)
# Tag location
title = tree.xpath('//title')  # //tag matches anywhere in the document
titlea = tree.xpath('/html/head/title')  # absolute path — most precise
titleb = tree.xpath('/html//title')  # // spans any number of levels
titlec = tree.xpath('//head/title')  # relative start from any <head>
print(titlea, titleb, titlec, title)
# Leading /  : the path must start at the document root (the html tag)
# Leading // : the path may start from any position in the tree
# Inner  /   : exactly one level down
# Inner  //  : any number of levels down
案例 图片爬取
from lxml.html import etree
import requests
import os

# 4K wallpaper crawler: walk the first two listing pages and save every
# thumbnail on each page into ./girls/.
dir_name = 'girls'
if not os.path.exists(dir_name):
    os.makedirs(dir_name)

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}
url = 'https://pic.netbian.com/4kmeinv/index_%d.html'

for page in range(1, 3):
    # Page 1 has no numeric suffix on this site.
    newurl = 'https://pic.netbian.com/4kmeinv/index.html' if page == 1 else url % page
    print('----------正在请求下载第%d页的图片数据----------' % page)
    response = requests.get(newurl, headers=headers)
    response.encoding = 'gbk'  # the site serves GBK-encoded pages
    tree = etree.HTML(response.text)
    for li in tree.xpath('//ul[@class="clearfix"]/li'):
        img_title = li.xpath('./a/b/text()')[0] + '.jpg'
        img_url = "http://pic.netbian.com" + li.xpath('./a/img/@src')[0]
        print(img_title, img_url)
        # Second request for the binary image data itself.
        img_data = requests.get(img_url, headers=headers).content
        with open(dir_name + '/' + img_title, 'wb') as fp:
            fp.write(img_data)
        print(img_title, '下载保存成功!')
城市质量网案例
from lxml.html import etree
import requests
import os

# Air-quality site: print the "hot" cities first, then every city, both
# extracted from the history-data landing page.
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}
url = 'https://www.aqistudy.cn/historydata/'
tree = etree.HTML(requests.get(url, headers=headers).text)

# Hot cities: one <li> per city inside the "hot" block.
for node in tree.xpath('//div[@class="hot"]//ul[@class="unstyled"]/li'):
    print(node.xpath('./a/text()')[0])

# All cities: the second <div> inside the "bottom" list.
for node in tree.xpath('//div[@class="bottom"]/ul/div[2]/li'):
    print(node.xpath('./a/text()')[0])
站长简历网站爬取
from lxml.html import etree
import requests
import os

# Free resume templates: scrape the listing page, follow each detail page,
# and download the first mirror's .rar archive.
# (Response objects are named `resp`/`detail_resp` — the original called one
# `re`, which shadows the regex module's conventional name.)
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}
list_url = 'https://sc.chinaz.com/jianli/free.html'
resp = requests.get(list_url, headers=headers)
resp.encoding = 'utf-8'
# Extract resume name + detail-page URL from the listing.
list_tree = etree.HTML(resp.text)
for node in list_tree.xpath("/html/body/div[4]/div[3]/div/div/div/p"):
    title = node.xpath('./a/text()')[0] + '.rar'
    detail_url = node.xpath('./a/@href')[0]
    detail_resp = requests.get(detail_url, headers=headers)
    detail_resp.encoding = 'utf-8'
    detail_tree = etree.HTML(detail_resp.text)
    # First entry in the download-mirror list.
    download_url = detail_tree.xpath('//*[@id="down"]/div[2]/ul/li[1]/a/@href')[0]
    data = requests.get(url=download_url, headers=headers).content
    with open(title, 'wb') as fp:
        fp.write(data)
    print(title, '保存下载成功!')
搜狐精简图片文章爬取、
div style="overflow: hidden; zoom: 1; float: left;"
用完整路径去匹配会查询不到,因为该 div 在返回的数据中并不存在,只是前端动态渲染之后才出现
总结 爬虫
1.了解爬虫
2.了解request
3.了解正则
4.了解xpath