Web Scraping 01

Published 2024-01-03 23:31:51 · Author: 大橘

I covered quite a lot of microservices, but never uploaded those notes. So be it.

Web Scraping Basics 01

1. Crawler categories

General-purpose crawlers

Focused crawlers

Functional crawlers

Incremental crawlers

Distributed crawlers

2. requests basics

1. Environment setup: pip install requests

Case 1: fetch the Sogou homepage and persist it to disk
import requests


url = 'https://www.sogou.com/'

r = requests.get(url)
r.encoding = 'utf-8'  # set the response encoding to UTF-8

page_text = r.text

with open('sogou.html', 'w', encoding='utf8') as fp:
    fp.write(page_text)

print("获取数据成功并写入文件")
Case 2: Sogou keyword search

https://www.sogou.com/web?query=jay

# 1. Pass the query parameter dynamically via a dict, and spoof the User-Agent as basic anti-anti-crawling handling
import requests


url = 'https://www.sogou.com/web'
keyword = input('Enter the search keyword: ')
pram = {
    'query': keyword  # more parameters can be added here dynamically if needed
}



head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'
}
r = requests.get(url, params=pram, headers=head)
r.encoding = 'utf-8'  # set the response encoding to UTF-8
page_text = r.text
fileName = keyword + '.html'
with open(fileName, 'w', encoding='utf8') as fp:
    fp.write(page_text)

print("Data fetched and written to file")
Case 3: Douban romance movie ranking
# The trick is finding the right interface: the ranking data is loaded dynamically (AJAX) and only then rendered by the front end
https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85&type=13&interval_id=100:90&action=

The real (AJAX) address:
https://movie.douban.com/j/chart/top_list?type=13&interval_id=100%3A90&action=&start=0&limit=20
import requests

url = 'https://movie.douban.com/j/chart/top_list'
pram = {
    'type': '13',
    'interval_id': '100:90',
    'action': '',
    'start': '0',   # index of the first entry to return
    'limit': '20',  # number of entries per request
}
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'
}
r = requests.get(url, params=pram, headers=head)
r.encoding = 'utf-8'  # set the response encoding to UTF-8
page_textjson = r.json()  # this endpoint returns JSON, not HTML
fileName = "豆瓣爱情故事" + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    for dic in page_textjson:
        title = dic['title']
        score = dic['score']
        fp.write(title + ':' + str(score) + '\n')

print("Data fetched and written to file")

Case 4: KFC store location query

# Essentially just fetching paginated query results
import requests

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
keyword=input("请输入查询的模糊地址:")
pram = {
    "cname": "",
    "pid": "",
    "keyword": keyword,
    "pageIndex": "1",
    "pageSize": "10",
}

head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'
}
r = requests.get(url, params=pram, headers=head)
r.encoding = 'utf-8'  # set the response encoding to UTF-8
page_textjson = r.json()
print(page_textjson)
fileName = "肯德基爷爷" + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    for dic in page_textjson['Table1']:
        title = dic['storeName']
        score = dic['addressDetail']
        zhichi = dic["pro"]
        fp.write("store name: " + title + ', address: ' + score + ', amenities: ' + zhichi + '\n')

print("Data fetched and written to file")

Case 5: ** Administration

For certain reasons I did not dare to run or test this one.
Batch retrieval works like this: first write a crawler that fetches a single record by its ID, then paginate the home-page listing to collect all the ID values; a sketch of the idea follows.
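A minimal sketch of that two-step pattern (purely illustrative: the endpoints, parameters, and JSON field names below are hypothetical placeholders, not the real interface):

import requests

head = {'User-Agent': 'Mozilla/5.0'}
list_url = 'https://example.com/api/list'      # hypothetical paginated listing endpoint
detail_url = 'https://example.com/api/detail'  # hypothetical detail-by-ID endpoint

all_ids = []
for page in range(1, 4):  # step 1: walk a few listing pages and collect IDs
    data = {'pageNum': page, 'pageSize': 15}
    resp = requests.post(list_url, data=data, headers=head).json()
    for item in resp['list']:  # 'list' and 'ID' are assumed field names
        all_ids.append(item['ID'])

for _id in all_ids:  # step 2: fetch each record's detail by ID
    detail = requests.post(detail_url, data={'id': _id}, headers=head).json()
    print(detail)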

3. Focused crawling

1. Downloading image resources

1. Write the binary content yourself

r = requests.get(url, headers=head)
img_data = r.content  # .content is the raw binary body of the response

with open('./123.jpg', 'wb') as fp:
    fp.write(img_data)

2. Download directly with urllib's urlretrieve

from urllib import request
request.urlretrieve(url, './456.jpg')
2. Using regular expressions

URL: http://img.itlun.cn/news/60e799893.html

Save every image on that page

(The page is a collection of funny joke posts)

import requests
from urllib import request
import re

url = 'http://img.itlun.cn/news/60e799893.html'
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'
}
r = requests.get(url, headers=head)
r.encoding = 'gbk'  # this site is GBK-encoded
page_text = r.text

# regex that captures the src attribute of each <img> inside a <p>
ex = r'<p[^>]*>\s*<img[^>]*\s+src="([^"]+)"'

img_src_list = re.findall(ex, page_text, re.S)

for img_src in img_src_list:
    img_name = img_src.split('/')[-1]  # use the last path segment as the file name
    request.urlretrieve(img_src, img_name)
3. bs4 data extraction

bs4 setup: pip install bs4 lxml

from bs4 import BeautifulSoup

fp = open('test.html', 'r', encoding='utf-8')

# parse a local HTML file
soup = BeautifulSoup(fp, 'lxml')

# start parsing
# tag-name access: soup.tagName only locates the FIRST matching tag
tag1 = soup.title

tag2 = soup.div

# attribute-based lookup: find('tagName', attrName='attrValue') returns only the first match,
# find_all returns every match
tag3 = soup.find('div', class_='song')
tag4 = soup.find_all('a', id='feng')



# CSS-selector lookup
# by id / by class
tag6 = soup.select('#feng')
tag7 = soup.select('.song')

# hierarchical selectors
tag8 = soup.select('.tang > ul')  # > means direct child
tag9 = soup.select('.tang li')    # a space means any descendant level

# extracting text from a tag
# tag.string: only the text that is a direct child of the tag
# tag.text: all text inside the tag, including nested tags

p_tag = soup.p
print(p_tag.string)
print(p_tag.text)
div_tag = soup.find('div', class_='song')
print(div_tag.text)  # the div contains nested tags, so .text gathers all of their text

# extracting attribute values
img_tag = soup.img
print(img_tag['src'])
img_taga = soup.a
print(img_taga['href'])
Case 1: scraping a novel

URL: https://www.shicimingju.com/book/sanguoyanyi.html

Parsing plan: 1. request the table-of-contents page  2. extract each chapter title and its detail link  3. fetch and extract the chapter content

from bs4 import BeautifulSoup
import requests


headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}
main_url="https://www.shicimingju.com/book/sanguoyanyi.html"

# grab the chapter <a> tags from the table of contents
response=requests.get(url=main_url,headers=headers)
response.encoding = 'utf-8'
page_text=response.text
soup=BeautifulSoup(page_text,'lxml')
listmap=soup.select('.book-mulu >ul >li > a')
fp=open('三国演绎.txt','w',encoding='utf-8')
for a in listmap:
    title=a.string
    detail_url='https://www.shicimingju.com'+a['href']
    # request the chapter detail page
    response=requests.get(detail_url,headers=headers)
    response.encoding='utf-8'
    neirong=response.text

    # parse the chapter content
    d_soup=BeautifulSoup(neirong,'lxml')
    text=d_soup.find('div',class_='card bookmark-list')
    text=text.text
    fp.write(text+'\n')
    print(title,"爬取成功")


fp.close()
Case 2: extracting proxy IPs

from bs4 import BeautifulSoup
import requests

# each page lists 12 entries
i = input("How many IPs do you want to fetch? ")
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}
main_url="https://www.kuaidaili.com/free/"
# page 2: https://www.kuaidaili.com/free/dps/2/
response=requests.get(main_url,headers=headers)
response.encoding='utf-8'
page_text=response.text

soup=BeautifulSoup(page_text,'lxml')

souppage = soup.select('tbody > tr')

for tr in souppage:
    td = tr.find_all('td')[0]   # first column: IP
    tda = tr.find_all('td')[1]  # second column: port
    ip = td.string
    port = tda.string
    print(ip, port)

# page 1 already gave 12 IPs; fetch the remaining pages (e.g. https://www.kuaidaili.com/free/dps/2/)
total_pages = -(-int(i) // 12)  # ceiling division: how many pages the requested count spans
for page in range(2, total_pages + 1):
    url = 'https://www.kuaidaili.com/free/dps/' + str(page) + '/'
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    soup = BeautifulSoup(page_text, 'lxml')
    souppage = soup.select('tbody > tr')
    for tr in souppage:
        td = tr.find_all('td')[0]
        tda = tr.find_all('td')[1]
        ip = td.string
        port = tda.string
        print(ip, port)
4. xpath data parsing

1. Environment setup: pip install lxml

2. Workflow

Create an etree object and load the page source into it

Call the etree object's xpath method with an xpath expression to locate tags and extract data, as in the sketch below
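A minimal sketch of that two-step workflow (assuming page_text holds HTML fetched with requests, and test.html is a local copy of a page):

from lxml import etree

# variant 1: build the tree from a fetched page (an HTML string)
tree = etree.HTML(page_text)
titles = tree.xpath('//title/text()')

# variant 2: build the tree from a local file, using an HTML parser to tolerate sloppy markup
tree = etree.parse('test.html', etree.HTMLParser())
titles = tree.xpath('//title/text()')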

3. HTML has a tree structure, which makes lookups convenient and lays the groundwork for xpath parsing

4. Tag location


For example, using the same test.html page as in the bs4 section, locate the title tag:

from lxml.html import etree

with open("test.html", encoding="gbk", errors="ignore") as file:
    tree = etree.HTML(file.read())  # build the etree object from the page source

# tag location
title = tree.xpath('//title')  # //tag matches the tag anywhere in the document

titlea = tree.xpath('/html/head/title')  # full path from the root, the most precise form

titleb = tree.xpath('/html//title')  # // in the middle of a path skips any number of levels

titlec = tree.xpath('//head/title')

print(titlea, titleb, titlec, title)
# leading /   : the path must start from the root tag (html)
# leading //  : the tag can be located from any position in the tree
# non-leading / : exactly one level down
# non-leading //: any number of levels down

Case: image scraping
from lxml.html import etree
import requests
import os

dir_name = 'girls'
if not os.path.exists(dir_name):
    os.makedirs(dir_name)

# images are listed page by page
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}

url = 'https://pic.netbian.com/4kmeinv/index_%d.html'

for page in range(1, 3):
    # page 1 has no index suffix; later pages follow the index_%d.html pattern
    if page == 1:
        newurl = 'https://pic.netbian.com/4kmeinv/index.html'
    else:
        newurl = url % page
    print('---------- requesting page %d ----------' % page)
    response = requests.get(newurl, headers=headers)
    response.encoding = 'gbk'  # this site is GBK-encoded
    page_text = response.text

    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@class="clearfix"]/li')

    for li in li_list:
        img_title = li.xpath('./a/b/text()')[0] + '.jpg'
        img_url = "http://pic.netbian.com" + li.xpath('./a/img/@src')[0]
        print(img_title, img_url)
        # request each image and save the binary data
        img_data = requests.get(img_url, headers=headers).content
        img_path = dir_name + '/' + img_title
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_title, 'saved!')

Case: air-quality history site (city names)
from lxml.html import etree
import requests
import os
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}

url='https://www.aqistudy.cn/historydata/'
page_text=requests.get(url,headers=headers).text

tree=etree.HTML(page_text)
hot_list=tree.xpath('//div[@class="hot"]//ul[@class="unstyled"]/li')
for li in hot_list:
    cityname=li.xpath('./a/text()')[0]
    print(cityname)

hot_lista=tree.xpath('//div[@class="bottom"]/ul/div[2]/li')

for li in hot_lista:
    city_name =li.xpath('./a/text()')[0]
    print(city_name)
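The two queries above can also be merged into a single lookup with xpath's | union operator; a small sketch reusing the same tree:

# hot cities and the full city list in one xpath expression
all_cities = tree.xpath(
    '//div[@class="hot"]//ul[@class="unstyled"]/li/a/text()'
    ' | //div[@class="bottom"]/ul/div[2]/li/a/text()'
)
for name in all_cities:
    print(name)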
Case: scraping free résumé templates from the chinaz template site
from lxml.html import etree
import requests
import os
headers = {
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
}

url='https://sc.chinaz.com/jianli/free.html'
res = requests.get(url, headers=headers)
res.encoding = 'utf-8'
page_text = res.text
# parse out each résumé's name and its detail-page url
tree = etree.HTML(page_text)
treepage = tree.xpath("/html/body/div[4]/div[3]/div/div/div/p")
for mubiao in treepage:
    title = mubiao.xpath('./a/text()')[0] + '.rar'
    url = mubiao.xpath('./a/@href')[0]
    # print(title, url)
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    page_text = res.text
    tree = etree.HTML(page_text)
    url = tree.xpath('//*[@id="down"]/div[2]/ul/li[1]/a/@href')[0]
    data = requests.get(url=url, headers=headers).content
    with open(title, 'wb') as fp:
        fp.write(data)
    print(title, 'downloaded and saved!')
Case: Sohu trimmed image-article scraping

div style="overflow: hidden; zoom: 1; float: left;"

No full-path expression (xpath or regex) will find this div: it does exist in the rendered front end, but it is missing from the HTML the server actually returns, because it is injected dynamically. See the quick check below.
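A quick way to confirm this is to search the raw response for the element's style string; a minimal sketch (the article URL is a hypothetical placeholder):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}
url = 'https://www.sohu.com/a/xxxxxx'  # hypothetical article URL

page_text = requests.get(url, headers=headers).text
# False here means the div is injected by front-end JS and never appears in the server response
print('overflow: hidden; zoom: 1; float: left;' in page_text)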

Crawler wrap-up

1. Know the basics of crawlers

2. Know requests

3. Know regular expressions

4. Know xpath

5. Know bs4
https://kxd.lanzoul.com/iUyOs1jx9y6d