爬虫基本工具:urllib、requests、selenium、pytesseract

发布时间:2023-07-27 14:44:30 作者:看一百次夜空里的深蓝

urllib来实现cookie和ip代理

  1 from urllib.request import Request, build_opener, urlopen
  2 from fake_useragent import UserAgent
  3 from urllib.parse import urlencode
  4 from urllib.request import HTTPCookieProcessor
  5 from http.cookiejar import CookieJar, FileCookieJar, MozillaCookieJar, LWPCookieJar  # 用来将cookies保存到文件
  6 # FileCookieJar 继承 CookieJar
  7 # MozillaCookieJar\LWPCookieJar继承FileCookieJar
  8 from urllib.request import ProxyHandler
  9 import ssl
 10 
 11 # get、post请求
 12 def get_html(aurl):
 13     headers = {
 14         'User-Agent': UserAgent().chrome
 15     }
 16     # 如果Request传了data参数的话,就是post请求,没传data就是get.
 17     # 传data的时候,data先是字典类型,通过urlencode(data)编码之后为字符串,然后转字符串转成bytes,也就是str.encode
 18     request = Request(url=aurl, headers=headers)
 19     response = urlopen(request)
 20 
 21     # 忽略验证证书
 22     # ssl1 = ssl._create_unverified_context()
 23     # response = urlopen(requet, context=ssl1)
 24     return response.read()
 25 
 26 # UserAgent 浏览器头
 27 def user_agent_test():
 28     from urllib.request import Request, urlopen
 29     from random import choice
 30     from fake_useragent import UserAgent
 31     from urllib.parse import quote, urlencode
 32 
 33     url = 'http://www.baidu.com'
 34     headers = {
 35         # 模拟Chroom浏览器
 36         'User-Agent': UserAgent().chrome
 37     }
 38     request = Request(url, headers=headers)
 39     print(request.get_header('User-agent'))
 40     # 随机选择
 41     ua = UserAgent()
 42     print(choice([ua.chrome, ua.safari, ua.firefox]))
 43     print(ua.random)
 44 
 45     # 中文参数编码
 46     print(quote('中国'))
 47     adata = {
 48         'wd': '看一百次夜空里的深蓝',
 49         'ie': 'utf-8'
 50     }
 51     url = 'https://www.baidu.com/s?{}'.format(urlencode(adata))
 52     print(url)
 53 
 54 # 登录后保存cookie到文件
 55 def get_cookie():
 56     url = 'https://support.highgo.com/highgo_api/login'
 57     headers = {
 58         'User-Agent': UserAgent().chrome,
 59         # 'Cookie': ""   # 可以直接指定cookie,也可以用HTTPCookieProcessor来保存post后的cookie
 60     }
 61     form_data = {
 62         'userName': '773254968@qq.com',
 63         'password': '039ac48bbf1bdb15e52eb8eb635dc13d'
 64     }
 65     fdata = urlencode(form_data).encode()
 66     request = Request(url, headers=headers, data=fdata)
 67     # response = urlopen(request) 带cookie的话就必须使用build_opener
 68     mcj = MozillaCookieJar()
 69     handler = HTTPCookieProcessor(mcj)
 70     opener = build_opener(handler)
 71     response = opener.open(request)
 72     mcj.save('cookie.txt', ignore_expires=True, ignore_discard=True)
 73     print(response.read().decode())
 74 
 75 # 从文件中加载cookie访问
 76 def use_cookie():
 77     index_url = 'https://support.highgo.com/#/index'
 78     headers = {
 79         'User-Agent': UserAgent().chrome,
 80         # 'Cookie': ""   # 可以直接指定cookie,也可以用HTTPCookieProcessor来保存post后的cookie
 81     }
 82     mcj = MozillaCookieJar()
 83     mcj.load('cookie.txt', ignore_discard=True, ignore_expires=True)
 84     handler = HTTPCookieProcessor(mcj)
 85     opener = build_opener(handler)
 86     request = Request(index_url, headers=headers)
 87     response = opener.open(request)
 88     print(response.read().decode())
 89 
 90 # 代理
 91 def opener_test():
 92     # 代理
 93     url = 'http://www.baidu.com'
 94     headers = {
 95         'User-Agent': UserAgent().chrome
 96     }
 97     # 免费代理ip: https://www.89ip.cn/
 98     # handler = ProxyHandler({"http":"username:password@ip:port"})
 99     # handler = ProxyHandler({"http":"ip:port"})
100     handler = ProxyHandler({"http": "101.43.93.67:7890"})
101     opener = build_opener(handler)
102 
103     request = Request(url, headers=headers)
104     response = opener.open(request)
105     print(response.read().decode())
106 
107 if __name__ == '__main__':  # run the demo only when executed as a script
108     use_cookie()  # reads 'cookie.txt', so get_cookie() must have written it beforehand

requests来实现cookie和ip代理

 1 import requests
 2 from fake_useragent import UserAgent
 3 
 4 def requests_get():
 5     url = 'https://support.highgo.com/#/index'
 6     response = requests.get(url)
 7     print(response.text)
 8 
 9 def requests_post():
10     url = 'https://support.highgo.com/highgo_api/login'
11     hearders = {
12         'User-agent': UserAgent().chrome
13     }
14     form_data = {
15         'userName': '773254968@qq.com',
16         'password': '039ac48bbf1bdb15e52eb8eb635dc13d'
17     }
18     response = requests.post(url, headers=hearders, data=form_data)
19     print(response.text)
20 
def requets_proxy(proxy="8.219.125.46:80", timeout=10):
    """Fetch the index page through a proxy and print the response body.

    The function name keeps its original spelling so existing callers keep
    working.

    Args:
        proxy: Proxy address, either ``"ip:port"`` or a full URL such as
            ``"http://ip:port"``. The default is the public proxy from the
            original sample and may no longer be reachable.
        timeout: Seconds to wait for the server; requests never times out
            on its own.

    Raises:
        requests.exceptions.RequestException: On proxy/connection failure
            or timeout.
    """
    url = 'https://support.highgo.com/#/index'
    # requests documents proxies as full URLs; add the scheme when missing.
    proxy_url = proxy if '://' in proxy else 'http://' + proxy
    # requests picks the proxy by the *target* URL's scheme. The original
    # mapped only "http", so this https:// request silently bypassed the
    # proxy; map both schemes so it is actually used.
    proxies = {
        "http": proxy_url,
        "https": proxy_url,
    }
    headers = {
        'User-agent': UserAgent().chrome
    }
    response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    print(response.text)
31 
def requests_ssl(timeout=10):
    """Fetch an HTTPS page with certificate verification disabled.

    Args:
        timeout: Seconds to wait for the server; requests never times out
            on its own.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    url = 'https://www.12306.cn/mormhweb/'
    headers = {
        'User-agent': UserAgent().chrome
    }
    # Silences urllib3 warnings (including the one raised for unverified
    # HTTPS requests) for the whole process, not just this call.
    requests.packages.urllib3.disable_warnings()
    # SECURITY: verify=False accepts any certificate and exposes the
    # connection to man-in-the-middle attacks; keep it to demos/testing.
    response = requests.get(url, verify=False, headers=headers, timeout=timeout)
    # Force UTF-8 instead of the encoding requests guessed from the headers.
    response.encoding = 'utf-8'
    print(response.text)
41 
def requests_cookies(user_name=None, password=None, timeout=10):
    """Log in and then fetch the index page within one cookie-sharing session.

    Credentials are no longer embedded in the source. They are taken from the
    arguments or, when those are omitted, from the ``HIGHGO_USERNAME`` and
    ``HIGHGO_PASSWORD`` environment variables.

    Args:
        user_name: Value sent as the ``userName`` form field.
        password: Value sent as the ``password`` form field.
            NOTE(review): the original sample sent a 32-character hex string,
            presumably a client-side hash; confirm what the endpoint expects.
        timeout: Seconds to wait for each request; requests never times out
            on its own.

    Raises:
        ValueError: If no credentials are supplied by either route.
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    import os

    user_name = user_name or os.environ.get('HIGHGO_USERNAME')
    password = password or os.environ.get('HIGHGO_PASSWORD')
    if not user_name or not password:
        raise ValueError(
            'credentials missing: pass user_name/password or set '
            'HIGHGO_USERNAME and HIGHGO_PASSWORD'
        )

    url = 'https://support.highgo.com/highgo_api/login'
    form_data = {
        'userName': user_name,
        'password': password,
    }
    # The ``with`` block closes the session's pooled connections on exit.
    with requests.Session() as session:
        # Set the User-Agent on the session so the follow-up GET sends it
        # too; the original attached it to the login POST only.
        session.headers.update({'User-agent': UserAgent().chrome})
        response = session.post(url, data=form_data, timeout=timeout)
        print(response.text)
        # Cookies set by the login response are replayed automatically.
        response = session.get('https://support.highgo.com/#/index', timeout=timeout)
        print(response.text)
56 
57 
58 if __name__ == '__main__':  # run the demo only when executed as a script
59     # requests_post()
60     # requests_get()
61     # requets_proxy()
62     # requests_ssl()
63     requests_cookies()  # the other demos above are left commented out; enable one at a time

Selenium模拟浏览器

 1 # 安装
 2 # pip3 install selenium
 3 # 功能:完全模拟浏览器访问网站
 4 # Chrome需要配合:chromedriver
 5 # https://blog.csdn.net/weixin_45109684/article/details/117650036
 6 
 7 # PhantomJS
 8 
 9 # Chromedriver 安装
10 # 1.安装Chrome,然后在[帮助]中查看版本
11 # 2.根据版本下载chromedriver   (https://registry.npmmirror.com/binary.html?path=chromedriver/)
12 # 3.下载后的chromedriver解压后,copy到/usr/bin/目录下边
13 
14 from selenium import webdriver
15 from selenium.webdriver.common.by import By
16 
# Drive a real Chrome instance: search Baidu, scroll the page, dump the HTML.
browser = webdriver.Chrome()
try:
    browser.get('https://www.baidu.com')
    # Elements can also be located with an XPath expression, e.g.
    #   browser.find_element(By.XPATH, r"//input[@id='kw']")
    # find_elements() returns every match; find_element() raises when
    # nothing matches.
    browser.find_element(By.ID, 'kw').send_keys('看一百次夜空里的深蓝')
    browser.find_element(By.ID, 'su').click()
    # Scroll by running JavaScript in the page. Alternatives:
    #   document.getElementById('id').scrollTop = 0   (a single element)
    #   document.body.scrollTop = 0                   (back to the top)
    js = r"var q = document.documentElement.scrollTop = 1000"
    browser.execute_script(js)
    # NOTE(review): nothing waits for the results to load after click(), so
    # this may print the page before the search results arrive; consider an
    # explicit wait.
    print(browser.page_source)
finally:
    # quit() ends the WebDriver session and the driver process; close() only
    # closes the current window. The finally block guarantees cleanup even
    # when a lookup above raises.
    browser.quit()

pytesseract验证码识别

1 from PIL import Image
2 import pytesseract
3 # Tesseract Ubuntu 安装 : https://tesseract-ocr.github.io/tessdoc/Installation.html
4 # sudo apt install tesseract-ocr
5 # sudo apt install libtesseract-dev
6 # github 地址: https://github.com/tesseract-ocr/tesseract
# Open the captcha image in a ``with`` block so the underlying file handle is
# closed as soon as recognition is done, then print the image object and the
# text Tesseract recognises in it.
with Image.open('./lll.png') as imageObject:
    print(imageObject)
    print(pytesseract.image_to_string(imageObject))