改进了headers的爬虫(Cookies)

发布时间: 2023-09-18 20:12:23 作者: 热爱工作的宁致桑
import urllib.request
from lxml import etree
def create_request(page):
    """Build the HTTP request for one page of the news listing.

    Page 1 is served from the section root; any other page number is served
    from ``index_<page>.shtml`` below it.

    Args:
        page: Listing page number.

    Returns:
        A ``urllib.request.Request`` for the listing page, carrying a fixed
        set of browser-like headers (including a hard-coded Cookie string).
        Nothing is sent over the network here.
    """
    target = (
        'http://www.chinaeol.net/hjxw/gnxw'
        if page == 1
        else 'http://www.chinaeol.net/hjxw/gnxw/index_{}.shtml'.format(page)
    )

    # Accept-Encoding, Accept-Language and Cache-Control are not sent.
    browser_headers = {}
    browser_headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
    browser_headers['Cookie'] = 'Hm_lvt_0960aaf0c90823ef3de3f164788e264f=1695037542; Hm_lpvt_0960aaf0c90823ef3de3f164788e264f=1695037542; Hm_lvt_fb9e17abd59ff9a2c324890c5a701eca=1695037543; Hm_lvt_2ed05369c38555b813edc07a4dc8e126=1695037543; Hm_lpvt_fb9e17abd59ff9a2c324890c5a701eca=1695038268; Hm_lpvt_2ed05369c38555b813edc07a4dc8e126=1695038268'
    browser_headers['Host'] = 'www.chinaeol.net'
    browser_headers['Proxy-Connection'] = 'keep-alive'
    browser_headers['Upgrade-Insecure-Requests'] = '1'
    browser_headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'

    return urllib.request.Request(target, headers=browser_headers)

def get_content(request):
    """Fetch *request* and return the raw response body.

    Args:
        request: A URL string or ``urllib.request.Request``; it is passed
            unchanged to ``urllib.request.urlopen``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            cannot be completed.
    """
    # The context manager closes the response (and its connection) even if
    # read() raises, instead of leaving cleanup to garbage collection.
    with urllib.request.urlopen(request) as response:
        return response.read()

def get_hreflist(content):
    """Extract the article URLs linked from one listing page.

    Every ``href`` found on an ``<a>`` inside ``<ul class="cj_tianlibu">`` is
    collected, ``javascript:;`` placeholders are dropped, and the remaining
    values are resolved against the section base URL.

    Args:
        content: HTML source of a listing page, in any form accepted by
            ``etree.HTML``.

    Returns:
        A list of absolute article URLs, in document order.
    """
    # Imported locally so this function does not depend on a file-level
    # import of urllib.parse.
    from urllib.parse import urljoin

    base_url = 'http://www.chinaeol.net/hjxw/gnxw/'
    tree = etree.HTML(content)
    hrefs = tree.xpath('//ul[@class="cj_tianlibu"]//a/@href')

    # urljoin (rather than plain concatenation) keeps already-absolute and
    # root-relative hrefs valid and normalises "./" and "../" segments.
    return [urljoin(base_url, href) for href in hrefs if href != "javascript:;"]
    

def download_text(url_list):
    """Download every article in *url_list* and save its text to disk.

    For each URL the page is fetched, the title is taken from the first text
    node of ``<span class="toptitle">`` and the body from the text nodes of
    all ``<span>`` elements inside ``<div class="TRS_Editor">``. The body is
    written as UTF-8 to ``./生态环境部宣传教育中心/国内新闻/<title>.txt``, each
    text node preceded by a newline. ``/`` is removed from the title so it
    cannot introduce extra path components.

    Args:
        url_list: Iterable of article URLs to download.

    Raises:
        IndexError: If a page contains no ``toptitle`` span text.
        urllib.error.URLError: Propagated from ``urlopen`` on request failure.
        OSError: If the output directory or file cannot be written.

    Errors are not caught here: the first failure aborts the remaining
    downloads.
    """
    # Imported locally so this function does not depend on a file-level
    # import of os.
    import os

    save_dir = './生态环境部宣传教育中心/国内新闻/'

    # Request headers are identical for every article, so build them once.
    # Accept-Encoding, Accept-Language and Cache-Control are not sent.
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Cookie': 'Hm_lvt_0960aaf0c90823ef3de3f164788e264f=1695037542; Hm_lpvt_0960aaf0c90823ef3de3f164788e264f=1695037542; Hm_lvt_fb9e17abd59ff9a2c324890c5a701eca=1695037543; Hm_lvt_2ed05369c38555b813edc07a4dc8e126=1695037543; Hm_lpvt_fb9e17abd59ff9a2c324890c5a701eca=1695038268; Hm_lpvt_2ed05369c38555b813edc07a4dc8e126=1695038268',
        'Host': 'www.chinaeol.net',
        'Proxy-Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36'
    }

    # Without this, the first open() fails with FileNotFoundError on a fresh
    # checkout where the output directory does not exist yet.
    os.makedirs(save_dir, exist_ok=True)

    for url in url_list:
        request = urllib.request.Request(url=url,headers=headers)
        # Close the response as soon as the body has been read.
        with urllib.request.urlopen(request) as response:
            content = response.read()
        tree = etree.HTML(content)

        name = tree.xpath('//span[@class="toptitle"]/text()')[0]+'.txt'
        name = name.replace("/","")
        save_path = save_dir + name

        text = tree.xpath('//div[@class="TRS_Editor"]//span/text()')
        result = ''.join('\n' + t for t in text)

        # Explicit UTF-8: the platform default encoding may be unable to
        # represent the Chinese article text.
        with open(save_path,'w',encoding='utf-8') as fp:
            fp.write(result)

        

if __name__ == '__main__':
    # Inclusive range of listing pages to process.
    start_page = 1
    end_page = 1

    for page in range(start_page, end_page + 1):
        # Build the request for listing page `page` and fetch its HTML source.
        request = create_request(page)
        content = get_content(request)

        # Collect every news link found on that listing page.
        url_list = get_hreflist(content)

        # NOTE: the article download step is currently disabled, so only the
        # listing page itself is fetched.
        # download_text(url_list)

        print('{}页下载完成'.format(page))