HttpClient采集页面数据

发布时间 2023-09-18 19:31:38作者: yesyes1

1、导入相关依赖

<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.3.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.13</version>
</dependency>

2、编写代码程序输出采集结果

package org.example;

import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public class HttpClientData {

    public static String getInfoData(int pageNo) throws IOException {
        String url="https://www.lagou.com/wn/jobs";

        HttpPost httpPost=new HttpPost(url);

        httpPost.setHeader("origin","https://www.lagou.com/");
        httpPost.setHeader("cookie","JSESSIONID=ABAAABAABAGABFAAEA69FC8E6CD8BB9279BD998C897601D; WEBTJ-ID=20230918164455-18aa776cf0f95c-068f79df6bc8cf-78505774-1327104-18aa776cf102dd; RECOMMEND_TIP=true; privacyPolicyPopup=false; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695026697; user_trace_token=20230918164457-cfb25219-7bd9-4f7d-9dc5-bc171648a693; LGSID=20230918164457-4880bcac-8e67-486b-8bb3-6aee05cc4147; LGUID=20230918164457-ed1f5207-a2ee-421f-922a-d4fc89bf7739; _ga=GA1.2.1911198389.1695026697; _gid=GA1.2.302334427.1695026697; sajssdk_2015_cross_new_user=1; sensorsdata2015session=%7B%7D; index_location_city=%E5%85%A8%E5%9B%BD; gate_login_token=v1####245d299e0d1ed714bf5e0c9709c8fab2553f894311014f16e954f5fdb12a0e64; LG_HAS_LOGIN=1; _putrc=81B878065C7103C6123F89F2B170EADC; login=true; hasDeliver=0; unick=%E5%88%98%E7%B4%AB%E9%94%A6; __SAFETY_CLOSE_TIME__26494154=1; TG-TRACK-CODE=index_navigation; __lg_stoken__=16e86fbfab228abce480d7c20e2a438c1a95f406ed5488e30fe83a2948fe63626ea8863aa1c6cec421d046c81340519584424cfd74494e3b9b45bd06928ed96691bf89e7027a; X_HTTP_TOKEN=5d8ce7eae99bedb73309205961ac07a8fe736a0c27; LGRID=20230918172353-055032eb-1cb1-48cc-a0d5-12a9700fcf93; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695029038; _ga_DDLTLJDLHH=GS1.2.1695026697.1.1.1695029038.60.0.0; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2226494154%22%2C%22first_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%22117.0.0.0%22%7D%2C%22%24device_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%7D");
        httpPost.setHeader("referer","https://www.lagou.com/wn/jobs?kd=Java&city=%E5%8C%97%E4%BA%AC&pn=1");
        httpPost.setHeader("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31");

        List<BasicNameValuePair> params=new ArrayList<>();

        params.add(new BasicNameValuePair("first", "true"));
        params.add(new BasicNameValuePair("pn", String.valueOf(pageNo)));
        params.add(new BasicNameValuePair("kd", "Java"));

        httpPost.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8));

        CloseableHttpClient httpClient= HttpClients.createDefault();
        CloseableHttpResponse httpResponse= httpClient.execute(httpPost);

        String result= EntityUtils.toString(httpResponse.getEntity(),StandardCharsets.UTF_8);

        System.out.println("result:  "+result);

        return result;


    }
}

3、编写程序保存到hdfs中

package org.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

public class HttpClientData {

    public static String getInfoData(int pageNo) throws IOException {
        String url="https://www.lagou.com/wn/jobs";

        HttpPost httpPost=new HttpPost(url);

        httpPost.setHeader("origin","https://www.lagou.com/");
        httpPost.setHeader("cookie","JSESSIONID=ABAAABAABAGABFAAEA69FC8E6CD8BB9279BD998C897601D; WEBTJ-ID=20230918164455-18aa776cf0f95c-068f79df6bc8cf-78505774-1327104-18aa776cf102dd; RECOMMEND_TIP=true; privacyPolicyPopup=false; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695026697; user_trace_token=20230918164457-cfb25219-7bd9-4f7d-9dc5-bc171648a693; LGSID=20230918164457-4880bcac-8e67-486b-8bb3-6aee05cc4147; LGUID=20230918164457-ed1f5207-a2ee-421f-922a-d4fc89bf7739; _ga=GA1.2.1911198389.1695026697; _gid=GA1.2.302334427.1695026697; sajssdk_2015_cross_new_user=1; sensorsdata2015session=%7B%7D; index_location_city=%E5%85%A8%E5%9B%BD; gate_login_token=v1####245d299e0d1ed714bf5e0c9709c8fab2553f894311014f16e954f5fdb12a0e64; LG_HAS_LOGIN=1; _putrc=81B878065C7103C6123F89F2B170EADC; login=true; hasDeliver=0; unick=%E5%88%98%E7%B4%AB%E9%94%A6; __SAFETY_CLOSE_TIME__26494154=1; TG-TRACK-CODE=index_navigation; __lg_stoken__=16e86fbfab228abce480d7c20e2a438c1a95f406ed5488e30fe83a2948fe63626ea8863aa1c6cec421d046c81340519584424cfd74494e3b9b45bd06928ed96691bf89e7027a; X_HTTP_TOKEN=5d8ce7eae99bedb73309205961ac07a8fe736a0c27; LGRID=20230918172353-055032eb-1cb1-48cc-a0d5-12a9700fcf93; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695029038; _ga_DDLTLJDLHH=GS1.2.1695026697.1.1.1695029038.60.0.0; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2226494154%22%2C%22first_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%22117.0.0.0%22%7D%2C%22%24device_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%7D");
        httpPost.setHeader("referer","https://www.lagou.com/wn/jobs?kd=Java&city=%E5%8C%97%E4%BA%AC&pn=1");
        httpPost.setHeader("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31");

        List<BasicNameValuePair> params=new ArrayList<>();

        params.add(new BasicNameValuePair("first", "true"));
        params.add(new BasicNameValuePair("pn", String.valueOf(pageNo)));
        params.add(new BasicNameValuePair("kd", "Java"));

        httpPost.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8));

        CloseableHttpClient httpClient= HttpClients.createDefault();
        CloseableHttpResponse httpResponse= httpClient.execute(httpPost);

        String result= EntityUtils.toString(httpResponse.getEntity(),StandardCharsets.UTF_8);

        System.out.println("result:  "+result);

        return result;


    }


    //将采集结果保存到HDFS中
    public static void getInfoDataHdfs(String result) throws IOException, InterruptedException {

        //1--创建hadoop的配置对象
        Configuration configuration=new Configuration();

        //2--通过配置对象获取HDFS文件系统对象
        FileSystem fileSystem= FileSystem.get(URI.create("hdfs://node:9000"),configuration,"root");

        //3--创建HDFS数据文件保存地址
        Path filePath=new Path("/lagou/"+ LocalDate.now().format(DateTimeFormatter.ofPattern("yyyymmdd")));

        //4--使用UUID生成文件名称
        String fileName= UUID.randomUUID().toString().concat(".json");

        //5--获取HDFS输出流
        FSDataOutputStream fsDataOutputStream=fileSystem.create(new Path(filePath,fileName));

        //6--使用HDFS IO工具将数据文件上传到HDFS的指定路径
        IOUtils.copyBytes(new ByteArrayInputStream(result.getBytes(StandardCharsets.UTF_8)), fsDataOutputStream,configuration,true);

        fileSystem.close();

        System.out.println("保存成功~~");
    }

}

Main方法:

package org.example;

import java.io.IOException;

public class Main {
    public static void main(String[] args) throws IOException, InterruptedException {
        for(int i=0;i<=30;i++){
            String result=HttpClientData.getInfoData(i);

            HttpClientData.getInfoDataHdfs(result);

            Thread.sleep(1000);
        }
    }
}