1、导入相关依赖
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.3.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>
2、编写代码程序输出采集结果
package org.example;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
public class HttpClientData {
public static String getInfoData(int pageNo) throws IOException {
String url="https://www.lagou.com/wn/jobs";
HttpPost httpPost=new HttpPost(url);
httpPost.setHeader("origin","https://www.lagou.com/");
httpPost.setHeader("cookie","JSESSIONID=ABAAABAABAGABFAAEA69FC8E6CD8BB9279BD998C897601D; WEBTJ-ID=20230918164455-18aa776cf0f95c-068f79df6bc8cf-78505774-1327104-18aa776cf102dd; RECOMMEND_TIP=true; privacyPolicyPopup=false; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695026697; user_trace_token=20230918164457-cfb25219-7bd9-4f7d-9dc5-bc171648a693; LGSID=20230918164457-4880bcac-8e67-486b-8bb3-6aee05cc4147; LGUID=20230918164457-ed1f5207-a2ee-421f-922a-d4fc89bf7739; _ga=GA1.2.1911198389.1695026697; _gid=GA1.2.302334427.1695026697; sajssdk_2015_cross_new_user=1; sensorsdata2015session=%7B%7D; index_location_city=%E5%85%A8%E5%9B%BD; gate_login_token=v1####245d299e0d1ed714bf5e0c9709c8fab2553f894311014f16e954f5fdb12a0e64; LG_HAS_LOGIN=1; _putrc=81B878065C7103C6123F89F2B170EADC; login=true; hasDeliver=0; unick=%E5%88%98%E7%B4%AB%E9%94%A6; __SAFETY_CLOSE_TIME__26494154=1; TG-TRACK-CODE=index_navigation; __lg_stoken__=16e86fbfab228abce480d7c20e2a438c1a95f406ed5488e30fe83a2948fe63626ea8863aa1c6cec421d046c81340519584424cfd74494e3b9b45bd06928ed96691bf89e7027a; X_HTTP_TOKEN=5d8ce7eae99bedb73309205961ac07a8fe736a0c27; LGRID=20230918172353-055032eb-1cb1-48cc-a0d5-12a9700fcf93; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695029038; _ga_DDLTLJDLHH=GS1.2.1695026697.1.1.1695029038.60.0.0; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2226494154%22%2C%22first_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%22117.0.0.0%22%7D%2C%22%24device_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%7D");
httpPost.setHeader("referer","https://www.lagou.com/wn/jobs?kd=Java&city=%E5%8C%97%E4%BA%AC&pn=1");
httpPost.setHeader("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31");
List<BasicNameValuePair> params=new ArrayList<>();
params.add(new BasicNameValuePair("first", "true"));
params.add(new BasicNameValuePair("pn", String.valueOf(pageNo)));
params.add(new BasicNameValuePair("kd", "Java"));
httpPost.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8));
CloseableHttpClient httpClient= HttpClients.createDefault();
CloseableHttpResponse httpResponse= httpClient.execute(httpPost);
String result= EntityUtils.toString(httpResponse.getEntity(),StandardCharsets.UTF_8);
System.out.println("result: "+result);
return result;
}
}
3、编写程序保存到hdfs中
package org.example;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
public class HttpClientData {
public static String getInfoData(int pageNo) throws IOException {
String url="https://www.lagou.com/wn/jobs";
HttpPost httpPost=new HttpPost(url);
httpPost.setHeader("origin","https://www.lagou.com/");
httpPost.setHeader("cookie","JSESSIONID=ABAAABAABAGABFAAEA69FC8E6CD8BB9279BD998C897601D; WEBTJ-ID=20230918164455-18aa776cf0f95c-068f79df6bc8cf-78505774-1327104-18aa776cf102dd; RECOMMEND_TIP=true; privacyPolicyPopup=false; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695026697; user_trace_token=20230918164457-cfb25219-7bd9-4f7d-9dc5-bc171648a693; LGSID=20230918164457-4880bcac-8e67-486b-8bb3-6aee05cc4147; LGUID=20230918164457-ed1f5207-a2ee-421f-922a-d4fc89bf7739; _ga=GA1.2.1911198389.1695026697; _gid=GA1.2.302334427.1695026697; sajssdk_2015_cross_new_user=1; sensorsdata2015session=%7B%7D; index_location_city=%E5%85%A8%E5%9B%BD; gate_login_token=v1####245d299e0d1ed714bf5e0c9709c8fab2553f894311014f16e954f5fdb12a0e64; LG_HAS_LOGIN=1; _putrc=81B878065C7103C6123F89F2B170EADC; login=true; hasDeliver=0; unick=%E5%88%98%E7%B4%AB%E9%94%A6; __SAFETY_CLOSE_TIME__26494154=1; TG-TRACK-CODE=index_navigation; __lg_stoken__=16e86fbfab228abce480d7c20e2a438c1a95f406ed5488e30fe83a2948fe63626ea8863aa1c6cec421d046c81340519584424cfd74494e3b9b45bd06928ed96691bf89e7027a; X_HTTP_TOKEN=5d8ce7eae99bedb73309205961ac07a8fe736a0c27; LGRID=20230918172353-055032eb-1cb1-48cc-a0d5-12a9700fcf93; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695029038; _ga_DDLTLJDLHH=GS1.2.1695026697.1.1.1695029038.60.0.0; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2226494154%22%2C%22first_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%22117.0.0.0%22%7D%2C%22%24device_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%7D");
httpPost.setHeader("referer","https://www.lagou.com/wn/jobs?kd=Java&city=%E5%8C%97%E4%BA%AC&pn=1");
httpPost.setHeader("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31");
List<BasicNameValuePair> params=new ArrayList<>();
params.add(new BasicNameValuePair("first", "true"));
params.add(new BasicNameValuePair("pn", String.valueOf(pageNo)));
params.add(new BasicNameValuePair("kd", "Java"));
httpPost.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8));
CloseableHttpClient httpClient= HttpClients.createDefault();
CloseableHttpResponse httpResponse= httpClient.execute(httpPost);
String result= EntityUtils.toString(httpResponse.getEntity(),StandardCharsets.UTF_8);
System.out.println("result: "+result);
return result;
}
//将采集结果保存到HDFS中
public static void getInfoDataHdfs(String result) throws IOException, InterruptedException {
//1--创建hadoop的配置对象
Configuration configuration=new Configuration();
//2--通过配置对象获取HDFS文件系统对象
FileSystem fileSystem= FileSystem.get(URI.create("hdfs://node:9000"),configuration,"root");
//3--创建HDFS数据文件保存地址
Path filePath=new Path("/lagou/"+ LocalDate.now().format(DateTimeFormatter.ofPattern("yyyymmdd")));
//4--使用UUID生成文件名称
String fileName= UUID.randomUUID().toString().concat(".json");
//5--获取HDFS输出流
FSDataOutputStream fsDataOutputStream=fileSystem.create(new Path(filePath,fileName));
//6--使用HDFS IO工具将数据文件上传到HDFS的指定路径
IOUtils.copyBytes(new ByteArrayInputStream(result.getBytes(StandardCharsets.UTF_8)), fsDataOutputStream,configuration,true);
fileSystem.close();
System.out.println("保存成功~~");
}
}
Main方法:
package org.example;
import java.io.IOException;
public class Main {
public static void main(String[] args) throws IOException, InterruptedException {
for(int i=0;i<=30;i++){
String result=HttpClientData.getInfoData(i);
HttpClientData.getInfoDataHdfs(result);
Thread.sleep(1000);
}
}
}
标签:22%,3A%,采集,import,apache,org,HttpClient,new,页面
From: https://www.cnblogs.com/liuzijin/p/17712760.html