首页 > 其他分享 >HttpClient采集页面数据

HttpClient采集页面数据

时间:2023-09-18 19:47:44浏览次数:47  
标签:22% 3A% 采集 import apache org HttpClient new 页面

1、导入相关依赖

<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<!-- Hadoop client dependency: needed by the HDFS-saving code in step 3, which imports
     org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.* and org.apache.hadoop.io.IOUtils. -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>3.3.0</version>
</dependency>

<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<!-- Apache HttpClient 4.x dependency: needed by the collector code, which imports
     org.apache.http.* classes (HttpPost, CloseableHttpClient, EntityUtils, ...). -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.13</version>
</dependency>

2、编写程序,采集页面数据并输出采集结果

package org.example;

import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Collects job-listing page data from lagou.com with Apache HttpClient.
 */
public class HttpClientData {

    /**
     * Sends a form-encoded POST for one result page and returns the response body.
     *
     * <p>The request carries the form fields {@code first=true}, {@code pn=<pageNo>} and
     * {@code kd=Java}, plus fixed origin/cookie/referer/user-agent headers. The response body is
     * decoded as UTF-8, printed to standard output and returned.
     *
     * @param pageNo page number sent as the {@code pn} form field
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails or the response cannot be read
     */
    public static String getInfoData(int pageNo) throws IOException {
        String url = "https://www.lagou.com/wn/jobs";

        HttpPost httpPost = new HttpPost(url);

        httpPost.setHeader("origin", "https://www.lagou.com/");
        // NOTE(review): this cookie is a captured browser session (it contains login tokens) and will
        // expire; it should be supplied through configuration rather than committed to source.
        httpPost.setHeader("cookie", "JSESSIONID=ABAAABAABAGABFAAEA69FC8E6CD8BB9279BD998C897601D; WEBTJ-ID=20230918164455-18aa776cf0f95c-068f79df6bc8cf-78505774-1327104-18aa776cf102dd; RECOMMEND_TIP=true; privacyPolicyPopup=false; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695026697; user_trace_token=20230918164457-cfb25219-7bd9-4f7d-9dc5-bc171648a693; LGSID=20230918164457-4880bcac-8e67-486b-8bb3-6aee05cc4147; LGUID=20230918164457-ed1f5207-a2ee-421f-922a-d4fc89bf7739; _ga=GA1.2.1911198389.1695026697; _gid=GA1.2.302334427.1695026697; sajssdk_2015_cross_new_user=1; sensorsdata2015session=%7B%7D; index_location_city=%E5%85%A8%E5%9B%BD; gate_login_token=v1####245d299e0d1ed714bf5e0c9709c8fab2553f894311014f16e954f5fdb12a0e64; LG_HAS_LOGIN=1; _putrc=81B878065C7103C6123F89F2B170EADC; login=true; hasDeliver=0; unick=%E5%88%98%E7%B4%AB%E9%94%A6; __SAFETY_CLOSE_TIME__26494154=1; TG-TRACK-CODE=index_navigation; __lg_stoken__=16e86fbfab228abce480d7c20e2a438c1a95f406ed5488e30fe83a2948fe63626ea8863aa1c6cec421d046c81340519584424cfd74494e3b9b45bd06928ed96691bf89e7027a; X_HTTP_TOKEN=5d8ce7eae99bedb73309205961ac07a8fe736a0c27; LGRID=20230918172353-055032eb-1cb1-48cc-a0d5-12a9700fcf93; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695029038; _ga_DDLTLJDLHH=GS1.2.1695026697.1.1.1695029038.60.0.0; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2226494154%22%2C%22first_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%22117.0.0.0%22%7D%2C%22%24device_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%7D");
        httpPost.setHeader("referer", "https://www.lagou.com/wn/jobs?kd=Java&city=%E5%8C%97%E4%BA%AC&pn=1");
        httpPost.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31");

        List<BasicNameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("first", "true"));
        params.add(new BasicNameValuePair("pn", String.valueOf(pageNo)));
        params.add(new BasicNameValuePair("kd", "Java"));

        httpPost.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8));

        // try-with-resources: the client and the response both hold network resources and were
        // previously never closed, leaking a connection on every call.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse httpResponse = httpClient.execute(httpPost)) {

            String result = EntityUtils.toString(httpResponse.getEntity(), StandardCharsets.UTF_8);

            System.out.println("result:  " + result);

            return result;
        }
    }
}

3、编写程序,将采集结果保存到HDFS中

package org.example;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

/**
 * Collects job-listing page data from lagou.com with Apache HttpClient and stores the raw
 * responses as files in HDFS.
 */
public class HttpClientData {

    /**
     * Formatter for the per-day HDFS directory name, e.g. {@code 20230918}.
     * Upper-case {@code MM} is month-of-year; lower-case {@code mm} is minute-of-hour, which a
     * {@link LocalDate} cannot supply.
     */
    private static final DateTimeFormatter DAY_DIRECTORY_FORMAT = DateTimeFormatter.ofPattern("yyyyMMdd");

    /**
     * Sends a form-encoded POST for one result page and returns the response body.
     *
     * <p>The request carries the form fields {@code first=true}, {@code pn=<pageNo>} and
     * {@code kd=Java}, plus fixed origin/cookie/referer/user-agent headers. The response body is
     * decoded as UTF-8, printed to standard output and returned.
     *
     * @param pageNo page number sent as the {@code pn} form field
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails or the response cannot be read
     */
    public static String getInfoData(int pageNo) throws IOException {
        String url = "https://www.lagou.com/wn/jobs";

        HttpPost httpPost = new HttpPost(url);

        httpPost.setHeader("origin", "https://www.lagou.com/");
        // NOTE(review): this cookie is a captured browser session (it contains login tokens) and will
        // expire; it should be supplied through configuration rather than committed to source.
        httpPost.setHeader("cookie", "JSESSIONID=ABAAABAABAGABFAAEA69FC8E6CD8BB9279BD998C897601D; WEBTJ-ID=20230918164455-18aa776cf0f95c-068f79df6bc8cf-78505774-1327104-18aa776cf102dd; RECOMMEND_TIP=true; privacyPolicyPopup=false; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695026697; user_trace_token=20230918164457-cfb25219-7bd9-4f7d-9dc5-bc171648a693; LGSID=20230918164457-4880bcac-8e67-486b-8bb3-6aee05cc4147; LGUID=20230918164457-ed1f5207-a2ee-421f-922a-d4fc89bf7739; _ga=GA1.2.1911198389.1695026697; _gid=GA1.2.302334427.1695026697; sajssdk_2015_cross_new_user=1; sensorsdata2015session=%7B%7D; index_location_city=%E5%85%A8%E5%9B%BD; gate_login_token=v1####245d299e0d1ed714bf5e0c9709c8fab2553f894311014f16e954f5fdb12a0e64; LG_HAS_LOGIN=1; _putrc=81B878065C7103C6123F89F2B170EADC; login=true; hasDeliver=0; unick=%E5%88%98%E7%B4%AB%E9%94%A6; __SAFETY_CLOSE_TIME__26494154=1; TG-TRACK-CODE=index_navigation; __lg_stoken__=16e86fbfab228abce480d7c20e2a438c1a95f406ed5488e30fe83a2948fe63626ea8863aa1c6cec421d046c81340519584424cfd74494e3b9b45bd06928ed96691bf89e7027a; X_HTTP_TOKEN=5d8ce7eae99bedb73309205961ac07a8fe736a0c27; LGRID=20230918172353-055032eb-1cb1-48cc-a0d5-12a9700fcf93; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1695029038; _ga_DDLTLJDLHH=GS1.2.1695026697.1.1.1695029038.60.0.0; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2226494154%22%2C%22first_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%2C%22%24os%22%3A%22Windows%22%2C%22%24browser%22%3A%22Chrome%22%2C%22%24browser_version%22%3A%22117.0.0.0%22%7D%2C%22%24device_id%22%3A%2218aa776d37b331-02514a8c212ba9-78505774-1327104-18aa776d37c19af%22%7D");
        httpPost.setHeader("referer", "https://www.lagou.com/wn/jobs?kd=Java&city=%E5%8C%97%E4%BA%AC&pn=1");
        httpPost.setHeader("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31");

        List<BasicNameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("first", "true"));
        params.add(new BasicNameValuePair("pn", String.valueOf(pageNo)));
        params.add(new BasicNameValuePair("kd", "Java"));

        httpPost.setEntity(new UrlEncodedFormEntity(params, StandardCharsets.UTF_8));

        // try-with-resources: the client and the response both hold network resources and were
        // previously never closed, leaking a connection on every call.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse httpResponse = httpClient.execute(httpPost)) {

            String result = EntityUtils.toString(httpResponse.getEntity(), StandardCharsets.UTF_8);

            System.out.println("result:  " + result);

            return result;
        }
    }


    /**
     * Saves one collected result to HDFS as a new UTF-8 encoded {@code .json} file.
     *
     * <p>The file is written to {@code /lagou/<yyyyMMdd>/<random-uuid>.json} on
     * {@code hdfs://node:9000}, connecting as user {@code root}.
     *
     * @param result the collected response body to store
     * @throws IOException if the file system cannot be reached or the write fails
     * @throws InterruptedException if interrupted while obtaining the file system
     */
    public static void getInfoDataHdfs(String result) throws IOException, InterruptedException {

        // 1 -- Hadoop configuration object
        Configuration configuration = new Configuration();

        // 2 -- target directory named after today's date; the pattern must be "yyyyMMdd"
        //      ("yyyymmdd" asks a LocalDate for minutes and throws at runtime)
        Path filePath = new Path("/lagou/" + LocalDate.now().format(DAY_DIRECTORY_FORMAT));

        // 3 -- random UUID as the file name so repeated runs never collide
        String fileName = UUID.randomUUID().toString().concat(".json");

        byte[] payload = result.getBytes(StandardCharsets.UTF_8);

        // 4 -- open the file system and the output stream; try-with-resources closes both
        //      (stream first, then file system) even when the copy fails
        try (FileSystem fileSystem = FileSystem.get(URI.create("hdfs://node:9000"), configuration, "root");
             FSDataOutputStream fsDataOutputStream = fileSystem.create(new Path(filePath, fileName))) {

            // 5 -- copy the bytes; close=false because try-with-resources owns the stream lifecycle
            IOUtils.copyBytes(new ByteArrayInputStream(payload), fsDataOutputStream, configuration, false);
        }

        System.out.println("保存成功~~");
    }

}

4、编写Main方法,循环采集各页数据并保存到HDFS:

package org.example;

import java.io.IOException;

/**
 * Entry point: collects a fixed range of result pages and stores each one in HDFS.
 */
public class Main {

    /** Last page index requested (inclusive). */
    private static final int LAST_PAGE = 30;

    /** Pause between consecutive requests, in milliseconds. */
    private static final long PAUSE_MILLIS = 1000;

    /**
     * Fetches pages 0 through {@code LAST_PAGE}, saving each response to HDFS and pausing
     * between iterations.
     *
     * @param args unused
     * @throws IOException if a request or an HDFS write fails
     * @throws InterruptedException if interrupted while saving or sleeping
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        // NOTE(review): paging starts at 0 while the referer header used by the collector shows pn=1;
        // confirm whether page 0 is a valid request.
        int page = 0;
        while (page <= LAST_PAGE) {
            HttpClientData.getInfoDataHdfs(HttpClientData.getInfoData(page));

            Thread.sleep(PAUSE_MILLIS);
            page++;
        }
    }
}

标签:22%,3A%,采集,import,apache,org,HttpClient,new,页面
From: https://www.cnblogs.com/liuzijin/p/17712760.html

相关文章

  • Python实现数据采集
    前提是配置好hadoop的相关环境1、分析网页,确定采集的数据我们需要获取到该网页的如下几个信息:请求信息:url——网站页面地址设置这个请求的请求头:headers——(user-agent/referer/origin/cookie)设置这个请求的传递数据:data——(first/pn/kd)------>解决编码如下图所示:先安装......
  • 2023-09-18 hexo博客之如何自定义页面内容宽度==》在custom.styl中添加两行代码即可
    前言:我的hexo主题为hexo-theme-next 5.1.4版本。操作如下:打开你的博客名称\themes\hexo-theme-next\source\css\_variables,找到这个文件custom.styl,然后把下面代码添加进去:$main-desktop=1200px$content-desktop=1000px刷新页面即可见效。......
  • 如何使用谷歌搜索的时候,不是从当前页面而是从新页面打开链接?
    参考链接:https://support.google.com/chrome/thread/3520860/how-do-i-set-chrome-to-open-links-in-a-new-tab-on-the-same-browser-window?hl=en1.使用ctrl+左键点击链接2.在Google主界面进行更改进入主界面https://www.google.com/webhp,点击下方的设置选择其中的搜索设置......
  • html前端页面多规格商品sku选择
    <style>body{background-color:palegoldenrod;position:relative;}footer{border:1pxsolidred;height:50px;position:fixed;bottom:0;left:0;width:100%;}.btn{padding:015px;height:35px;line-hei......
  • 当页面中文本不允许选择时,使文本框中文本可以选择的js代码
    <bodyonselectstart="returnoSelect(event.srcElement);"><scriptlanguage="javascript">functionoSelect(obj){if(obj.type!='text')returnfalse;}</script><inputtype="text"name=&quo......
  • 按下按钮后页面的滚动条向下(或向上)滑动,松开之后便停留在当前位置的效果如何实现?
    网友问题?客户要求做一个框架,按下小框架页面中的“上翻”或者“下翻”按钮后,大框架页面便随之向上(或向下)滚动,松开按钮滚动停止,并停留在当前位置,如何解决?解决方案如下:-----------------mm.htm:-----------------<framesetrows="*,20%"><framename="main"src='liu.htm'target="fo......
  • 页面输出太多会严重影响web程序的性能
    我有这样一个小程序:asp+sqlserver2000。数据量增加的很快,最近发现它的性能非常差,每次打开都需要十几秒,甚至几十秒,因为我的程序分页用的是我自己的分页程序:难道这个分页程序有问题,但是其他地方用到它没有感觉到慢呀,我没事就琢磨他,到网上查资料,结果没有具体查到说到我......
  • TienChin 渠道管理-渠道页面完善
    最后附上渠道管理的数据installSQL语句:INSERTINTOTienChin.tienchin_channel(channel_id,channel_name,status,remark,type,create_by,update_by,create_time,update_time,del_flag)VALUES(3,'小红书渠道',1,'小红书渠道',1,'qudao','qu......
  • JAVA 实现登录页面生成验证码
    1importjavax.swing.*;2importjava.awt.Color;3importjava.awt.Font;4importjava.awt.Graphics;5importjava.awt.event.ActionEvent;6importjava.awt.event.ActionListener;7importjava.util.Random;89publicclassLoginFrameext......
  • Java swing 实现QQ登录注册页面
    代码如下1packagecom.lty;2importjavax.swing.*;3importjava.awt.*;4importjava.awt.event.ActionEvent;5importjava.awt.event.ActionListener;6importjava.util.Random;78publicclassQQLoginPage{9privateJFrameframe;10......