首页 > 其他分享 > 微博爬虫

微博爬虫

时间:2022-10-20 17:47:24  浏览次数:61
标签:fastjson get -- JSONObject 爬虫 alibaba 微博 com

   /**
    * Crawls one Weibo user's posts and exports how often each "@mention" occurs.
    *
    * <p>Steps:
    * <ol>
    *   <li>Fetch the friends timeline and take the author of its first status.</li>
    *   <li>Fetch that user's profile and print name, id, follower count and post count.</li>
    *   <li>Page through the user's posts (20 per page is assumed) and count every
    *       mention found in {@code text_raw}.</li>
    *   <li>Write the name/count pairs to an Excel workbook on disk.</li>
    * </ol>
    *
    * <p>If a response lacks the expected fields (for example because the session
    * cookie has expired) the method reports the problem on {@code System.err} and
    * returns, or skips the affected page, instead of failing with a
    * {@code NullPointerException}.
    *
    * <p>SECURITY NOTE(review): the session cookies below are hard-coded credentials.
    * They should be loaded from configuration or the environment and must not be
    * committed to version control.
    */
   void getUser(){
       // The three requests use cookies that differ only in the WBPSESS value, so
       // the shared part is written once and the per-request token is appended.
       final String cookiePrefix = "SUB=_2A25OBB9gDeRhGeNG4lIZ8ybPyziIHXVtBqEorDV8PUJbkNAKLU7jkW1NSxZ3rk-lltjdVTJWAuJrtG1N-6THcXTY; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WF0Mxd1XQQmx6HGO.e6BGsD5NHD95Qf1h.71heRe05XWs4Dqcj.i--fi-z7iKysi--RiKyWi-zpi--ci-2XiK.Ei--fiK.Ei-24; XSRF-TOKEN=ZxMS7NEE0TvXaeGc55l4CcWl; _s_tentry=weibo.com; Apache=2654538524812.041.1660972627006; SINAGLOBAL=2654538524812.041.1660972627006; ULV=1660972627078:1:1:1:2654538524812.041.1660972627006:; SSOLoginState=1660972849; WBPSESS=";
       final String timelineToken = "7pJQxz1_dPdMSL7AXnCXDf0T9olQ0YfW5LtecSt6SMnNs6oSz17JJhurTo7Zik1em1LCWoqCVL9m0scGsEictmrfwhqolExW-PYkh6TLS9C7vnatXY6ZBWEQsnj0vwcgKwilI1AKAgAxRGvUsmNr0w==";
       final String profileToken = "7pJQxz1_dPdMSL7AXnCXDf0T9olQ0YfW5LtecSt6SMnNs6oSz17JJhurTo7Zik1eGqyoXUKpM2W41KIDCDkOtBuCGPNyE1P0pDIpMdWKlnnTcm5T7J1kiGnlPV4m-csfCRzAfpmH0TG_aa3UuId66w==";
       final String postsToken = "7pJQxz1_dPdMSL7AXnCXDf0T9olQ0YfW5LtecSt6SMnNs6oSz17JJhurTo7Zik1em1LCWoqCVL9m0scGsEictocaVoXXzdb7nwtc_2ZHJITDXVSMz_k5pmHQVvumdbsK9jmQ66-yF2eZaVgy5IUw8w==";

       // --- 1. Friends timeline: pick the author of the first status ---------
       HashMap<String, String> timelineHeaders = buildWeiboHeaders(cookiePrefix + timelineToken);
       String timelineBody = HttpClientUtil.doGet("https://weibo.com/ajax/feed/unreadfriendstimeline?list_id=100015890838304&refresh=4&max_id=1661129223609925&count=15", null, timelineHeaders);
       com.alibaba.fastjson.JSONObject timeline = com.alibaba.fastjson.JSONObject.parseObject(timelineBody);
       JSONArray statuses = timeline == null ? null : timeline.getJSONArray("statuses");
       if (statuses == null || statuses.isEmpty()) {
           System.err.println("getUser: timeline response contains no statuses");
           return;
       }
       com.alibaba.fastjson.JSONObject firstStatus = statuses.getJSONObject(0);
       com.alibaba.fastjson.JSONObject author = firstStatus == null ? null : firstStatus.getJSONObject("user");
       if (author == null) {
           System.err.println("getUser: first status has no user object");
           return;
       }
       Object id = author.get("id");

       // --- 2. Profile of that user ------------------------------------------
       HashMap<String, String> profileHeaders = buildWeiboHeaders(cookiePrefix + profileToken);
       String profileUrl = String.format("https://weibo.com/ajax/profile/info?uid=%s",id);
       String profileBody = HttpClientUtil.doGet(profileUrl, null, profileHeaders);
       com.alibaba.fastjson.JSONObject profile = com.alibaba.fastjson.JSONObject.parseObject(profileBody);
       com.alibaba.fastjson.JSONObject profileData = profile == null ? null : profile.getJSONObject("data");
       com.alibaba.fastjson.JSONObject profileUser = profileData == null ? null : profileData.getJSONObject("user");
       if (profileUser == null) {
           System.err.println("getUser: profile response contains no data.user object");
           return;
       }
       Object followers_count = profileUser.get("followers_count");
       String screen_name = profileUser.getString("screen_name");
       String statuses_count = profileUser.getString("statuses_count");
       System.out.println("博主名:"+screen_name);
       System.out.println("博主id:"+id);
       System.out.println("粉丝数:"+followers_count);
       System.out.println("帖子数:"+statuses_count);

       // --- 3. Page through the posts and count mentions ---------------------
       // Ceiling division: the previous integer division dropped the last,
       // partially filled page (and fetched nothing for fewer than 20 posts).
       final int pageSize = 20;
       int statusCount = statuses_count == null ? 0 : Integer.parseInt(statuses_count);
       int pageCount = (statusCount + pageSize - 1) / pageSize;

       // Captures the text between an '@' and the next space on the same line.
       // NOTE(review): a mention that ends the text, or is followed by ':' or a
       // line break instead of a space, is not matched by this pattern.
       Pattern mentionPattern = Pattern.compile("(?<=@)(.*?)(?= )");

       Map<String, Integer> mentionCounts = new HashMap<>();
       HashMap<String, String> postsHeaders = buildWeiboHeaders(cookiePrefix + postsToken);
       // NOTE(review): requests are sent back-to-back with no delay between
       // pages; add throttling here if the server starts rejecting requests.
       for (int page = 1; page <= pageCount; page++) {
           String postsUrl = String.format("https://weibo.com/ajax/statuses/mymblog?uid=%s&page=%s",id,page);
           String postsBody = HttpClientUtil.doGet(postsUrl, null, postsHeaders);
           com.alibaba.fastjson.JSONObject posts = com.alibaba.fastjson.JSONObject.parseObject(postsBody);
           com.alibaba.fastjson.JSONObject postsData = posts == null ? null : posts.getJSONObject("data");
           JSONArray list = postsData == null ? null : postsData.getJSONArray("list");
           if (list == null) {
               // Nothing usable on this page; keep going with the next one.
               continue;
           }
           for (int k = 0; k < list.size(); k++) {
               com.alibaba.fastjson.JSONObject post = list.getJSONObject(k);
               String text_raw = post == null ? null : post.getString("text_raw");
               if (text_raw == null) {
                   continue;
               }
               Matcher matcher = mentionPattern.matcher(text_raw);
               while (matcher.find()) {
                   String name = matcher.group();
                   Integer seen = mentionCounts.get(name);
                   mentionCounts.put(name, seen == null ? 1 : seen + 1);
               }
           }
       }

       // Debug output, kept in the same "[{name=count, ...}]" form as before
       // (the map used to be printed wrapped in a one-element list).
       System.out.println("[" + mentionCounts + "]");

       // --- 4. Export ----------------------------------------------------------
       writeMentionCounts(mentionCounts);
   }

   /**
    * Builds the request headers shared by all Weibo calls.
    *
    * @param cookie full value of the {@code Cookie} header
    * @return a new mutable map holding the {@code User-Agent} and {@code Cookie} headers
    */
   private static HashMap<String, String> buildWeiboHeaders(String cookie) {
       HashMap<String, String> headers = new HashMap<>();
       headers.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:103.0) Gecko/20100101 Firefox/103.0");
       headers.put("Cookie", cookie);
       return headers;
   }

   /**
    * Writes the mention counts to an Excel workbook: a header row followed by one
    * row per mentioned name (column 0) and its count (column 1).
    *
    * <p>An {@code IOException} is reported via {@code printStackTrace()} and not
    * rethrown; both the workbook and the output stream are closed in every case.
    *
    * @param mentionCounts mentioned name mapped to its number of occurrences
    */
   private static void writeMentionCounts(Map<String, Integer> mentionCounts) {
       // NOTE(review): XSSFWorkbook produces the OOXML (.xlsx) format, but the
       // target file is named ".xls". The path is left unchanged because other
       // tooling may depend on it; consider renaming it to "a.xlsx".
       String src = "D:/Ban/a.xls";
       try (XSSFWorkbook wb = new XSSFWorkbook()) {
           // Create a sheet
           XSSFSheet sheet = wb.createSheet("First sheet");
           // Default row height
           sheet.setDefaultRowHeight((short) (2 * 256));
           // Column widths
           sheet.setColumnWidth(0, 4000);
           sheet.setColumnWidth(1, 4000);
           // NOTE(review): this font is created in the workbook but never
           // attached to a cell style, so it has no visible effect.
           XSSFFont font = wb.createFont();
           font.setFontName("宋体");
           font.setFontHeightInPoints((short) 16);

           // Header row
           XSSFRow header = sheet.createRow(0);
           header.createCell(0).setCellValue("姓名");
           header.createCell(1).setCellValue("个数");

           // One data row per mentioned name, starting below the header
           int rowIndex = 1;
           for (Map.Entry<String, Integer> entry : mentionCounts.entrySet()) {
               XSSFRow row = sheet.createRow(rowIndex++);
               row.createCell(0).setCellValue(entry.getKey());
               row.createCell(1).setCellValue(entry.getValue().intValue());
           }

           try (FileOutputStream fileOutputStream = new FileOutputStream(new File(src))) {
               wb.write(fileOutputStream);
           }
       } catch (IOException e) {
           e.printStackTrace();
       }
   }

 

标签:fastjson,get,--,JSONObject,爬虫,alibaba,微博,com
From: https://www.cnblogs.com/123sougou/p/16810713.html

相关文章

  • java爬虫之HtmlUnit介绍
    前端有时候会遇到项目临时需要网上收集数据的情况,什么方案是简单易懂、长期可用的呢,当然是用浏览器终端测试单元做爬虫是最方便的啦,将平时工作中的测试程序进行简单的修改......
  • Jsoup爬虫的简单使用
    添加POM依赖<dependency><groupId>org.jsoup</groupId><artifactId>jsoup</artifactId><version>1.7.3</version></dependency>JAVA代码示例pub......
  • 爬虫技术可以分析数据吗?
    目前在不少大数据团队中,数据分析和数据挖掘工程师通常都有明确的分工,数据采集往往并不是数据分析和挖掘工程师的任务,通常做爬虫的是大数据应用开发程序员或者是数据采集工程......
  • 14 scrapy的crawlspider爬虫
    scrapy的crawlspider爬虫学习目标:了解crawlspider的作用应用crawlspider爬虫创建的方法应用crawlspider中rules的使用1crawlspider是什么回顾之前的代码中,我......
  • 爬虫概述
    爬虫概述知识点:了解爬虫的概念了解爬虫的作用了解爬虫的分类掌握爬虫的流程1.爬虫的概念模拟浏览器,发送请求,获取响应网络爬虫(又被称为网页蜘蛛,......
  • 【转】如何利用Python爬虫爬取网页中图片(成功实现自动翻页至最后一页)
    【原文】https://blog.csdn.net/weixin_65423581/article/details/1225336461.模块的使用(1).random模块:主要是为了产生随机数作为写入jpg的名称(这里其实可以用字......
  • 爬虫-1.概述和HTTP请求与响应处理
    爬虫-1.概述和HTTP请求与响应处理概述爬虫,应该称为网络爬虫,也叫网页蜘蛛、网络机器人、网络蚂蚁等。搜索引擎,就是网络爬虫的应用者。大数据时代的到来,所有企业都希望通过海......
  • [原创]一款基于Reactor线程模型的java网络爬虫框架
    AJSpridergithub:​​https://github.com/zhuchangwu/AJSpider​​概述AJSprider是笔者基于Reactor线程模式+Jsoup+HttpClient封装的一款轻量级java多线程网络爬虫框架,简......
  • python 网络爬虫全流程教学,从入门到实战(requests+bs4+存储文件)
    python网络爬虫全流程教学,从入门到实战(requests+bs4+存储文件)requests是一个Python第三方库,用于向URL地址发起请求bs4全名BeautifulSoup4,是编写python爬虫常用库之......
  • Python爬虫(学习笔记)
    Python爬虫(学习笔记)  常见的反爬机制及应对策略名称描述解决方案/反反爬措施1.Headers 从用户的headers进行反爬是最常见的反爬策略,Headers是......