使用网络爬虫爬取省市区县的经纬度

标签：codecraft code 爬虫 us 爬取 import 省市区 com webmagic

我们从阿里云的数据可视化平台（DataV）获取数据：http://datav.aliyun.com/tools/atlas

爬取的链接如下：

湖北省（不包含子区域）：https://geo.datav.aliyun.com/areas_v3/bound/420000.json

湖北地级市（包含子区域）：https://geo.datav.aliyun.com/areas_v3/bound/420100_full.json（武汉市为例）

湖北区/县：https://geo.datav.aliyun.com/areas_v3/bound/420111.json（武汉市洪山区为例）

逐个手工收集所有省市区县的链接来爬取不太现实；我们可以先从数据库中查出省市区县的 area_code 编码，再按照上面的链接格式拼接出每个区域对应的地址。

我们使用webmagic框架来爬取，这里直接上代码：

爬取类

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Json;

import java.math.BigDecimal;

//WebMagic的结构分为Downloader（下载）、PageProcessor（解析处理）、Scheduler（管理URL并去重）、Pipeline（持久化）四大组件
public class LngLatProcessor implements PageProcessor {
    public void process(Page page) {
        //打印页面内容
        Json json = page.getJson();
        String s = json.get();
        JSONObject jsonObject = JSON.parseObject(s);
        JSONArray features = jsonObject.getJSONArray("features");
        int size = features.size();
        for (int i = 0; i < size; i++) {
            JSONObject jsonObject1 = features.getJSONObject(i);
            JSONObject properties = jsonObject1.getJSONObject("properties");
            JSONArray center = properties.getJSONArray("center");
            String name = properties.getString("name");
            Integer code = properties.getInteger("adcode");
            BigDecimal longitude = (BigDecimal) center.get(0);
            BigDecimal latitude = (BigDecimal) center.get(1);
            page.putField("name", name);

            page.putField("code", code);
            page.putField("longitude", longitude.doubleValue());
            page.putField("latitude", latitude.doubleValue());
        }
    }
    public Site getSite() {
        return Site.me()
                .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.2.300")
                .setSleepTime(1000)
                .setTimeOut(10000)
                .setRetryTimes(3);
    }
}

定制pipeline输出

import com.ljxx.pts.dao.AreasMapper;
import com.ljxx.pts.entity.Areas;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import javax.annotation.Resource;
import java.util.Date;
import java.util.Map;

/**
 * Custom pipeline that stores the crawled centre coordinates of an area in the
 * database.
 *
 * <p>Reads the {@code code}, {@code latitude} and {@code longitude} result
 * fields and updates the matching {@link Areas} row through
 * {@link AreasMapper}.
 */
@Component
public class MyPipeline implements Pipeline {
    @Resource
    private AreasMapper areasMapper;

    /**
     * Updates the area identified by the {@code code} field with the crawled
     * coordinates and the current time as update time.
     *
     * <p>Results without a {@code code} are ignored, because there is no
     * primary key to update by.
     *
     * @param resultItems the fields extracted by the page processor
     * @param task        the running crawl task (unused)
     */
    @Override
    public void process(ResultItems resultItems, Task task) {
        Integer code = resultItems.get("code");
        if (code == null) {
            return;
        }

        Areas areaCode = new Areas();
        areaCode.setId(code);
        areaCode.setUpdateTime(new Date());

        Double latitude = resultItems.get("latitude");
        if (latitude != null) {
            areaCode.setLatitude(latitude);
        }
        Double longitude = resultItems.get("longitude");
        if (longitude != null) {
            areaCode.setLongitude(longitude);
        }

        // Persist inside the pipeline. NOTE(review): presumably the "selective"
        // update only writes non-null properties - confirm against the mapper.
        areasMapper.updateByPrimaryKeySelective(areaCode);
    }
}

测试类

import com.ljxx.pts.dao.AreasMapper;
import com.ljxx.pts.entity.Areas;
import com.ljxx.pts.webmagic.LngLatProcessor;
import com.ljxx.pts.webmagic.MyPipeline;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import us.codecraft.webmagic.Spider;

import javax.annotation.Resource;
import java.util.List;

/**
 * Integration test that crawls the centre coordinates of every level-3 area
 * stored in the database and lets {@link MyPipeline} persist them.
 */
@SpringBootTest
@RunWith(SpringJUnit4ClassRunner.class)
public class TestDemo {

    /** Common prefix of the boundary URLs; the area code and ".json" are appended. */
    private static final String BOUND_URL_PREFIX = "https://geo.datav.aliyun.com/areas_v3/bound/";

    @Autowired
    private MyPipeline myPipeline;
    @Resource
    private AreasMapper areasMapper;

    /**
     * Loads all areas of level 3 and crawls one boundary URL per area.
     *
     * <p>A single spider is created and every URL is queued on it before it
     * runs, so the five worker threads are shared across all requests instead
     * of a fresh thread pool being built and torn down for each URL.
     */
    @Test
    public void test() {
        // Area levels: 1 = province, 2 = city, 3 = district/county.
        // This run selects level 3.
        Areas query = new Areas();
        query.setAreaLevel(3);
        List<Areas> areas = areasMapper.select(query);
        if (areas == null || areas.isEmpty()) {
            return;
        }

        Spider spider = Spider.create(new LngLatProcessor())
                .addPipeline(myPipeline)
                .thread(5);
        for (Areas area : areas) {
            Integer id = area.getId();
            if (id == null) {
                // Without a code no URL can be built for this area.
                continue;
            }
            spider.addUrl(BOUND_URL_PREFIX + id + ".json");
        }
        spider.run();
    }

}

标签：codecraft,code,爬虫,us,爬取,import,省市区,com,webmagic
From： https://www.cnblogs.com/zwh0910/p/18212400

使用网络爬虫爬取省市区县的经纬度

相关文章

赞助商

阅读排行