我们从阿里云的数据可视化平台(DataV)获取数据:http://datav.aliyun.com/tools/atlas
爬取的链接如下:
湖北省(不包含子区域):https://geo.datav.aliyun.com/areas_v3/bound/420000.json
湖北地级市(包含子区域):https://geo.datav.aliyun.com/areas_v3/bound/420100_full.json(武汉市为例)
湖北区/县:https://geo.datav.aliyun.com/areas_v3/bound/420111.json(武汉市洪山区为例)
逐个手动收集所有省市区的地址来爬取不太现实;我们可以先从数据库中查出省、市、区/县的area_code编码,再按上面的URL规则拼接出各省市区对应的地址。
我们使用webmagic框架来爬取,这里直接上代码:
爬取类
import com.alibaba.fastjson.JSON; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.webmagic.selector.Json; import java.math.BigDecimal; //WebMagic的结构分为Downloader(下载)、PageProcessor(解析处理)、Scheduler(管理URL并去重)、Pipeline(持久化)四大组件 public class LngLatProcessor implements PageProcessor { public void process(Page page) { //打印页面内容 Json json = page.getJson(); String s = json.get(); JSONObject jsonObject = JSON.parseObject(s); JSONArray features = jsonObject.getJSONArray("features"); int size = features.size(); for (int i = 0; i < size; i++) { JSONObject jsonObject1 = features.getJSONObject(i); JSONObject properties = jsonObject1.getJSONObject("properties"); JSONArray center = properties.getJSONArray("center"); String name = properties.getString("name"); Integer code = properties.getInteger("adcode"); BigDecimal longitude = (BigDecimal) center.get(0); BigDecimal latitude = (BigDecimal) center.get(1); page.putField("name", name); page.putField("code", code); page.putField("longitude", longitude.doubleValue()); page.putField("latitude", latitude.doubleValue()); } } public Site getSite() { return Site.me() .setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36 HBPC/12.1.2.300") .setSleepTime(1000) .setTimeOut(10000) .setRetryTimes(3); } }
定制pipeline输出
import com.ljxx.pts.dao.AreasMapper; import com.ljxx.pts.entity.Areas; import org.springframework.stereotype.Component; import us.codecraft.webmagic.ResultItems; import us.codecraft.webmagic.Task; import us.codecraft.webmagic.pipeline.Pipeline; import javax.annotation.Resource; import java.util.Date; import java.util.Map; // 定制pipeline输出 @Component public class MyPipeline implements Pipeline { @Resource private AreasMapper areasMapper; @Override public void process(ResultItems resultItems, Task task) { Areas areaCode = new Areas(); areaCode.setUpdateTime(new Date()); for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) { if ("code".equalsIgnoreCase(entry.getKey())) { Integer code = (Integer) entry.getValue(); System.out.println(code); areaCode.setId(code); } if ("latitude".equalsIgnoreCase(entry.getKey())) { Double latitude = (Double) entry.getValue(); System.out.println(latitude); areaCode.setLatitude(latitude); } if ("longitude".equalsIgnoreCase(entry.getKey())) { Double longitude = (Double) entry.getValue(); System.out.println(longitude); areaCode.setLongitude(longitude); } } // 在pipeline中将数据保存到数据库 int i = areasMapper.updateByPrimaryKeySelective(areaCode); } }
测试类
import com.ljxx.pts.dao.AreasMapper; import com.ljxx.pts.entity.Areas; import com.ljxx.pts.webmagic.LngLatProcessor; import com.ljxx.pts.webmagic.MyPipeline; import org.junit.Test; import org.junit.runner.RunWith; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; import us.codecraft.webmagic.Spider; import javax.annotation.Resource; import java.util.List; @SpringBootTest @RunWith(SpringJUnit4ClassRunner.class) public class TestDemo { // @Autowired // private LngLatProcessor processor; @Autowired private MyPipeline myPipeline; @Resource private AreasMapper areasMapper; @Test public void test(){ // 先获取省的code,省市区的级别分别为1,2,3 Areas areas1 = new Areas(); areas1.setAreaLevel(3); List<Areas> areas = areasMapper.select(areas1); for (Areas area : areas) { Integer id = area.getId(); Spider.create( new LngLatProcessor()) .addUrl("https://geo.datav.aliyun.com/areas_v3/bound/"+id+".json") .addPipeline(myPipeline) .thread(5) .run(); } } }
标签:codecraft,code,爬虫,us,爬取,import,省市区,com,webmagic From: https://www.cnblogs.com/zwh0910/p/18212400