地名作为最常用的社会公共信息,不仅与人们的日常生活息息相关,而且是政府行政行为、经济建设不可缺少的基础信息资源。在政务系统中有许多需要将业务地址关联到标准地址的场景,addresstool致力于解决地址关联匹配算法中的速度和准确性问题。经实测,单核addresstool的地址关联速度在5000/秒-20000/秒之间(取决于业务地址质量),关联匹配正确率达到98%。hadoop分布式环境下地址关联匹配速度能达到10万+/秒(具体取决于大数据节点数和地址质量;节点足够时,每秒可实现百万级地址关联)。
本文大数据环境为hive数据库,通过udf将addresstool封装,最后实现分布式计算。
直接上代码
/**
 * Hive generic UDF that links a free-text address to a standard address using addresstool.
 *
 * <p>The UDF takes exactly one primitive argument (the address text) and returns a
 * {@code map<string,string>}: the map produced by {@code StandardAddress.getStdAddress()}
 * plus two extra keys, {@code addressLevel} and {@code linkLevel}.
 *
 * <p>The standard-address dictionary is read from the {@code st_address} table over JDBC
 * every time {@link #initialize(ObjectInspector[])} runs, and the loaded tool is shared
 * through a static field.
 *
 * <p>NOTE(review): concurrent calls to {@code AddressTool.getStdAddress} from several task
 * threads are assumed to be safe because the original code already shared the tool
 * statically - confirm against the addresstool documentation.
 */
public class AddressLink extends GenericUDF {

    /** JDBC driver class that must be loadable from the UDF's classpath. */
    private static final String JDBC_DRIVER = "org.postgresql.Driver";

    // NOTE(review): connection settings are hard-coded in the class (masked in this listing).
    // They should come from job/session configuration rather than source code.
    private static final String JDBC_URL = "jdbc:postgresql://*****:5432/postgres";
    private static final String JDBC_USER = "******";
    private static final String JDBC_PASSWORD = "******";

    /** Query that reads the whole standard-address dictionary. */
    private static final String DICTIONARY_QUERY =
            "select id,province,city,county,town,community,road,road_no,aoi,sub_aoi,building,unit,room,address from st_address order by aoi,road,road_no";

    /** Columns copied into the dictionary row unchanged (when non-empty). */
    private static final String[] PLAIN_COLUMNS = {
        "id", "province", "city", "county", "town", "community",
        "road", "road_no", "aoi", "sub_aoi", "address"
    };

    /** Inspector used to read the single address argument as a string. */
    private PrimitiveObjectInspector addressIO;

    /**
     * Shared, fully loaded address tool. It is only ever assigned after loading has
     * finished, so readers never observe a partially initialised dictionary.
     */
    private static volatile AddressTool addressTool;

    /** Returns {@code value} without {@code suffix} when it ends with it, otherwise unchanged. */
    private static String stripSuffix(String value, String suffix) {
        if (value != null && value.endsWith(suffix)) {
            return value.substring(0, value.length() - suffix.length());
        }
        return value;
    }

    /** Removes a trailing "栋", "幢" or "号楼" from a building value; null/empty pass through. */
    private static String bld(String building) {
        if (building == null || building.isEmpty()) {
            return building;
        }
        if (building.endsWith("栋") || building.endsWith("幢")) {
            return building.substring(0, building.length() - 1);
        }
        return stripSuffix(building, "号楼");
    }

    /** Removes a trailing "单元" from a unit value; null/empty pass through. */
    private static String unit(String unit) {
        return stripSuffix(unit, "单元");
    }

    /** Removes a trailing "室" or "户" from a room value; null/empty pass through. */
    private static String room(String room) {
        if (room == null || room.isEmpty()) {
            return room;
        }
        if (room.endsWith("室") || room.endsWith("户")) {
            return room.substring(0, room.length() - 1);
        }
        return room;
    }

    /** Stores {@code value} under {@code key} only when it is neither null nor empty. */
    private static void putIfPresent(Map<String, String> row, String key, String value) {
        if (value != null && !value.isEmpty()) {
            row.put(key, value);
        }
    }

    /** Repeats {@code replace(target, replacement)} until {@code target} no longer occurs. */
    private static String replaceUntilGone(String text, String target, String replacement) {
        String current = text;
        while (current.contains(target)) {
            current = current.replace(target, replacement);
        }
        return current;
    }

    /**
     * Pre-processes irregular characters in a Chinese address: removes spaces and collapses
     * runs of "--" / "——" into a single "-".
     *
     * <p>NOTE(review): the last four replacements target a dash next to a space; as written
     * they cannot match because all spaces are removed first. They are kept in the original
     * order in case the space literal was meant to be a different whitespace character.
     */
    private static String normalize(String address) {
        String cleaned = address.replace(" ", "");
        cleaned = replaceUntilGone(cleaned, "--", "-");
        cleaned = replaceUntilGone(cleaned, "——", "-");
        cleaned = replaceUntilGone(cleaned, "- ", "-");
        cleaned = replaceUntilGone(cleaned, " -", "-");
        cleaned = replaceUntilGone(cleaned, "— ", "-");
        cleaned = replaceUntilGone(cleaned, " —", "-");
        return cleaned;
    }

    /**
     * Reads the standard-address dictionary over JDBC and loads it into a new AddressTool.
     *
     * @return a fully loaded tool
     * @throws IllegalStateException if the driver cannot be loaded, the query fails, or the
     *         dictionary cannot be initialised; the original cause is preserved
     */
    private static AddressTool loadAddressTool() {
        AddressTool tool = new AddressTool();
        DataTable data = new DataTable();
        try {
            // Register the driver explicitly.
            Class.forName(JDBC_DRIVER);
            // try-with-resources closes result set, statement and connection on every path.
            try (Connection connection = DriverManager.getConnection(JDBC_URL, JDBC_USER, JDBC_PASSWORD);
                 Statement statement = connection.createStatement();
                 ResultSet res = statement.executeQuery(DICTIONARY_QUERY)) {
                while (res.next()) {
                    HashMap<String, String> row = new HashMap<>();
                    for (String column : PLAIN_COLUMNS) {
                        putIfPresent(row, column, res.getString(column));
                    }
                    // Suffix-stripped values are checked after stripping, so a value that
                    // consisted only of its suffix is skipped like any other empty value.
                    putIfPresent(row, "building", bld(res.getString("building")));
                    putIfPresent(row, "unit", unit(res.getString("unit")));
                    putIfPresent(row, "room", room(res.getString("room")));
                    data.addAddressDic(row);
                }
            }
            // Load the standard address data into addresstool.
            data.initData(tool);
        } catch (Exception e) {
            // Fail fast: continuing with an empty dictionary would silently report every
            // address as not linked.
            throw new IllegalStateException("Failed to load the standard address dictionary from st_address", e);
        }
        return tool;
    }

    /** Returns {@code value} when it is non-null and non-empty, otherwise {@code fallback}. */
    private static String valueOrDefault(String value, String fallback) {
        return (value != null && !value.isEmpty()) ? value : fallback;
    }

    /**
     * Validates the argument list, loads the address dictionary and declares the return type.
     *
     * @param arguments inspectors of the call arguments; exactly one primitive is expected
     * @return a standard map inspector with string keys and string values
     * @throws UDFArgumentException if the argument count or kind is wrong
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        if (arguments == null || arguments.length != 1) {
            throw new UDFArgumentLengthException("AddressLink accepts exactly 1 argument: the address string");
        }
        if (!(arguments[0] instanceof PrimitiveObjectInspector)) {
            throw new UDFArgumentException("AddressLink expects a primitive (string) argument");
        }
        addressIO = (PrimitiveObjectInspector) arguments[0];
        // Publish the tool only once it is completely loaded.
        addressTool = loadAddressTool();
        return ObjectInspectorFactory.getStandardMapObjectInspector(
                PrimitiveObjectInspectorFactory.javaStringObjectInspector,
                PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    }

    /**
     * Links one address.
     *
     * @param arguments the single deferred address value
     * @return the standard-address map with {@code addressLevel} and {@code linkLevel}
     *         added, or {@code null} when the input is null
     * @throws HiveException if the UDF is used before {@code initialize} has run
     */
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // Resolve the deferred value once; it may be computed lazily.
        Object raw = arguments[0].get();
        if (raw == null) {
            return null;
        }
        String address = PrimitiveObjectInspectorUtils.getString(raw, this.addressIO);
        if (address == null) {
            return null;
        }
        AddressTool tool = addressTool;
        if (tool == null) {
            throw new HiveException("AddressLink has not been initialized");
        }
        // Address linking.
        StandardAddress stdAddress = tool.getStdAddress(normalize(address));
        Map<String, String> result = stdAddress.getStdAddress();
        // Address level, "未知" when not determined.
        result.put("addressLevel", valueOrDefault(stdAddress.addressLevel, "未知"));
        // Link level, "未关联" when no standard address was linked.
        result.put("linkLevel", valueOrDefault(stdAddress.linkLevel, "未关联"));
        return result;
    }

    /** Returns the text shown for this call in query plans. */
    @Override
    public String getDisplayString(String[] children) {
        return "Address(" + children[0] + ")";
    }
}
addresstool在分布式节点下计算速度超级快:经实测,以sparksql调用udf的方式,1千万条地址数据耗时3.5分钟,5千万条地址数据耗时8分钟。
详细代码见git上AddressLink类
通过addresstool与大数据结合,成功实现每日千万级业务地址全量关联更新。
java资源下载
https://download.csdn.net/download/u011024436/89035851
源码学习
https://gitee.com/addresstool/address
使用中有问题或者建议,欢迎联系邮箱[email protected]
标签:10,getString,res,千万级,地址,&&,address,put,null From: https://blog.csdn.net/u011024436/article/details/137244190